diff --git a/.gitignore b/.gitignore index 59ca0d434e57..94cc9baccad9 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,7 @@ tests/mxnet_unit_tests coverage.xml # Local CMake build config -cmake_options.yml \ No newline at end of file +cmake_options.yml + +# header file generated at compile time +include/mkldnn/mkldnn_version.h diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 3ffea8694adf..3943914eed66 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5 +Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn index 7de7e5d02bf6..41bee20d7eb4 160000 --- a/3rdparty/mkldnn +++ b/3rdparty/mkldnn @@ -1 +1 @@ -Subproject commit 7de7e5d02bf687f971e7668963649728356e0c20 +Subproject commit 41bee20d7eb4a67feeeeb8d597b3598994eb1959 diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 6e94643bdf1d..1d79ecfdb4c9 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 6e94643bdf1d51a505b147f28c358fb71070b8fd +Subproject commit 1d79ecfdb4c9234537e1bf5148f44a1af54501ec diff --git a/3rdparty/tvm b/3rdparty/tvm index 0f053c82a747..21935dcbf56a 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 0f053c82a747b4dcdf49570ec87c17e0067b7439 +Subproject commit 21935dcbf56ad3bd66ebff9891a6bc3865b8106d diff --git a/CMakeLists.txt b/CMakeLists.txt index 896c7b75a1ec..2142a09d6d2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,7 @@ mxnet_option(USE_TENSORRT "Enable infeference optimization with TensorRT mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) +mxnet_option(BUILD_CYTHON_MODULES "Build cython modules." 
OFF) message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -147,9 +148,11 @@ else(MSVC) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0 -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_ASSERTIONS") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") add_definitions(-DNDEBUG=1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_ASSERTIONS") else() add_definitions(-DNDEBUG=1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") @@ -159,7 +162,7 @@ else(MSVC) elseif(SUPPORT_MSSE2) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2") endif() - set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") if(SUPPORT_CXX14) add_definitions(-DDMLC_USE_CXX11=1) add_definitions(-DDMLC_USE_CXX14=1) @@ -834,3 +837,12 @@ endif() set(LINT_DIRS "include src plugin cpp-package tests") set(EXCLUDE_PATH "src/operator/contrib/ctc_include") add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -DPROJECT_NAME=mxnet -DEXCLUDE_PATH=${EXCLUDE_PATH} -P ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/cmake/lint.cmake) + +if(BUILD_CYTHON_MODULES) + include(cmake/BuildCythonModules.cmake) + add_cython_modules(2) # Build cython module for python2 if python2 is found + add_cython_modules(3) # Build cython module for python3 if python3 is found + if((NOT ${PYTHON2_FOUND}) AND (NOT ${PYTHON3_FOUND})) + message(FATAL_ERROR "No python interpreter found to build cython modules") + endif() +endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f0ec80e2725c..c76f8c6edbc8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -69,6 +69,8 @@ The committers are the granted write access to the project. - Patric is a parallel computing expert and a major contributor to the MXNet MKL-DNN backend. * [Tao Lv](https://github.com/TaoLv) - Tao is a major contributor to the MXNet MKL-DNN backend and performance on CPU. +* [Zach Kimberg](https://github.com/zachgk) + - Zach is one of the major maintainers of the MXNet Scala package. 
### Become a Committer @@ -237,9 +239,10 @@ List of Contributors * [Zhennan Qin](https://github.com/ZhennanQin) * [Zhiyuan Huang](https://github.com/huangzhiyuan) * [Zak Jost](https://github.com/zjost) +* [Nick Guletskii](https://github.com/nickguletskii) * [Shoubhik Bhattacharya](https://github.com/shoubhik) -* [Zach Kimberg](https://github.com/zachgk) * [Rohit Srivastava](https://github.com/access2rohit) +* [Caner Turkmen](https://github.com/canerturkmen) Label Bot --------- diff --git a/KEYS b/KEYS index 7b78ea97e3ea..ff503f7efb7e 100644 --- a/KEYS +++ b/KEYS @@ -688,4 +688,63 @@ n4aiPNGpG7CDmCNnGMJgNYEEbqe1RQ7B4xwmNmGJvdVJRsTfy5557hZNfIfVkdES QTXMfTPP627GwzHQXTdAn9CSGW5FkaSHTVTCZhalBHhAFMDg86ZGUxZDYwhf3s6W 44liPzisQFRxRFOwEubvmw== =dQvb + +-----END PGP PUBLIC KEY BLOCK----- +pub rsa4096 2019-05-15 [SC] + 228ADD932CB218723E61D09E043071126325F0EC +uid [ultimate] Zach Kimberg +sig 3 043071126325F0EC 2019-05-15 Zach Kimberg +sub rsa4096 2019-05-15 [E] +sig 043071126325F0EC 2019-05-15 Zach Kimberg + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFzcgWsBEADpXccvPwwE1tNd/f5L1+x5+Kcaw1jLhb1y3/S9Um8iE3wZsRvR +pxNXhw8n+obqVicWFrShCI2rS6yUZFeWAP8X3XWM47sx93y/fFpg4+mDP0Ejl0op +VmZeiX4MCwloMWRMpN5XtiLzilNVUuisa0UhHJaQ67eOjQuZac/nbJojptBaBa9D +zf/1TLAd7mTTk8TBosouSd13gCX262EJb2n2hOYl2hx59Tky1CYNoHpQYdfH+u0U +bwOfMEzbrrD8HyqF1eeEu8EagWKc5piByOWn6smBjpU2uBqBh8N6MH/mY5aDsqGB +wkexiAsq/sKbPi0iFJ0CWmSls69Twe0vmW+THh7SWfGhbUxGwHsuYYIQnjcTHSMg +HZHhB0RrjqiYtyfSvqo0mSOgwtZAX0dg4uCyZtPPeRo9X5qIl8DVPVtKqybdVsX1 +06Pt8EaSSFlxPJN/giw86GflsP2hL3ttjB/p3/8oa1ffgl+Z8xjrkwlDBnKL2BI5 +sfad/l//oPd41IheNji6C2TdnZYyWRpDumes1Jr924E25bAcy3lI82QDdpyHSp4v +9+LG3NRpqLzQ/LpgBZnpjSnVMN1xBdwXpJ87omKM+fzgG0qiScBKko8jGYeRr/IR +sX1ofHUIty57zzWEUc5MN/zgtnIxY3ZSHs1erfnZm5JDou+1YblxkiMGZQARAQAB +tCBaYWNoIEtpbWJlcmcgPHphY2hna0BhcGFjaGUub3JnPokCTgQTAQgAOBYhBCKK +3ZMsshhyPmHQngQwcRJjJfDsBQJc3IFrAhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4B +AheAAAoJEAQwcRJjJfDs1hgQAMMQkFmjWIOvHC63qJPeMP/BwgSeyANczohTWhBk +fkpoxFHW5nGxOJePyHqINufy0G0eQLWzBQp/VFTTFadS/tPL0gZi2JvOXSLmroLa +LBGcCkChTjXj1ah7pDq9J3KlHeOY6fzDDA9+3+8XRXMaROL2Bi3ax3jk0hhQ9iMh +jH8iROHsAZAJg7CQ9MUCnlJlyS2mCNVOWBlsKWjoOW4s3pptvH8VBXnYRQfcTZbN +VDwbwxvpq8cDgK+YJ+53MsgxCiYXQ0xtYOwYeThOdf29hRdOJBLwPpbZIlwCgYF5 +xeZinRCh10eVc8UvSFurXT4i91XulWs1pSMXv6xEV5LkXINzpfU8zzfCdpMYpq7n +j7SJc+x5EhjUt8LQsa6ohVHsmEMQpEuX18uOxaA+WE2XAn4Y5kty9Hm1g6osSFEl +40eGio9rz3zd63cHU3c9ccKAg0oxtWnsAw/kvSfExCg40kCvtrmrSoZt+iQnnw1l +isvYtrzEjQiufbF4wryIfFcP/BZEZ4KeDvC6cU6tKyH8gYlW89EYfW92E4rfvofs +o7i0Lo4vVVBmVqCoCORcsJsfRvZs5BpG4SXGD9kOsrz+LVmm3Y7vf68ycc7cTbKs +rlIGLbMzIfk43s9i+C9IRgCo7PjImi7cbEBi52FvriR37ispDhvm2WX4i9o1DNl0 +FPeiuQINBFzcgWsBEADeqh6lsnBd0dzCNANIAhR8EsbRxuy3ihg52RVZR3HcUYyl +osMlXWhGxz2HD/Tt6hKMv32oUXik14gIY9hsyXEBZpDsQAwW3fbyM2JJhTmup/ag +F7Cy76NL1GvgPQ3soClGoXNn6m+W6wDW0shR0yDCfPpWY/h56Ub+7CTcfk4STbgV +Erib+fr7sFlzGobpCOWq80k7wb70ak1J57sEBg1wJFVyme9OJGUfkUAznX8TPFy4 +o4esggkc9Wnr4hzMpk5n6/J0YVl//2YvVFlY8fvnw2pmHnyJpoMqf6ZomT2YhSJ6 +Z0Ni+tyr/CCTNpF7lKvRn9TEbP9Ll2YAWXFGK+MAz6fGmijm7V39IQN/F6duXq+5 +xrsN+tB8udxSRYgp3jG3jSdWvryC8XYyYcXKhgPhq5+PNzRCbkh2/rqqrgAjOOFS +9kIMi95r1Rtb72CUevShHfC1WOulp9fzjt/zA5IpRlI5944CbBVa5wpAS4WyEBTb +FCD9SCK2We2Nu7lJAfjdgKYQeVu3USOQaCRib/eNv2o28veqERV1ZlquefO1qtDa +rstGbiFMI+CRMN5E6Y57gAxaOGud+H3o+DhdrzSTTHGLXsge/upjnRaS6PN3eEaK +gLtUKYMuXjNBoSMSNylSkGxvlJSUQWAPEbn6fHUaZSSIugrl9Z9/TcHQTxCd5wAR +AQABiQI2BBgBCAAgFiEEIordkyyyGHI+YdCeBDBxEmMl8OwFAlzcgWsCGwwACgkQ +BDBxEmMl8Oyc1g//Z7wnTImcKyFaNNxGMHic5NPpf3e+zIqsLnDqKnUkiWxXMstE 
+3pY0aX8b44fXy/QrUm18jC5LdDd/qH6sXdbBb9hBPwXmp6/WT5vSCn+Bnrj7iPE5
+DWr5mM1cisosn20UGQnb9wVGNrVD3GUwylQ58mu6ehbPTQ3Jah1DtVqpx8YfN9fR
+W5PPomKd8zRnYQ7i7nwkj32hWmBW0Kd+lgtCUunT2diic3w1PkQU4IL4Cr9wL4lO
+iLN5YVD4D2JeUC7t4mB5EJ3UT1/IcFYIwF0ULYhD1Ke00JocQ6pEVaGkg4Ll4wLN
+uO7kSOWbhvHwpH2uPtsAfTJ0h3MFsOSLypN+BdEiLNQ54c4U1zQ9BHzk6xJ3U37U
+eSQr5nNq3ceqjtH//7PR5/+OpeTbYuS/75LcujyKP73SqoZLS+41MNzmLnG5nBhJ
+dROfxO+mRLuY7fgZWlDBLAfe8Rmwfd7pxWusggBQ1MQjvweYRbBXwVxog04hL4uY
+Z0/2Tt5t15CGyVCY7HpvnpPTmHKHcSKlRiFHOp+kNLWbxFC0ryMBntYjinAvk1xy
+ihUvH40rKlgxbV3+KS/Ew88D2tJ2JGCACx2yzS3trw+oUXugoaAQiileXQfu47SI
+dM5GPiAt2UECZ9v4WthkRGygnoPiL/4IyDFsS9yDX7mOBSycVmT5R+VXmOs=
+=z8mO
-----END PGP PUBLIC KEY BLOCK-----
diff --git a/Makefile b/Makefile
index b578683a74b6..741c5f0190f2 100644
--- a/Makefile
+++ b/Makefile
@@ -94,7 +94,7 @@ endif
 # CFLAGS for debug
 ifeq ($(DEBUG), 1)
-	CFLAGS += -g -O0
+	CFLAGS += -g -O0 -D_GLIBCXX_ASSERTIONS
 else
 	CFLAGS += -O3 -DNDEBUG=1
 endif
@@ -594,7 +594,7 @@ lint: cpplint rcpplint jnilint pylint
 cpplint:
 	3rdparty/dmlc-core/scripts/lint.py mxnet cpp include src plugin cpp-package tests \
-	--exclude_path src/operator/contrib/ctc_include
+	--exclude_path src/operator/contrib/ctc_include include/mkldnn
 
 pylint:
 	python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py
@@ -612,7 +612,7 @@ doxygen:
 
 # Cython build
 cython:
-	cd python; python setup.py build_ext --inplace --with-cython
+	cd python; $(PYTHON) setup.py build_ext --inplace --with-cython
 
 cython2:
 	cd python; python2 setup.py build_ext --inplace --with-cython
@@ -677,6 +677,26 @@ rclean:
 	$(RM) -r R-package/src/image_recordio.h R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
 		R-package/inst R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
 
+build/rat/apache-rat/target/apache-rat-0.13-SNAPSHOT.jar:
+	mkdir -p build
+	svn co http://svn.apache.org/repos/asf/creadur/rat/branches/0.12-release/ build/rat; \
+	cd build/rat; \
+	mvn -Dmaven.test.skip=true install;
+
+ratcheck: build/rat/apache-rat/target/apache-rat-0.13-SNAPSHOT.jar
+	exec 5>&1; \
+	RAT_JAR=build/rat/apache-rat/target/apache-rat-0.13-SNAPSHOT.jar; \
+	OUTPUT=$$(java -jar $$RAT_JAR -E tests/nightly/apache_rat_license_check/rat-excludes -d .|tee >(cat - >&5)); \
+	ERROR_MESSAGE="Printing headers for text files without a valid license header"; \
+	echo "-------Process The Output-------"; \
+	if [[ $$OUTPUT =~ $$ERROR_MESSAGE ]]; then \
+		echo "ERROR: RAT Check detected files with unknown licenses. Please fix and run test again!"; \
+		exit 1; \
+	else \
+		echo "SUCCESS: There are no files with an Unknown License."; \
+	fi
+
+
 ifneq ($(EXTRA_OPERATORS),)
 clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~
diff --git a/NEWS.md b/NEWS.md
index ad842ac84786..59f8de831c50 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -18,6 +18,24 @@
 MXNet Change Log
 ================
+## 1.4.1
+
+Apache MXNet (incubating) 1.4.1 is a maintenance release incorporating important bug fixes and performance improvements. All users of Apache MXNet (incubating) 1.4.0 are advised to upgrade. You can install Apache MXNet (incubating) 1.4.1 at the usual place. Please review these Release Notes to learn about the bug fixes.
+ +### Bug-fixes +* Java bug-fix cherry pick (#14834) +* Use DEFAULT macro in C APIs (#14767) (#14789) +* Set idx2name for Optimizer object (#14703) (#14772) +* Add pin_device_id option to Gluon DataLoader (#14136) (#14771) +* Tidy up storage allocation and deallocation (#14480) (#14768) +* Add MXEnginePushAsync and MXEnginePushSync C APIs (#14615) (#14770) +* Less cudaGet/SetDevice calls in Gluon execution (#13764) +* Fix nightly build of 1.4.x (#14556) +* Memory fixes. Resolves #10867, and resolves #14080 (#14372) (#14586) +* Fixes for data links (#14526) +* Backport of Windows CI Fixes (#14420) + + ## 1.4.0 - [New Features](#new-features) diff --git a/README.md b/README.md index 3eea2e78fa54..f3e524c79540 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ How to Contribute What's New ---------- +* [Version 1.4.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.4.1) - MXNet 1.4.1 Patch Release. * [Version 1.4.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.4.0) - MXNet 1.4.0 Release. * [Version 1.3.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.3.1) - MXNet 1.3.1 Patch Release. * [Version 1.3.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.3.0) - MXNet 1.3.0 Release. diff --git a/amalgamation/README.md b/amalgamation/README.md index 2ecf1626c1e7..b58776e372aa 100644 --- a/amalgamation/README.md +++ b/amalgamation/README.md @@ -114,17 +114,17 @@ To Change ``` #ifdef __GNUC__ - #define MX_TREAD_LOCAL __thread + #define MX_THREAD_LOCAL __thread #elif __STDC_VERSION__ >= 201112L - #define MX_TREAD_LOCAL _Thread_local + #define MX_THREAD_LOCAL _Thread_local #elif defined(_MSC_VER) - #define MX_TREAD_LOCAL __declspec(thread) + #define MX_THREAD_LOCAL __declspec(thread) #endif ``` To ``` -#define MX_TREAD_LOCAL __declspec(thread) +#define MX_THREAD_LOCAL __declspec(thread) ``` **To build arm32 compatible version (e.g. iPhone 5):** diff --git a/benchmark/__init__.py b/benchmark/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/benchmark/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmark/opperf/README.md b/benchmark/opperf/README.md new file mode 100644 index 000000000000..99c75be2bf7b --- /dev/null +++ b/benchmark/opperf/README.md @@ -0,0 +1,182 @@ + + + + + + + + + + + + + + + + + +# MXNet Operator Performance Benchmarks + +A Python utility for benchmarking and profiling individual MXNet operator execution. + +With this utility, for each MXNet operator you can get the following details: + +**Timing** +1. Forward execution time +2. Backward execution time +3. Time spent for memory management + +**Memory** +1. 
Total memory allocated
+
+# Motivation
+
+Benchmarks are usually done end-to-end for a given network architecture, for example ResNet-50 benchmarks on ImageNet data. This is a good measure of the overall performance and health of a deep learning framework. However, it is important to note the following factors:
+1. Users use many operators that are not part of a standard network like ResNet, such as tensor manipulation operators like mean, max, topk, argmax, sort etc.
+2. A standard network architecture like ResNet-50 is made up of many operators, e.g. Convolution2D, Softmax, Dense and more. Consider the following scenarios:
+    1. We improve the performance of the Convolution2D operator, but due to a bug, Softmax performance goes down. End-to-end benchmarks may still look fine, so we can miss the performance degradation of a single operator, which can accumulate and become untraceable.
+    2. You need to see which operator in a given network takes the most time so you can plan optimization work. With end-to-end benchmarks, it is hard to get such fine-grained numbers at the operator level.
+3. We need to know how different operators perform on different hardware infrastructure (e.g. CPU with MKL-DNN, GPU with NVIDIA CUDA and cuDNN). With these details, we can plan optimization work at the operator level, which can significantly boost end-to-end performance.
+4. You may want nightly performance tests across all operators of a deep learning framework to catch regressions early.
+5. We can integrate this framework with a CI/CD system to run per-operator performance tests for PRs. For example, when a PR modifies the kernel of TransposeConv2D, we can run benchmarks of the TransposeConv2D operator to verify performance.
+
+Hence, this utility provides the functionality for users and developers of deep learning frameworks to easily run benchmarks for individual operators.
+
+# How to use
+
+## Prerequisites
+
+This utility uses the MXNet profiler under the hood to fetch compute and memory metrics. Hence, you need to build MXNet with the `USE_PROFILER=1` flag.
+
+Make sure to build the flavor of MXNet on which you would like to measure operator performance, for example with/without MKL, or with CUDA 9 or 10.1. Finally, you need to add the path of your cloned MXNet repository to the PYTHONPATH:
+
+```
+export PYTHONPATH=$PYTHONPATH:/path/to/incubator-mxnet/
+```
+
+## Usecase 1 - Run benchmarks for all the operators
+
+The command below runs all the MXNet operator (NDArray) benchmarks with default inputs and saves the final result as JSON in the given file:
+
+```
+python incubator-mxnet/benchmark/opperf/opperf.py --output-format json --output-file mxnet_operator_benchmark_results.json
+```
+
+**Other Supported Options:**
+
+1. **output-format** : `json` or `md` for markdown file output.
+
+2. **ctx** : `cpu` or `gpu`. By default, cpu on a CPU machine, gpu(0) on a GPU machine. You can override and set the global context for all operator benchmarks. Example: --ctx gpu(2).
+
+3. **dtype** : By default, `float32`. You can override and set the global dtype for all operator benchmarks. Example: --dtype float64.
+
+## Usecase 2 - Run benchmarks for all the operators in a specific category
+
+For example, to run benchmarks for all NDArray broadcast binary operators (broadcast_add, broadcast_mod, broadcast_pow etc.), you just run the following python script:
+
+```
+#!/usr/bin/python
+from benchmark.opperf.nd_operations.binary_operators import run_mx_binary_broadcast_operators_benchmarks
+
+# Run all Binary Broadcast operations benchmarks with default input values
+print(run_mx_binary_broadcast_operators_benchmarks())
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like this:
+
+```
+{'broadcast_mod': [{'avg_time_forward_broadcast_mod': 28.7063, 'avg_time_mem_alloc_cpu/0': 4194.3042,
+                    'avg_time_backward_broadcast_mod': 12.0954, 'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}},
+                   {'avg_time_forward_broadcast_mod': 2.7332, 'avg_time_mem_alloc_cpu/0': 400.0,
+                    'avg_time_backward_broadcast_mod': 1.1288, 'inputs': {'lhs': (10000, 10), 'rhs': (10000, 10)}},
+                   {'avg_time_forward_broadcast_mod': 30.5322, 'avg_time_mem_alloc_cpu/0': 4000.0,
+                    'avg_time_backward_broadcast_mod': 225.0255, 'inputs': {'lhs': (10000, 1), 'rhs': (10000, 100)}}],
+ 'broadcast_power': [{'avg_time_backward_broadcast_power': 49.5871, 'avg_time_forward_broadcast_power': 18.0954,
+                      'avg_time_mem_alloc_cpu/0': 4194.3042, 'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}},
+                     {'avg_time_backward_broadcast_power': 4.6623, 'avg_time_forward_broadcast_power': 1.8283,
+                      'avg_time_mem_alloc_cpu/0': 400.0, 'inputs': {'lhs': (10000, 10), 'rhs': (10000, 10)}},
+                     {'avg_time_backward_broadcast_power': 279.922, 'avg_time_forward_broadcast_power': 24.4621,
+                      'avg_time_mem_alloc_cpu/0': 4000.0, 'inputs': {'lhs': (10000, 1), 'rhs': (10000, 100)}}],
+.....
+.....
+```
+
+## Usecase 3 - Run benchmarks for a specific operator
+For example, to run benchmarks for the `nd.add` operator in MXNet, you just run the following python script:
+
+```
+#!/usr/bin/python
+import mxnet as mx
+from mxnet import nd
+
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
+
+add_res = run_performance_test(nd.add, run_backward=True, dtype='float32', ctx=mx.cpu(),
+                               inputs=[{"lhs": (1024, 1024),
+                                        "rhs": (1024, 1024)}],
+                               warmup=10, runs=25)
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like this:
+
+```
+{'add': [{'avg_time_mem_alloc_cpu/0': 102760.4453,
+          'avg_time_forward_broadcast_add': 4.0372,
+          'avg_time_backward_broadcast_add': 5.3841,
+          'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}}]}
+```
+
+## Usecase 3.1 - Run benchmarks for a group of operators with the same input
+For example, to run benchmarks for the `nd.add` and `nd.sub` operators in MXNet with the same set of inputs, you just run the following python script:
+
+```
+#!/usr/bin/python
+import mxnet as mx
+from mxnet import nd
+
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
+
+add_res = run_performance_test([nd.add, nd.sub], run_backward=True, dtype='float32', ctx=mx.cpu(),
+                               inputs=[{"lhs": (1024, 1024),
+                                        "rhs": (1024, 1024)}],
+                               warmup=10, runs=25)
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like this:
+
+```
+{'add': [{'avg_time_mem_alloc_cpu/0': 102760.4453,
+          'avg_time_forward_broadcast_add': 4.0372,
+          'avg_time_backward_broadcast_add': 5.3841,
+          'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}}],
+ 'subtract': [{'avg_time_forward_broadcast_sub': 5.5137,
+               'avg_time_mem_alloc_cpu/0': 207618.0469,
+               'avg_time_backward_broadcast_sub': 7.2976,
+               'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}}
+              ]}
+```
+
+# How does it work under the hood?
+
+Under the hood, the utility executes each NDArray operator using randomly generated data, and uses the MXNet profiler to get a summary of the operator execution:
+1. Memory
+2. Computation time (forward, backward)
+
+See the design proposal document for more details - https://cwiki.apache.org/confluence/display/MXNET/MXNet+Operator+Benchmarks
+
+**NOTE:**
+
+This utility queries the MXNet operator registry to fetch all operators registered with MXNet, generates inputs and runs benchmarks.
+However, fully automated tests are enabled only for simpler operators such as broadcast operators, element_wise operators etc. For readability and to give more control to users, complex operators such as convolution (2D, 3D), Pooling and Recurrent are not fully automated but are expressed as default rules.
+See `utils/op_registry_utils.py` for more details.
+
+# TODO
+
+All contributions are welcome. Below is the list of desired features:
+
+1. Cover all MXNet operators.
+2. Enhance the MXNet profiler with additional APIs to programmatically fetch and process profiler data.
+3. Integration with a CI/CD system to run operator benchmarks for PR builds and nightly builds.
+4. Dashboards and other modes of presentation of results for analyzing and planning tasks such as operator performance improvements.
+5. Randomized tensor shape generation for profiling to identify bottlenecks in the operators.
diff --git a/benchmark/opperf/__init__.py b/benchmark/opperf/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmark/opperf/custom_operations/__init__.py b/benchmark/opperf/custom_operations/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/custom_operations/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
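For reference, the profiler-based measurement flow described in the opperf README above can be sketched in a few lines. This is a minimal illustration assuming the MXNet 1.x `mx.profiler` API; the operator, tensor shapes, and output file name here are illustrative and are not part of this diff's utilities:

```python
import mxnet as mx
from mxnet import nd, profiler

# Configure the profiler to record all events and keep aggregate statistics.
profiler.set_config(profile_all=True, aggregate_stats=True, filename='op_profile.json')

lhs = nd.random.uniform(shape=(1024, 1024))
rhs = nd.random.uniform(shape=(1024, 1024))

profiler.set_state('run')
out = nd.broadcast_add(lhs, rhs)
out.wait_to_read()            # MXNet is asynchronous; synchronize so the op is actually timed
profiler.set_state('stop')

# Aggregated per-operator time and memory statistics, similar to what
# opperf post-processes into its benchmark results.
print(profiler.dumps())
```

Because MXNet executes operators asynchronously, the explicit `wait_to_read()` call is what guarantees the operator has finished before profiling stops.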
diff --git a/benchmark/opperf/custom_operations/custom_operations.py b/benchmark/opperf/custom_operations/custom_operations.py new file mode 100644 index 000000000000..f26aed9b5b28 --- /dev/null +++ b/benchmark/opperf/custom_operations/custom_operations.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx + +""" +MXNet's Custom Operator Benchmark Tests. + +It does a simple element wise addition to make sure computation +is not too much and we can observe custom operator logistics overhead. +""" + + +# 1. Define Custom Operator - Element wise Addition Multiplication +class CustomAddOne(mx.operator.CustomOp): + def forward(self, is_train, req, in_data, out_data, aux): + self.assign(out_data[0], req[0], in_data[0] + 1) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + self.assign(in_grad[0], req[0], out_grad[0]) + + +@mx.operator.register("CustomAddOne") +class CustomAddOneProp(mx.operator.CustomOpProp): + def __init__(self): + super(CustomAddOneProp, self).__init__(need_top_grad=True) + + def list_arguments(self): + return ['in'] + + def list_outputs(self): + return ['output'] + + def infer_shape(self, in_shape): + # inputs, outputs, aux + return [in_shape[0]], [in_shape[0]], [] + + def create_operator(self, ctx, shapes, dtypes): + return CustomAddOne() + + +"""Helps to benchmark MXNet's Custom Op for Element wise addition on a (1000, 1) tensor. + Performs both forward and backward operation. + + This test mainly uncovers core custom op overhead in MXNet. + + Benchmark will be done on the following operation: + native_add -> native_add -> native_add -> CUSTOM_ADD -> native_add -> native_add -> native_add + + By default run on 'float32' precision. +""" + +# TODO diff --git a/benchmark/opperf/nd_operations/README.md b/benchmark/opperf/nd_operations/README.md new file mode 100644 index 000000000000..7aa220c4368a --- /dev/null +++ b/benchmark/opperf/nd_operations/README.md @@ -0,0 +1,143 @@ + + + + + + + + + + + + + + + + + +# TODO - Operators not covered in this Benchmark Utility + +**NOTE:** This list is AUTOGENERATED when you run opperf.py utility + +0. LogisticRegressionOutput +1. broadcast_axes +2. ravel_multi_index +3. multi_sgd_mom_update +4. smooth_l1 +5. scatter_nd +6. reshape +7. one_hot +8. linalg_potri +9. mp_sgd_update +10. multi_sgd_update +11. signum_update +12. Convolution_v1 +13. repeat +14. Custom +15. softmax_cross_entropy +16. SwapAxis +17. norm +18. Softmax +19. rmspropalex_update +20. fill_element_0index +21. cast +22. UpSampling +23. BatchNorm_v1 +24. CTCLoss +25. LRN +26. cast_storage +27. pick +28. GridGenerator +29. sample_multinomial +30. Activation +31. LinearRegressionOutput +32. Pooling_v1 +33. ftml_update +34. Crop +35. ElementWiseSum +36. diag +37. Reshape +38. Pad +39. 
linalg_gemm2 +40. crop +41. rmsprop_update +43. RNN +44. argmin +45. SoftmaxOutput +46. linalg_extractdiag +47. sgd_mom_update +48. SequenceLast +49. Deconvolution +50. flip +51. SequenceReverse +52. swapaxes +53. SVMOutput +54. linalg_trsm +55. where +56. SoftmaxActivation +57. signsgd_update +58. slice +59. linalg_gelqf +60. softmin +61. linalg_gemm +62. BilinearSampler +63. mp_sgd_mom_update +64. choose_element_0index +65. tile +66. space_to_depth +67. gather_nd +68. argsort +69. SequenceMask +70. reshape_like +71. slice_axis +72. stack +73. topk +74. khatri_rao +75. multi_mp_sgd_update +76. linalg_sumlogdiag +77. broadcast_to +78. IdentityAttachKLSparseReg +79. sort +80. SpatialTransformer +81. Concat +82. uniform +83. InstanceNorm +84. expand_dims +85. multi_mp_sgd_mom_update +86. reverse +87. add_n +88. clip +89. ctc_loss +90. shape_array +91. unravel_index +92. linalg_potrf +93. Cast +94. broadcast_like +95. Embedding +96. linalg_makediag +97. transpose +98. linalg_syrk +99. squeeze +101. ROIPooling +102. ftrl_update +103. SliceChannel +104. slice_like +105. depth_to_space +106. linalg_maketrian +108. pad +109. LayerNorm +110. split +111. MAERegressionOutput +112. Correlation +113. argmax +114. batch_take +115. L2Normalization +116. broadcast_axis +117. linalg_trmm +118. linalg_extracttrian +119. normal +120. take +121. MakeLoss +122. sgd_update +123. adam_update +124. concat \ No newline at end of file diff --git a/benchmark/opperf/nd_operations/__init__.py b/benchmark/opperf/nd_operations/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/benchmark/opperf/nd_operations/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py new file mode 100644 index 000000000000..7f93621eb2ec --- /dev/null +++ b/benchmark/opperf/nd_operations/binary_operators.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Performance benchmark tests for MXNet NDArray Binary Operations - covers both broadcast and element_wise. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. + +Below 20 binary broadcast Operators are covered: + +['broadcast_add', 'broadcast_div', 'broadcast_equal', 'broadcast_greater', 'broadcast_greater_equal', +'broadcast_hypot', 'broadcast_lesser', 'broadcast_lesser_equal', 'broadcast_logical_and', +'broadcast_logical_or', 'broadcast_logical_xor', 'broadcast_maximum', 'broadcast_minimum', +'broadcast_minus', 'broadcast_mod', 'broadcast_mul', 'broadcast_not_equal', 'broadcast_plus', +'broadcast_power', 'broadcast_sub'] + +Below 4 binary element_wise Operators are covered: +['elemwise_add', 'elemwise_mul', 'elemwise_sub', 'elemwise_div'] + +""" +import mxnet as mx + +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.op_registry_utils import get_all_broadcast_binary_operators, \ + get_all_elemen_wise_binary_operators + + +def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the binary + broadcast operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Binary Broadcast Operators + mx_binary_broadcast_ops = get_all_broadcast_binary_operators() + # Run benchmarks + mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, warmup, runs) + return mx_binary_op_results + + +def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the binary + element_wise operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Binary Element_wise Operators + mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators() + # Run benchmarks + mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, warmup, runs) + return mx_binary_op_results diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py new file mode 100644 index 000000000000..69a0f4c23121 --- /dev/null +++ b/benchmark/opperf/nd_operations/gemm_operators.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray GEMM Operators. + +1. dot +2. batch_dot + +TODO +3. As part of default tests, following needs to be added: + 3.1 Sparse dot. (csr, default) -> row_sparse + 3.2 Sparse dot. (csr, row_sparse) -> default + 3.3 With Transpose of lhs + 3.4 With Transpose of rhs +4. 1D array: inner product of vectors +""" + + +def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the GEMM + operators (dot, batch_dot) in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Benchmark tests for dot and batch_dot operators + dot_benchmark_res = run_performance_test( + [nd.dot], run_backward=True, + dtype=dtype, ctx=ctx, + inputs=[{"lhs": (1024, 1024), + "rhs": (1024, 1024)}, + {"lhs": (1000, 10), + "rhs": (1000, 10), + "transpose_b": True}, + {"lhs": (1000, 1), + "rhs": (100, 1000), + "transpose_a": True, + "transpose_b": True}], + warmup=warmup, runs=runs) + + batch_dot_benchmark_res = run_performance_test( + [nd.batch_dot], run_backward=True, + dtype=dtype, ctx=ctx, + inputs=[{"lhs": (32, 1024, 1024), + "rhs": (32, 1024, 1024)}, + {"lhs": (32, 1000, 10), + "rhs": (32, 1000, 10), + "transpose_b": True}, + {"lhs": (32, 1000, 1), + "rhs": (32, 100, 1000), + "transpose_a": True, + "transpose_b": True}], + warmup=warmup, runs=runs) + + # Prepare combined results for GEMM operators + mx_gemm_op_results = merge_map_list(dot_benchmark_res + batch_dot_benchmark_res) + return mx_gemm_op_results diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py new file mode 100644 index 000000000000..16ea2c6f64f4 --- /dev/null +++ b/benchmark/opperf/nd_operations/nn_activation_operators.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray Activation Operators. + +1. LeakyRelu + 1.1 Elu + 1.2 Selu + 1.3 Leaky + 1.4 PRelu + 1.5 RRelu +3. Hard_Sigmoid +4. Softmax +5. Log_Softmax + +""" + + +def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the activation + operators (relu, sigmoid, softmax) in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Relu and its variation + relu_benchmark_res = run_performance_test([nd.LeakyReLU], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (1024, 1024), "act_type": "leaky", "slope": 0.1}, + {"data": (10000, 1), "act_type": "leaky", "slope": 0.1}, + {"data": (10000, 100), "act_type": "leaky", "slope": 0.1}, + {"data": (1024, 1024), "act_type": "elu", "slope": 0.1}, + {"data": (10000, 1), "act_type": "elu", "slope": 0.1}, + {"data": (10000, 100), "act_type": "elu", "slope": 0.1}, + {"data": (1024, 1024), "act_type": "selu"}, + {"data": (10000, 1), "act_type": "selu"}, + {"data": (10000, 100), "act_type": "selu"}, + {"data": (1024, 1024), "act_type": "prelu", "gamma": (1, 1024)}, + {"data": (10000, 1), "act_type": "prelu", "gamma": (1, 1)}, + {"data": (10000, 100), "act_type": "prelu", "gamma": (1, 100)} + ], + warmup=warmup, + runs=runs) + + # Sigmoid => Covered as part of Unary ops + # Hard_Sigmoid + hard_sigmoid_benchmark_res = run_performance_test([nd.hard_sigmoid], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (1024, 1024), "alpha": 0.25, "beta": 0.5}, + {"data": (10000, 1), "alpha": 0.25, "beta": 0.5}, + {"data": (10000, 100), "alpha": 0.25, "beta": 0.5} + ], + warmup=warmup, + runs=runs) + + # Softmax, LogSoftmax + softmax_benchmark_res = run_performance_test([nd.softmax, nd.log_softmax], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (1024, 1024), "axis": -1, "temperature": 0.5}, + {"data": (10000, 1), "axis": -1, "temperature": 0.5}, + {"data": (10000, 100), "axis": -1, "temperature": 0.5} + ], + warmup=warmup, + runs=runs) + + # Prepare combined results + mx_activation_op_results = merge_map_list(relu_benchmark_res + hard_sigmoid_benchmark_res + softmax_benchmark_res) + return mx_activation_op_results diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py new file mode 100644 index 000000000000..d91b285f41aa --- /dev/null +++ b/benchmark/opperf/nd_operations/nn_basic_operators.py @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray basic NN Operators. + +1. FullyConnected +2. Dropout +3. BatchNorm + +""" + + +def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + # FullyConnnected operator benchmarks + fc_benchmark_res = run_performance_test([nd.FullyConnected], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (32, 3, 256, 256), + "num_hidden": 64, + "weight": (64, 3 * 256 * 256), + "bias": (64,), + "flatten": True}, + {"data": (32, 3, 256, 256), + "num_hidden": 64, + "weight": (64, 256), + "bias": (64,), + "flatten": False}], + warmup=warmup, + runs=runs) + + # Dropout benchmarks + dropout_benchmark_res = run_performance_test([nd.Dropout], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (32, 3, 256, 256), + "p": 0.5, + "mode": "always"}, + {"data": (10000, 10), + "p": 0.5, + "mode": "always"}], + warmup=warmup, + runs=runs) + # BatchNorm benchmarks + batchnorm_benchmark_res = run_performance_test([nd.BatchNorm], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (32, 3, 256, 256), + "gamma": (3,), + "beta": (3,), + "moving_mean": (3,), + "moving_var": (3,)}, + {"data": (32, 3, 10000, 10), + "gamma": (3,), + "beta": (3,), + "moving_mean": (3,), + "moving_var": (3,)}], + warmup=warmup, + runs=runs) + # Prepare combined results + mx_basic_nn_results = merge_map_list(fc_benchmark_res + dropout_benchmark_res + batchnorm_benchmark_res) + return mx_basic_nn_results diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py new file mode 100644 index 000000000000..e4749ec90de4 --- /dev/null +++ b/benchmark/opperf/nd_operations/nn_conv_operators.py @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray Convolution and Pooling Operators. + +MXNet NDArray Pooling Operators + +1. MaxPool1D +2. MaxPool2D +3. SumPool1D +4. SumPool2D +4. AvgPool1D +5. AvgPool2D +6. 
GlobalMaxPool1D +7. GlobalMaxPool2D +8. GlobalAvgPool1D +9. GlobalAvgPool2D +10.GlobalSumPool1D +11.GlobalSumPool2D + +(Under the hood uses mx.nd.pooling) + +MXNet NDArray NN Convolution Operators + +1. Conv1D +2. Conv2D +3. Conv1DTranspose (DeConvolution) +4. Conv2DTranspose (DeConvolution) + +(Under the hood uses mx.nd.convolution, mx.nd.Deconvolution) + +""" + + +def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + pool_types = ['avg', 'max', 'sum'] + global_pool_types = [0, 1] + + # Run 1D and 2D Pooling performance runs + pool1d_benchmark_res = [] + pool2d_benchmark_res = [] + for pool_type in pool_types: + for global_pool in global_pool_types: + for pool1d_data in [(32, 3, 256), (32, 3, 64)]: + pool1d_benchmark_res += run_performance_test([nd.Pooling], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": pool1d_data, + "kernel": 3, + "pool_type": pool_type, + "global_pool": global_pool, + "stride": 1, + "pad": 1, + "layout": 'NCW'} + ], + warmup=warmup, + runs=runs) + for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + pool2d_benchmark_res += run_performance_test([nd.Pooling], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": pool2d_data, + "kernel": (3, 3), + "pool_type": pool_type, + "global_pool": global_pool, + "stride": (1, 1), + "pad": (0, 0), + "layout": 'NCHW'} + ], + warmup=warmup, + runs=runs) + # Prepare combined results + mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res) + return mx_pooling_op_results + + +def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + # Conv1D Benchmarks + conv1d_benchmark_res = [] + for conv_data in [(32, 3, 256), (32, 3, 64)]: + conv1d_benchmark_res += run_performance_test([nd.Convolution], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": conv_data, + "weight": (64, 3, 3,), + "bias": (64,), + "kernel": (3,), + "stride": (1,), + "dilate": (1,), + "pad": (0,), + "num_filter": 64, + "layout": 'NCW'} + ], + warmup=warmup, + runs=runs) + # Conv2D Benchmarks + conv2d_benchmark_res = [] + for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + conv2d_benchmark_res += run_performance_test([nd.Convolution], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": conv_data, + "weight": (64, 3, 3, 3), + "bias": (64,), + "kernel": (3, 3), + "stride": (1, 1), + "dilate": (1, 1), + "pad": (0, 0), + "num_filter": 64, + "layout": 'NCHW'} + ], + warmup=warmup, + runs=runs) + # Prepare combined results + mx_conv_op_results = merge_map_list(conv1d_benchmark_res + conv2d_benchmark_res) + return mx_conv_op_results diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py new file mode 100644 index 000000000000..bad8c8e4c040 --- /dev/null +++ b/benchmark/opperf/nd_operations/random_sampling_operators.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Performance benchmark tests for MXNet NDArray Random Sampling Operations. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. + +Below 16 random sampling Operators are covered: + +['random_exponential', 'random_gamma', 'random_generalized_negative_binomial', 'random_negative_binomial', +'random_normal', 'random_poisson', 'random_randint', 'random_uniform', 'sample_exponential', 'sample_gamma', +'sample_generalized_negative_binomial', 'sample_multinomial', 'sample_negative_binomial', 'sample_normal', +'sample_poisson', 'sample_uniform'] + +""" + +import mxnet as mx + +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators + + +def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the random sampling + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Random Sampling Operators + mx_random_sample_ops = get_all_random_sampling_operators() + # Run benchmarks + mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, warmup, runs) + return mx_random_sample_op_results diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py new file mode 100644 index 000000000000..5bfe06621136 --- /dev/null +++ b/benchmark/opperf/nd_operations/reduction_operators.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Performance benchmark tests for MXNet NDArray Reduction Operations. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. 
+ +Below 10 reduction Operators are covered: + +['max', 'max_axis', 'mean', 'min', 'min_axis', 'nanprod', 'nansum', 'prod', 'sum', 'sum_axis'] + +""" + +import mxnet as mx + +from benchmark.opperf.utils.op_registry_utils import get_all_reduction_operators +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks + + +def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the reduction + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Reduction Operators + mx_reduction_broadcast_ops = get_all_reduction_operators() + # Run benchmarks + mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, warmup, runs) + return mx_reduction_op_results diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py new file mode 100644 index 000000000000..a562eebf2a92 --- /dev/null +++ b/benchmark/opperf/nd_operations/unary_operators.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Performance benchmark tests for MXNet NDArray Unary Operations. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. + +Below 54 unary Operators are covered: + +['BlockGrad', 'Flatten', 'abs', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', +'argmax_channel', 'cbrt', 'ceil', 'cos', 'cosh', 'degrees', 'erf', 'erfinv', 'exp', 'expm1', 'fix', 'flatten', +'floor', 'gamma', 'gammaln', 'identity', 'log', 'log10', 'log1p', 'log2', 'logical_not', 'make_loss', 'negative', +'ones_like', 'radians', 'rcbrt', 'reciprocal', 'relu', 'rint', 'round', 'rsqrt', 'shuffle', 'sigmoid', 'sign', +'sin', 'sinh', 'size_array', 'softsign', 'sqrt', 'square', 'stop_gradient', 'tan', 'tanh', 'trunc', 'zeros_like'] + +""" + +import mxnet as mx + +from benchmark.opperf.utils.op_registry_utils import get_all_unary_operators +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks + + +def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the unary + operators in MXNet. 
+ + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Unary Operators + mx_unary_broadcast_ops = get_all_unary_operators() + # Run benchmarks + mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, warmup, runs) + return mx_unary_op_results diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py new file mode 100755 index 000000000000..34c6cf96b723 --- /dev/null +++ b/benchmark/opperf/opperf.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# -*- coding: utf-8 -*- + +"""Commandline utility to run operator benchmarks""" + +import argparse +import logging +import os +import sys + +import mxnet as mx + +from benchmark.opperf.nd_operations.unary_operators import run_mx_unary_operators_benchmarks +from benchmark.opperf.nd_operations.binary_operators import run_mx_binary_broadcast_operators_benchmarks, \ + run_mx_binary_element_wise_operators_benchmarks +from benchmark.opperf.nd_operations.gemm_operators import run_gemm_operators_benchmarks +from benchmark.opperf.nd_operations.random_sampling_operators import run_mx_random_sampling_operators_benchmarks +from benchmark.opperf.nd_operations.reduction_operators import run_mx_reduction_operators_benchmarks +from benchmark.opperf.nd_operations.nn_activation_operators import run_activation_operators_benchmarks +from benchmark.opperf.nd_operations.nn_conv_operators import run_pooling_operators_benchmarks, \ + run_convolution_operators_benchmarks +from benchmark.opperf.nd_operations.nn_basic_operators import run_nn_basic_operators_benchmarks + +from benchmark.opperf.utils.common_utils import merge_map_list, save_to_file +from benchmark.opperf.utils.op_registry_utils import get_operators_with_no_benchmark + + +def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32'): + """Run all the MXNet operators (NDArray) benchmarks. + + Returns + ------- + Dictionary of benchmark results. 
+ """ + mxnet_operator_benchmark_results = [] + + # *************************MXNET TENSOR OPERATOR BENCHMARKS***************************** + + # Run all Unary operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Binary Broadcast, element_wise operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, + dtype=dtype)) + mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, + dtype=dtype)) + + # Run all GEMM operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx, + dtype=dtype)) + + # Run all Random sampling operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Reduction operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # ************************ MXNET NN OPERATOR BENCHMARKS **************************** + + # Run all basic NN operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Activation operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Pooling operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Convolution operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # ****************************** PREPARE FINAL RESULTS ******************************** + final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results) + return final_benchmark_result_map + + +def _parse_mxnet_context(ctx): + if not ctx: + raise ValueError("Context cannot be null or empty") + + if ctx.lower() in ['cpu', 'gpu']: + return mx.context.Context(ctx) + elif ctx.lower().startwith('gpu('): + device_id = int(ctx[4:-1]) + return mx.gpu(device_id) + +def main(): + # 1. GET USER INPUTS + parser = argparse.ArgumentParser( + description='Run all the MXNet operators (NDArray) benchmarks') + + parser.add_argument('--ctx', type=str, default='cpu', + help='Global context to run all benchmarks. By default, cpu on a ' + 'CPU machine, gpu(0) on a GPU machine. ' + 'Valid Inputs - cpu, gpu, gpu(0), gpu(1)...') + parser.add_argument('--dtype', type=str, default='float32', help='DType (Precision) to run benchmarks. By default, ' + 'float32. Valid Inputs - float32, float64.') + parser.add_argument('-f', '--output-format', type=str, default='json', + choices=['json', 'md'], + help='Benchmark result output format. By default, json. ' + 'Valid Inputs - json, md') + + parser.add_argument('-o', '--output-file', type=str, default='./mxnet_operator_benchmarks.json', + help='Name and path for the ' + 'output file.') + + args = parser.parse_args() + logging.info(f"Running MXNet operator benchmarks with the following options: {args}") + assert not os.path.isfile(args.output_file), f"Output file {args.output_file} already exists." + + # 2. 
RUN BENCHMARKS + ctx = _parse_mxnet_context(args.ctx) + dtype = args.dtype + final_benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=args.dtype) + + # 3. PREPARE OUTPUTS + save_to_file(final_benchmark_results, args.output_file, args.output_format) + + # 4. Generate list of MXNet operators not covered in benchmarks + ops_not_covered = get_operators_with_no_benchmark(final_benchmark_results.keys()) + for idx, op in enumerate(ops_not_covered): + print(f"{idx}. {op}") + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md b/benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md new file mode 100644 index 000000000000..9e2ffee25e1c --- /dev/null +++ b/benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md @@ -0,0 +1,322 @@ + + + + + + + + + + + + + + + + + +# MXNet Operator Benchmarks + +## Settings + +1. MXNet - v1.4.1 +2. Instance - C5.8x + +| Operator | Avg Forward Time (ms) | Avg. Backward Time (ms) | Max Mem Usage (Storage) (Bytes) | Inputs | +| :---: | :---: | :---: | :---:| :--- | +| shuffle | 0.8901 | --- | 4194.3042 | {'data': (1024, 1024)} | +| shuffle | 1.2146 | --- | 40.0 | {'data': (10000, 1)} | +| shuffle | 1.8777 | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_equal | 0.006 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| broadcast_hypot | 0.0108 | 0.0135 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| ceil | 3.4305 | --- | 4194.3042 | {'data': (1024, 1024)} | +| ceil | 0.0507 | --- | 40.0 | {'data': (10000, 1)} | +| ceil | 3.317 | --- | 4000.0 | {'data': (10000, 100)} | +| sum | 32.4206 | 25.5443 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| sum | 0.3393 | 0.2507 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| sum | 31.0189 | 24.7422 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| broadcast_logical_xor | 0.0068 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| erf | 35.0669 | 16.5842 | 4194.3042 | {'data': (1024, 1024)} | +| erf | 0.3982 | 0.1734 | 40.0 | {'data': (10000, 1)} | +| erf | 29.4103 | 14.3537 | 4000.0 | {'data': (10000, 100)} | +| tanh | 11.2211 | 6.1798 | 2097.1521 | {'data': (1024, 1024)} | +| tanh | 0.1628 | 0.0622 | 40.0 | {'data': (10000, 1)} | +| tanh | 10.7941 | 6.0085 | 4000.0 | {'data': (10000, 100)} | +| arcsinh | 10.0168 | 8.5245 | 2097.1521 | {'data': (1024, 1024)} | +| arcsinh | 0.1111 | 0.0905 | 40.0 | {'data': (10000, 1)} | +| arcsinh | 9.4415 | 7.9082 | 2000.0 | {'data': (10000, 100)} | +| fix | 15.541 | --- | 4194.3042 | {'data': (1024, 1024)} | +| fix | 0.1615 | --- | 40.0 | {'data': (10000, 1)} | +| fix | 14.591 | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_maximum | 0.0097 | 0.0099 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| sin | 14.4123 | 16.5642 | 2097.1521 | {'data': (1024, 1024)} | +| sin | 0.1459 | 0.156 | 40.0 | {'data': (10000, 1)} | +| sin | 13.821 | 15.4752 | 2000.0 | {'data': (10000, 100)} | +| random_normal | 151.0089 | --- | 4194.3042 | {'shape': (1024, 1024)} | +| random_normal | 1.456 | --- | 40.0 | {'shape': (10000, 1)} | +| random_normal | 144.775 | --- | 2000.0 | {'shape': (10000, 100)} | +| sqrt | 3.3861 | 5.1123 | 2097.1521 | {'data': (1024, 1024)} | +| sqrt | 0.0393 | 0.0548 | 20.0 | {'data': (10000, 1)} | +| sqrt | 
3.3037 | 4.7883 | 2000.0 | {'data': (10000, 100)} | +| BlockGrad | 0.3275 | --- | 4194.3042 | {'data': (1024, 1024)} | +| BlockGrad | 0.0161 | --- | 40.0 | {'data': (10000, 1)} | +| BlockGrad | 0.3118 | --- | 4000.0 | {'data': (10000, 100)} | +| sample_exponential | 123.8534 | --- | 8388.6084 | {'lam': [1.0, 8.5], 'shape': (1024, 1024)} | +| sample_exponential | 1.3394 | --- | 80.0 | {'lam': [1.0, 8.5], 'shape': (10000, 1)} | +| sample_exponential | 118.4786 | --- | 8000.0 | {'lam': [1.0, 8.5], 'shape': (10000, 100)} | +| sample_gamma | 529.0305 | --- | 8388.6084 | {'alpha': [0.0, 2.5], 'shape': (1024, 1024), 'beta': [1.0, 0.7]} | +| sample_gamma | 5.7426 | --- | 80.0 | {'alpha': [0.0, 2.5], 'shape': (10000, 1), 'beta': [1.0, 0.7]} | +| sample_gamma | 496.0531 | --- | 8000.0 | {'alpha': [0.0, 2.5], 'shape': (10000, 100), 'beta': [1.0, 0.7]} | +| log2 | 12.3183 | 4.5842 | 2097.1521 | {'data': (1024, 1024)} | +| log2 | 0.1269 | 0.0459 | 40.0 | {'data': (10000, 1)} | +| log2 | 11.6719 | 4.2632 | 4000.0 | {'data': (10000, 100)} | +| broadcast_greater_equal | 0.0092 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| FullyConnected | 18.4677 | 21.6917 | 8.192 | {'data': (32, 3, 256, 256), 'num_hidden': 64, 'weight': (64, 196608), 'bias': (64,), 'flatten': True} | +| FullyConnected | 20.3379 | 38.8295 | 6291.4561 | {'data': (32, 3, 256, 256), 'num_hidden': 64, 'weight': (64, 256), 'bias': (64,), 'flatten': False} | +| cos | 14.8699 | 16.8678 | 2097.1521 | {'data': (1024, 1024)} | +| cos | 0.1511 | 0.1585 | 40.0 | {'data': (10000, 1)} | +| cos | 14.0109 | 15.5246 | 2000.0 | {'data': (10000, 100)} | +| broadcast_mul | 0.0075 | 0.0075 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| arccos | 21.5631 | 12.8768 | 4194.3042 | {'data': (1024, 1024)} | +| arccos | 0.1719 | 0.1084 | 40.0 | {'data': (10000, 1)} | +| arccos | 15.3153 | 7.9161 | 2000.0 | {'data': (10000, 100)} | +| stop_gradient | --- | --- | 4194.3042 | {'data': (1024, 1024)} | +| stop_gradient | --- | --- | 40.0 | {'data': (10000, 1)} | +| stop_gradient | --- | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_sub | 0.0078 | 0.0059 | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| random_poisson | 112.7425 | --- | 4194.3042 | {'shape': (1024, 1024)} | +| random_poisson | 1.0701 | --- | 40.0 | {'shape': (10000, 1)} | +| random_poisson | 114.3405 | --- | 2000.0 | {'shape': (10000, 100)} | +| rsqrt | 4.3564 | 7.0663 | 2097.1521 | {'data': (1024, 1024)} | +| rsqrt | 0.075 | 0.0861 | 40.0 | {'data': (10000, 1)} | +| rsqrt | 4.5076 | 6.6598 | 4000.0 | {'data': (10000, 100)} | +| nansum | 34.2019 | 57.1624 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| nansum | 0.3683 | 0.5326 | 0.002 | {'data': (10000, 1), 'axis': 0} | +| nansum | 32.9698 | 55.4243 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| hard_sigmoid | 7.5926 | 6.5839 | 2097.1521 | {'data': (1024, 1024), 'alpha': 0.25, 'beta': 0.5} | +| hard_sigmoid | 0.1086 | 0.0895 | 40.0 | {'data': (10000, 1), 'alpha': 0.25, 'beta': 0.5} | +| hard_sigmoid | 8.1285 | 6.6014 | 4000.0 | {'data': (10000, 100), 'alpha': 0.25, 'beta': 0.5} | +| softmax | 25.4074 | 9.4933 | 2097.1521 | {'data': (1024, 1024), 'axis': -1, 'temperature': 0.5} | +| softmax | 0.4022 | 0.3145 | 40.0 | {'data': (10000, 1), 'axis': -1, 'temperature': 0.5} | +| softmax | 25.604 | 9.4286 | 4000.0 | {'data': (10000, 100), 'axis': -1, 
'temperature': 0.5} | +| random_negative_binomial | 285.8721 | --- | 4194.3042 | {'k': 1, 'p': 1, 'shape': (1024, 1024)} | +| random_negative_binomial | 2.839 | --- | 40.0 | {'k': 1, 'p': 1, 'shape': (10000, 1)} | +| random_negative_binomial | 273.034 | --- | 2000.0 | {'k': 1, 'p': 1, 'shape': (10000, 100)} | +| BatchNorm | 66.062 | 88.4693 | 25165.8359 | {'data': (32, 3, 256, 256), 'gamma': (3,), 'beta': (3,), 'moving_mean': (3,), 'moving_var': (3,)} | +| BatchNorm | 101.3006 | 134.4362 | 38400.0117 | {'data': (32, 3, 10000, 10), 'gamma': (3,), 'beta': (3,), 'moving_mean': (3,), 'moving_var': (3,)} | +| Pooling | 0.5533 | 0.6485 | 49.152 | {'data': (32, 3, 256), 'kernel': 3, 'pool_type': 'avg', 'global_pool': 0, 'stride': 1, 'pad': 1, 'layout': 'NCW'} | +| radians | 3.3238 | 3.9704 | 4194.3042 | {'data': (1024, 1024)} | +| radians | 0.0391 | 0.0436 | 40.0 | {'data': (10000, 1)} | +| radians | 3.2462 | 3.775 | 4000.0 | {'data': (10000, 100)} | +| arctanh | 13.3211 | 6.3172 | 2097.1521 | {'data': (1024, 1024)} | +| arctanh | 0.1498 | 0.0683 | 40.0 | {'data': (10000, 1)} | +| arctanh | 12.5376 | 6.0177 | 2000.0 | {'data': (10000, 100)} | +| nanprod | 34.3464 | 57.9841 | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| nanprod | 0.3638 | 0.5336 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| nanprod | 32.83 | 55.2982 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| elemwise_add | 0.0065 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| cosh | 8.4872 | 10.6597 | 2097.1521 | {'data': (1024, 1024)} | +| cosh | 0.1015 | 0.1201 | 40.0 | {'data': (10000, 1)} | +| cosh | 8.3937 | 10.6244 | 4000.0 | {'data': (10000, 100)} | +| tan | 15.4508 | 6.0752 | 2097.1521 | {'data': (1024, 1024)} | +| tan | 0.1549 | 0.0591 | 40.0 | {'data': (10000, 1)} | +| tan | 14.6992 | 5.802 | 2000.0 | {'data': (10000, 100)} | +| broadcast_not_equal | 0.0054 | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| trunc | 3.493 | --- | 2097.1521 | {'data': (1024, 1024)} | +| trunc | 0.0505 | --- | 40.0 | {'data': (10000, 1)} | +| trunc | 3.1751 | --- | 2000.0 | {'data': (10000, 100)} | +| min_axis | 36.7382 | --- | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| min_axis | 0.4225 | --- | 0.004 | {'data': (10000, 1), 'axis': 0} | +| min_axis | 31.3261 | --- | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| random_uniform | 44.7633 | --- | 4194.3042 | {'low': 0, 'high': 5, 'shape': (1024, 1024)} | +| random_uniform | 0.4607 | --- | 40.0 | {'low': 0, 'high': 5, 'shape': (10000, 1)} | +| random_uniform | 42.9135 | --- | 4000.0 | {'low': 0, 'high': 5, 'shape': (10000, 100)} | +| abs | 4.3965 | 13.406 | 4194.3042 | {'data': (1024, 1024)} | +| abs | 0.0696 | 0.1374 | 40.0 | {'data': (10000, 1)} | +| abs | 4.3552 | 13.7197 | 4000.0 | {'data': (10000, 100)} | +| broadcast_lesser_equal | 0.0054 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| random_randint | 65.414 | --- | 4194.3042 | {'low': 0, 'high': 5, 'shape': (1024, 1024)} | +| random_randint | 0.6331 | --- | 40.0 | {'low': 0, 'high': 5, 'shape': (10000, 1)} | +| random_randint | 61.32 | --- | 4000.0 | {'low': 0, 'high': 5, 'shape': (10000, 100)} | +| log1p | 13.6758 | 5.2497 | 2097.1521 | {'data': (1024, 1024)} | +| log1p | 0.1493 | 0.0562 | 40.0 | {'data': (10000, 1)} | +| log1p | 12.9494 | 5.0609 | 2000.0 | {'data': (10000, 100)} | +| log | 11.9666 | 5.1096 | 4194.3042 | 
{'data': (1024, 1024)} | +| log | 0.1306 | 0.0588 | 40.0 | {'data': (10000, 1)} | +| log | 11.8985 | 5.0319 | 2000.0 | {'data': (10000, 100)} | +| round | 14.6427 | --- | 4194.3042 | {'data': (1024, 1024)} | +| round | 0.1424 | --- | 20.0 | {'data': (10000, 1)} | +| round | 13.58 | --- | 2000.0 | {'data': (10000, 100)} | +| sample_negative_binomial | 1263.9417 | --- | 8388.6084 | {'k': [20, 49], 'shape': (1024, 1024), 'p': [0.4, 0.77]} | +| sample_negative_binomial | 12.5213 | --- | 80.0 | {'k': [20, 49], 'shape': (10000, 1), 'p': [0.4, 0.77]} | +| sample_negative_binomial | 1207.5739 | --- | 8000.0 | {'k': [20, 49], 'shape': (10000, 100), 'p': [0.4, 0.77]} | +| max | 30.7008 | 55.863 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| max | 0.3287 | 0.5147 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| max | 29.4913 | 53.255 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| mean | 31.9337 | 35.9235 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| mean | 0.4088 | 0.3453 | 0.002 | {'data': (10000, 1), 'axis': 0} | +| mean | 31.5658 | 34.609 | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| sign | 10.1736 | 4.1682 | 4194.3042 | {'data': (1024, 1024)} | +| sign | 0.1251 | 0.0588 | 40.0 | {'data': (10000, 1)} | +| sign | 9.5196 | 3.9109 | 2000.0 | {'data': (10000, 100)} | +| broadcast_power | 0.0117 | 0.0112 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| argmax_channel | 10.9332 | --- | 4.096 | {'data': (1024, 1024)} | +| argmax_channel | 0.2703 | --- | 40.0 | {'data': (10000, 1)} | +| argmax_channel | 10.7759 | --- | 40.0 | {'data': (10000, 100)} | +| flatten | --- | --- | 4194.3042 | {'data': (1024, 1024)} | +| flatten | --- | --- | 40.0 | {'data': (10000, 1)} | +| flatten | --- | --- | 4000.0 | {'data': (10000, 100)} | +| ones_like | 2.127 | --- | 4194.3042 | {'data': (1024, 1024)} | +| ones_like | 0.028 | --- | 40.0 | {'data': (10000, 1)} | +| ones_like | 1.8846 | --- | 4000.0 | {'data': (10000, 100)} | +| negative | 2.6672 | --- | 4194.3042 | {'data': (1024, 1024)} | +| negative | 0.0321 | --- | 40.0 | {'data': (10000, 1)} | +| negative | 2.4958 | --- | 4000.0 | {'data': (10000, 100)} | +| elemwise_mul | 0.0054 | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| batch_dot | 766.5307 | 1365.6267 | 134217.7344 | {'lhs': (32, 1024, 1024), 'rhs': (32, 1024, 1024)} | +| batch_dot | 37.618 | 46.1098 | 128000.0 | {'lhs': (32, 1000, 10), 'rhs': (32, 1000, 10), 'transpose_b': True} | +| batch_dot | 1.3618 | 4.0882 | 6.4 | {'lhs': (32, 1000, 1), 'rhs': (32, 100, 1000), 'transpose_a': True, 'transpose_b': True} | +| sum_axis | 33.2033 | --- | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| sum_axis | 0.3155 | --- | 0.004 | {'data': (10000, 1), 'axis': 0} | +| sum_axis | 30.9792 | --- | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| floor | 3.5835 | --- | 4194.3042 | {'data': (1024, 1024)} | +| floor | 0.0499 | --- | 20.0 | {'data': (10000, 1)} | +| floor | 3.3519 | --- | 4000.0 | {'data': (10000, 100)} | +| logical_not | 3.0748 | --- | 4194.3042 | {'data': (1024, 1024)} | +| logical_not | 0.0319 | --- | 40.0 | {'data': (10000, 1)} | +| logical_not | 3.0173 | --- | 4000.0 | {'data': (10000, 100)} | +| log10 | 12.3647 | 4.5036 | 2097.1521 | {'data': (1024, 1024)} | +| log10 | 0.1647 | 0.0619 | 40.0 | {'data': (10000, 1)} | +| log10 | 11.7758 | 4.231 | 2000.0 | {'data': (10000, 100)} | +| rcbrt | 11.737 | 14.931 | 2097.1521 | {'data': (1024, 1024)} | +| rcbrt | 
0.1241 | 0.1421 | 40.0 | {'data': (10000, 1)} | +| rcbrt | 11.2254 | 14.2139 | 2000.0 | {'data': (10000, 100)} | +| broadcast_logical_or | 0.0093 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| sample_normal | 304.5372 | --- | 8388.6084 | {'mu': [2.0, 2.5], 'shape': (1024, 1024), 'sigma': [1.0, 3.7]} | +| sample_normal | 2.8403 | --- | 80.0 | {'mu': [2.0, 2.5], 'shape': (10000, 1), 'sigma': [1.0, 3.7]} | +| sample_normal | 284.6853 | --- | 8000.0 | {'mu': [2.0, 2.5], 'shape': (10000, 100), 'sigma': [1.0, 3.7]} | +| broadcast_minimum | 0.0073 | 0.0073 | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| arctan | 10.4997 | 6.4532 | 2097.1521 | {'data': (1024, 1024)} | +| arctan | 0.1269 | 0.0683 | 40.0 | {'data': (10000, 1)} | +| arctan | 10.1779 | 6.1741 | 2000.0 | {'data': (10000, 100)} | +| broadcast_mod | 0.0131 | 0.0127 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| size_array | 0.0056 | --- | 0.008 | {'data': (1024, 1024)} | +| size_array | 0.005 | --- | 0.008 | {'data': (10000, 1)} | +| size_array | 0.0081 | --- | 0.004 | {'data': (10000, 100)} | +| make_loss | 0.4874 | --- | 4194.3042 | {'data': (1024, 1024)} | +| make_loss | 0.013 | --- | 40.0 | {'data': (10000, 1)} | +| make_loss | 0.3483 | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_greater | 0.0082 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| gammaln | 49.6217 | 105.7931 | 2097.1521 | {'data': (1024, 1024)} | +| gammaln | 0.4789 | 0.9577 | 40.0 | {'data': (10000, 1)} | +| gammaln | 48.474 | 102.211 | 4000.0 | {'data': (10000, 100)} | +| broadcast_lesser | 0.0084 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| max_axis | 30.1487 | --- | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| max_axis | 0.3101 | --- | 0.004 | {'data': (10000, 1), 'axis': 0} | +| max_axis | 29.4315 | --- | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| degrees | 3.659 | 4.2964 | 2097.1521 | {'data': (1024, 1024)} | +| degrees | 0.0595 | 0.0538 | 20.0 | {'data': (10000, 1)} | +| degrees | 3.8676 | 4.1255 | 4000.0 | {'data': (10000, 100)} | +| sinh | 8.9259 | 10.3014 | 2097.1521 | {'data': (1024, 1024)} | +| sinh | 0.0989 | 0.1048 | 40.0 | {'data': (10000, 1)} | +| sinh | 8.4579 | 9.7402 | 2000.0 | {'data': (10000, 100)} | +| zeros_like | 2.4764 | --- | 4194.3042 | {'data': (1024, 1024)} | +| zeros_like | 0.0056 | --- | 40.0 | {'data': (10000, 1)} | +| zeros_like | 2.3254 | --- | 4000.0 | {'data': (10000, 100)} | +| arccosh | 6.8035 | 7.7818 | 2097.1521 | {'data': (1024, 1024)} | +| arccosh | 0.0764 | 0.0847 | 40.0 | {'data': (10000, 1)} | +| arccosh | 6.444 | 7.5842 | 2000.0 | {'data': (10000, 100)} | +| prod | 28.2885 | 55.9765 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| prod | 0.2996 | 0.5213 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| prod | 26.9891 | 54.6354 | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| random_gamma | 247.5786 | --- | 2097.1521 | {'shape': (1024, 1024)} | +| random_gamma | 2.3986 | --- | 40.0 | {'shape': (10000, 1)} | +| random_gamma | 237.5963 | --- | 2000.0 | {'shape': (10000, 100)} | +| broadcast_minus | --- | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| Flatten | 0.3339 | --- | 4194.3042 | {'data': 
(1024, 1024)} | +| Flatten | 0.0152 | --- | 40.0 | {'data': (10000, 1)} | +| Flatten | 0.3546 | --- | 4000.0 | {'data': (10000, 100)} | +| expm1 | 9.8241 | 11.7609 | 4194.3042 | {'data': (1024, 1024)} | +| expm1 | 0.1844 | 0.1675 | 40.0 | {'data': (10000, 1)} | +| expm1 | 9.0366 | 10.4387 | 4000.0 | {'data': (10000, 100)} | +| elemwise_div | 0.0064 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| LeakyReLU | 10.3625 | 12.5441 | 4194.3042 | {'data': (1024, 1024), 'act_type': 'leaky', 'slope': 0.1} | +| LeakyReLU | 0.1076 | 0.1277 | 40.0 | {'data': (10000, 1), 'act_type': 'leaky', 'slope': 0.1} | +| LeakyReLU | 9.5913 | 11.7957 | 2000.0 | {'data': (10000, 100), 'act_type': 'leaky', 'slope': 0.1} | +| LeakyReLU | 12.337 | 12.6383 | 2097.1521 | {'data': (1024, 1024), 'act_type': 'elu', 'slope': 0.1} | +| LeakyReLU | 0.1305 | 0.1217 | 40.0 | {'data': (10000, 1), 'act_type': 'elu', 'slope': 0.1} | +| LeakyReLU | 11.652 | 11.8465 | 4000.0 | {'data': (10000, 100), 'act_type': 'elu', 'slope': 0.1} | +| LeakyReLU | 12.4973 | 11.4957 | 2097.1521 | {'data': (1024, 1024), 'act_type': 'selu'} | +| LeakyReLU | 0.1295 | 0.1176 | 40.0 | {'data': (10000, 1), 'act_type': 'selu'} | +| LeakyReLU | 12.2224 | 11.548 | 4000.0 | {'data': (10000, 100), 'act_type': 'selu'} | +| LeakyReLU | 16.9543 | 306.6579 | 2097.1521 | {'data': (1024, 1024), 'act_type': 'prelu', 'gamma': (1, 1024)} | +| LeakyReLU | 0.2859 | 1.9528 | 20.0 | {'data': (10000, 1), 'act_type': 'prelu', 'gamma': (1, 1)} | +| LeakyReLU | 16.0125 | 231.8273 | 2000.0 | {'data': (10000, 100), 'act_type': 'prelu', 'gamma': (1, 100)} | +| rint | 14.9397 | --- | 4194.3042 | {'data': (1024, 1024)} | +| rint | 0.1535 | --- | 40.0 | {'data': (10000, 1)} | +| rint | 14.5915 | --- | 4000.0 | {'data': (10000, 100)} | +| identity | --- | --- | 4194.3042 | {'data': (1024, 1024)} | +| identity | --- | --- | 40.0 | {'data': (10000, 1)} | +| identity | --- | --- | 4000.0 | {'data': (10000, 100)} | +| softsign | 3.9985 | 7.05 | 2097.1521 | {'data': (1024, 1024)} | +| softsign | 0.0486 | 0.0737 | 40.0 | {'data': (10000, 1)} | +| softsign | 3.7662 | 6.7975 | 2000.0 | {'data': (10000, 100)} | +| broadcast_div | 0.0083 | 0.0075 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| square | 4.2037 | 4.9639 | 2097.1521 | {'data': (1024, 1024)} | +| square | 0.0467 | 0.0558 | 40.0 | {'data': (10000, 1)} | +| square | 3.9986 | 4.6533 | 2000.0 | {'data': (10000, 100)} | +| elemwise_sub | 0.0058 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| dot | 14.562 | 29.1605 | 4194.3042 | {'lhs': (1024, 1024), 'rhs': (1024, 1024)} | +| dot | 0.745 | 1.5842 | 2000.0 | {'lhs': (1000, 10), 'rhs': (1000, 10), 'transpose_b': True} | +| dot | 0.0579 | 0.1673 | 0.2 | {'lhs': (1000, 1), 'rhs': (100, 1000), 'transpose_a': True, 'transpose_b': True} | +| broadcast_logical_and | 0.0071 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| broadcast_add | 0.0081 | 0.0066 | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| random_exponential | 63.2732 | --- | 4194.3042 | {'shape': (1024, 1024)} | +| random_exponential | 0.6453 | --- | 40.0 | {'shape': (10000, 1)} | +| random_exponential | 59.2788 | --- | 2000.0 | {'shape': (10000, 100)} | +| Dropout | 249.4661 | 23.5141 | 37748.7344 | 
{'data': (32, 3, 256, 256), 'p': 0.5, 'mode': 'always'} | +| Dropout | 3.9634 | 0.3516 | 600.0 | {'data': (10000, 10), 'p': 0.5, 'mode': 'always'} | +| exp | 8.9413 | --- | 4194.3042 | {'data': (1024, 1024)} | +| exp | 0.0971 | --- | 40.0 | {'data': (10000, 1)} | +| exp | 7.9211 | --- | 4000.0 | {'data': (10000, 100)} | +| random_generalized_negative_binomial | 362.7789 | --- | 2097.1521 | {'shape': (1024, 1024)} | +| random_generalized_negative_binomial | 3.4276 | --- | 40.0 | {'shape': (10000, 1)} | +| random_generalized_negative_binomial | 344.3516 | --- | 4000.0 | {'shape': (10000, 100)} | +| min | 30.8723 | 55.9413 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| min | 0.3168 | 0.5206 | 0.002 | {'data': (10000, 1), 'axis': 0} | +| min | 29.9547 | 53.8245 | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| erfinv | 79.987 | 99.2274 | 2097.1521 | {'data': (1024, 1024)} | +| erfinv | 0.7567 | 0.9105 | 40.0 | {'data': (10000, 1)} | +| erfinv | 76.0479 | 95.5001 | 2000.0 | {'data': (10000, 100)} | +| broadcast_plus | --- | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| arcsin | 16.3157 | 7.6156 | 2097.1521 | {'data': (1024, 1024)} | +| arcsin | 0.1611 | 0.0758 | 40.0 | {'data': (10000, 1)} | +| arcsin | 16.0225 | 7.5081 | 2000.0 | {'data': (10000, 100)} | +| sample_generalized_negative_binomial | 629.1785 | --- | 8388.6084 | {'mu': [2.0, 2.5], 'shape': (1024, 1024), 'alpha': [0.0, 2.5]} | +| sample_generalized_negative_binomial | 6.8681 | --- | 80.0 | {'mu': [2.0, 2.5], 'shape': (10000, 1), 'alpha': [0.0, 2.5]} | +| sample_generalized_negative_binomial | 604.3484 | --- | 8000.0 | {'mu': [2.0, 2.5], 'shape': (10000, 100), 'alpha': [0.0, 2.5]} | +| relu | 11.0979 | 8.3262 | 2097.1521 | {'data': (1024, 1024)} | +| relu | 0.1163 | 0.0853 | 40.0 | {'data': (10000, 1)} | +| relu | 10.6863 | 8.0702 | 4000.0 | {'data': (10000, 100)} | +| cbrt | 11.3121 | 6.5254 | 2097.1521 | {'data': (1024, 1024)} | +| cbrt | 0.1238 | 0.0687 | 40.0 | {'data': (10000, 1)} | +| cbrt | 10.4631 | 6.0997 | 2000.0 | {'data': (10000, 100)} | +| sample_uniform | 89.1332 | --- | 8388.6084 | {'low': [0.0, 2.5], 'shape': (1024, 1024), 'high': [1.0, 3.7]} | +| sample_uniform | 0.8895 | --- | 80.0 | {'low': [0.0, 2.5], 'shape': (10000, 1), 'high': [1.0, 3.7]} | +| sample_uniform | 84.4477 | --- | 8000.0 | {'low': [0.0, 2.5], 'shape': (10000, 100), 'high': [1.0, 3.7]} | +| Convolution | 13.4072 | 17.0238 | 56610.418 | {'data': (32, 3, 256), 'weight': (64, 3, 3), 'bias': (64,), 'kernel': (3,), 'stride': (1,), 'dilate': (1,), 'pad': (0,), 'num_filter': 64, 'layout': 'NCW'} | +| sample_poisson | 512.1068 | --- | 8388.6084 | {'lam': [1.0, 8.5], 'shape': (1024, 1024)} | +| sample_poisson | 4.6203 | --- | 80.0 | {'lam': [1.0, 8.5], 'shape': (10000, 1)} | +| sample_poisson | 474.1238 | --- | 8000.0 | {'lam': [1.0, 8.5], 'shape': (10000, 100)} | +| log_softmax | 21.4413 | 15.7456 | 2097.1521 | {'data': (1024, 1024), 'axis': -1, 'temperature': 0.5} | +| log_softmax | 0.4613 | 0.2958 | 20.0 | {'data': (10000, 1), 'axis': -1, 'temperature': 0.5} | +| log_softmax | 21.9745 | 15.2407 | 4000.0 | {'data': (10000, 100), 'axis': -1, 'temperature': 0.5} | +| gamma | 35.1027 | 124.2015 | 2097.1521 | {'data': (1024, 1024)} | +| gamma | 0.3611 | 1.1177 | 20.0 | {'data': (10000, 1)} | +| gamma | 33.636 | 117.6889 | 2000.0 | {'data': (10000, 100)} | +| reciprocal | 3.4646 | 6.1106 | 2097.1521 | {'data': (1024, 1024)} | +| reciprocal | 0.0413 | 0.0635 | 40.0 | {'data': (10000, 1)} 
|
+| reciprocal | 3.2553 | 5.8762 | 2000.0 | {'data': (10000, 100)} |
+| sigmoid | 9.8017 | 5.9639 | 2097.1521 | {'data': (1024, 1024)} |
+| sigmoid | 0.1095 | 0.0651 | 40.0 | {'data': (10000, 1)} |
+| sigmoid | 9.0443 | 5.7901 | 2000.0 | {'data': (10000, 100)} |
\ No newline at end of file
diff --git a/benchmark/opperf/rules/__init__.py b/benchmark/opperf/rules/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/rules/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py
new file mode 100644
index 000000000000..59b2aff53570
--- /dev/null
+++ b/benchmark/opperf/rules/default_params.py
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Default Input Tensor shapes to use for benchmarking"""
+
+# For Unary operators like abs, arccos, arcsin etc..
+DEFAULT_DATA = [(1024, 1024), (10000, 1), (10000, 100)]
+
+# For Binary broadcast operators like - broadcast_add/sub/mod/logical_and etc..
+DEFAULT_LHS = [[(1024, 1024), (10000, 10), (10000, 1)]]
+DEFAULT_RHS = [[(1024, 1024), (10000, 10), (10000, 1)]]
+
+# For operators like - random_uniform, random_normal etc..
+DEFAULT_SHAPE = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_LOW = [0]
+DEFAULT_HIGH = [5]
+DEFAULT_K = [1]
+DEFAULT_P = [1]
+
+# For operators like - sample_uniform, sample_normal etc..
+# NOTE: There are many overlapping operators in random_* and sample_*,
+# Ex: random_uniform, sample_uniform. Parameter names are the same, but, for
+# random_* operators they are float/int and for sample_* operators they are NDArray.
+# Hence, below we append ND to mark the difference.
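+#
+# For example (illustrative calls, values taken from the defaults in this file):
+#   mx.nd.random_uniform(low=0, high=5, shape=(1024, 1024))      # scalar low/high
+#   mx.nd.sample_uniform(low=mx.nd.array([0.0, 2.5]),
+#                        high=mx.nd.array([1.0, 3.7]))           # NDArray low/high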
+DEFAULT_LOW_ND = [[0.0, 2.5]]
+DEFAULT_HIGH_ND = [[1.0, 3.7]]
+DEFAULT_MU_ND = [[2.0, 2.5]]
+DEFAULT_SIGMA = [[1.0, 3.7]]
+DEFAULT_ALPHA_ND = [[0.0, 2.5]]
+DEFAULT_BETA_ND = [[1.0, 0.7]]
+DEFAULT_LAM = [[1.0, 8.5]]
+DEFAULT_K_ND = [[20, 49]]
+DEFAULT_P_ND = [[0.4, 0.77]]
+
+# For reduction operators
+# NOTE: Data used is DEFAULT_DATA
+DEFAULT_AXIS = [(), 0, (0, 1)]
+
+# Default Inputs. MXNet Op Param Name to Default Input mapping
+DEFAULTS_INPUTS = {"data": DEFAULT_DATA,
+                   "lhs": DEFAULT_LHS,
+                   "rhs": DEFAULT_RHS,
+                   "shape": DEFAULT_SHAPE,
+                   "low": DEFAULT_LOW,
+                   "high": DEFAULT_HIGH,
+                   "low_nd": DEFAULT_LOW_ND,
+                   "high_nd": DEFAULT_HIGH_ND,
+                   "mu_nd": DEFAULT_MU_ND,
+                   "sigma": DEFAULT_SIGMA,
+                   "alpha_nd": DEFAULT_ALPHA_ND,
+                   "beta_nd": DEFAULT_BETA_ND,
+                   "lam_nd": DEFAULT_LAM,
+                   "k": DEFAULT_K,
+                   "p": DEFAULT_P,
+                   "k_nd": DEFAULT_K_ND,
+                   "p_nd": DEFAULT_P_ND,
+                   "axis": DEFAULT_AXIS}
+
+# These are names of MXNet operator parameters that are of type NDArray.
+# We maintain this list to automatically recognize such parameters, so that
+# user inputs like a shape tuple, a Numpy array, or a list can be translated
+# to an MXNet NDArray. This is just a convenience for benchmark utility users,
+# who can just give the shape of a tensor and we automatically create the Tensor.
+PARAMS_OF_TYPE_NDARRAY = ["lhs", "rhs", "data", "base", "exp",
+                          "mu", "sigma", "lam", "alpha", "beta", "gamma", "k", "p",
+                          "low", "high", "weight", "bias", "moving_mean", "moving_var"]
diff --git a/benchmark/opperf/utils/__init__.py b/benchmark/opperf/utils/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/utils/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
new file mode 100644
index 000000000000..dc4890b3df0f
--- /dev/null
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
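+
+"""Utilities to prepare operator inputs and run operator benchmarks.
+
+Illustrative usage (shapes and values are arbitrary; see run_performance_test
+below for details):
+
+    import mxnet as mx
+    from benchmark.opperf.utils.benchmark_utils import run_performance_test
+
+    run_performance_test(mx.nd.add,
+                         inputs=[{"lhs": (1024, 1024), "rhs": (1024, 1024)}],
+                         run_backward=True, warmup=10, runs=50)
+"""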
+ +import logging + +import mxnet as mx +from mxnet import nd + +from .ndarray_utils import get_mx_ndarray, nd_forward_and_profile, nd_forward_backward_and_profile +from .common_utils import merge_map_list +from .op_registry_utils import prepare_op_inputs +from benchmark.opperf.rules.default_params import PARAMS_OF_TYPE_NDARRAY + + +def _prepare_op_inputs(inputs, run_backward, dtype, ctx): + kwargs_list = [] + + for inp in inputs: + kwargs = {} + for key, value in inp.items(): + if key in PARAMS_OF_TYPE_NDARRAY: + kwargs[key] = get_mx_ndarray(ctx=ctx, in_tensor=value, + dtype=dtype, + initializer=nd.normal, + attach_grad=run_backward) + else: + kwargs[key] = value + kwargs_list.append(kwargs) + + return kwargs_list + + +def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list): + if run_backward: + benchmark_helper_func = nd_forward_backward_and_profile + else: + benchmark_helper_func = nd_forward_and_profile + + # Warm up, ignore the profiler output + _, _ = benchmark_helper_func(op, warmup, **kwargs_list[0]) + + # Run Benchmarks + op_benchmark_result = {op.__name__: []} + logging.info(f"Begin Benchmark - {op.__name__}") + for idx, kwargs in enumerate(kwargs_list): + _, profiler_output = benchmark_helper_func(op, runs, **kwargs) + + # Add inputs used for profiling this operator into result + profiler_output["inputs"] = inputs[idx] + op_benchmark_result[op.__name__].append(profiler_output) + logging.info(f"Complete Benchmark - {op.__name__}") + return op_benchmark_result + + +def run_performance_test(ops, inputs, run_backward=True, + dtype='float32', ctx=mx.cpu(), + warmup=10, runs=50): + """Run operator benchmark for given operator or list of operators, ops, with the given inputs. + + Returns benchmark results as a list of dictionary where each dictionary represents benchmarks result per operator. + key -> name of the operator and value -> map of results (forward time, backward time, time spent in memory + operations. + + Parameters + ---------- + ops: [Str] + One or list of operators to benchmark. Should be an NDArray operator. + inputs: map + Inputs for operator. Key should be name of parameter for operator. + Example: inputs = {"lhs": (1024, 1024), "rhs": (1024, 1024)} for mx.nd.add + run_backward: Boolean, Default is True + Should we have backward operator benchmarks. + dtype: Str, default 'float32' + Precision to use for input tensors. Defaults to float32. Example: 'float32', 'int64' + ctx: mx.ctx, default mx.cpu() + Context to use for benchmarks. Default to mx.cpu() + warmup: int, default 10 + Number of warmup runs + runs: int, default 50 + Number of runs for capturing benchmark results + + Returns + ------- + List of dictionary of benchmark results. key -> name of the operator, Value is benchmark results. + + """ + kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx) + + if not isinstance(ops, list): + ops = [ops] + + op_benchmark_result = [] + for op in ops: + if hasattr(mx.nd, op.__name__): + benchmark_result = _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list) + else: + raise ValueError("Unknown NDArray operator provided to benchmark. 
- ", op.__name__) + op_benchmark_result.append(benchmark_result) + return op_benchmark_result + + +def run_op_benchmarks(ops, dtype, ctx, warmup, runs): + # For each operator, run benchmarks + mx_op_benchmark_results = [] + for _, op_params in ops.items(): + # Prepare inputs for the operator + inputs = prepare_op_inputs(op_params) + # Run benchmarks + cur_op_res = run_performance_test(op_params["nd_op_handle"], + run_backward=op_params["has_backward"], + dtype=dtype, ctx=ctx, + inputs=inputs, + warmup=warmup, runs=runs) + mx_op_benchmark_results += cur_op_res + + # Prepare combined results for all operators + mx_op_benchmark_results = merge_map_list(mx_op_benchmark_results) + return mx_op_benchmark_results diff --git a/benchmark/opperf/utils/common_utils.py b/benchmark/opperf/utils/common_utils.py new file mode 100644 index 000000000000..9fe2e19b13b3 --- /dev/null +++ b/benchmark/opperf/utils/common_utils.py @@ -0,0 +1,120 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import json +from operator import itemgetter + +from collections import ChainMap + +import logging +logging.basicConfig(level=logging.INFO) + + +def merge_map_list(map_list): + """Merge all the Map in map_list into one final Map. + + Useful when you have a list of benchmark result maps and you want to + prepare one final map combining all results. + + Parameters + ---------- + map_list: List[maps] + List of maps to be merged. + + Returns + ------- + map where all individual maps in the into map_list are merged + + """ + return dict(ChainMap(*map_list)) + + +def save_to_file(inp_dict, out_filepath, out_format='json'): + """Saves the given input dictionary to the given output file. + + By default, saves the input dictionary as JSON file. Other supported formats include: + 1. md + + Parameters + ---------- + inp_dict: map + Input dictionary to be saved + out_filepath: str + Output file path + out_format: str, default 'json' + Format of the output file. Supported options - 'json', 'md'. Default - json. + + """ + if out_format == 'json': + # Save as JSON + with open(out_filepath, "w") as result_file: + json.dump(inp_dict, result_file, indent=4, sort_keys=True) + elif out_format == 'md': + # Save as md + with open(out_filepath, "w") as result_file: + result_file.write(_prepare_markdown(inp_dict)) + else: + raise ValueError("Invalid output file format provided - '{}'. Supported - json, md".format(format)) + + +def get_json(inp_dict): + """Converts a given dictionary to prettified JSON string. + + Parameters + ---------- + inp_dict: map + Input dictionary to be converted to JSON. 
+ + Returns + ------- + Prettified JSON string + + """ + return json.dumps(inp_dict, indent=4) + + +def _prepare_op_benchmark_result(op, op_bench_result): + operator_name = op + avg_forward_time = "---" + avg_backward_time = "---" + max_mem_usage = "---" + inputs = "---" + for key, value in op_bench_result.items(): + if "avg_time_forward" in key: + avg_forward_time = value + elif "avg_time_backward" in key: + avg_backward_time = value + elif "max_storage_mem_alloc_" in key: + max_mem_usage = value + elif "inputs" in key: + inputs = value + return "| {} | {} | {} | {} | {} |".format(operator_name, avg_forward_time, avg_backward_time, + max_mem_usage, inputs) + + +def _prepare_markdown(results): + results_markdown = [ + "| Operator | Avg Forward Time (ms) | Avg. Backward Time (ms) | Max Mem Usage (Storage) (Bytes)" + " | Inputs |", + "| :---: | :---: | :---: | :---:| :--- |"] + + for op, op_bench_results in sorted(results.items(), key=itemgetter(0)): + for op_bench_result in op_bench_results: + results_markdown.append(_prepare_op_benchmark_result(op, op_bench_result)) + + return os.linesep.join(results_markdown) diff --git a/benchmark/opperf/utils/ndarray_utils.py b/benchmark/opperf/utils/ndarray_utils.py new file mode 100644 index 000000000000..7ed2fa107066 --- /dev/null +++ b/benchmark/opperf/utils/ndarray_utils.py @@ -0,0 +1,127 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import mxnet as mx +import mxnet.ndarray as nd + +from .profiler_utils import profile + + +@profile +def nd_forward_backward_and_profile(op, runs, *args, **kwargs): + """Helper function to run a given NDArray operator (op) for 'runs' number of times with + given args and kwargs. Executes both forward and backward pass. + + NOTE: This is a sync call and waits for all the operations execution to complete. + + Parameters + ---------- + op: Str + NDArray operator (Function reference) to execute. Example: mx.nd.add + runs: int + Number of times to execute the operation + args: + Arguments for the NDArray operator (op) being executed. + kwargs: + Key value arguments for the NDArray operator (op) being executed. + + Returns + ------- + any results from NDArray operation execution + + """ + for _ in range(runs): + with mx.autograd.record(): + res = op(*args, **kwargs) + res.backward() + nd.waitall() + return res + + +@profile +def nd_forward_and_profile(op, runs, *args, **kwargs): + """Helper function to run a given NDArray operator (op) for 'runs' number of times with + given args and kwargs. Executes ONLY forward pass. + + NOTE: This is a sync call and waits for all the operations execution to complete. + + Parameters + ---------- + op: Str + NDArray operator (Function reference) to execute. 
Example: mx.nd.add
+    runs: int
+        Number of times to execute the operation
+    args:
+        Arguments for the NDArray operator (op) being executed.
+    kwargs:
+        Key value arguments for the NDArray operator (op) being executed.
+
+    Returns
+    -------
+    any results from NDArray operation execution
+    """
+    for _ in range(runs):
+        res = op(*args, **kwargs)
+        nd.waitall()
+    return res
+
+
+def get_mx_ndarray(ctx, in_tensor, dtype, initializer, attach_grad=True):
+    """Helper function to prepare an MXNet NDArray tensor in given Context (ctx) of type (dtype) with given
+    initializer. You can get a new Tensor by providing only "Shape" or "Numpy NDArray" or another MXNet NDArray as
+    "in_tensor".
+
+    NOTE: This is a sync call and waits for the Tensor to be created.
+
+    Parameters
+    ----------
+    ctx: mx.ctx, default mx.cpu()
+        Context of the new MXNet NDArray Tensor.
+    in_tensor: Numpy NDArray or MXNet NDArray or Tuple of shape
+        Can be a tuple of shape or Numpy NDArray or MXNet NDArray.
+    dtype: str
+        Precision or Dtype of the expected Tensor. Ex: "float32", "int64"
+    initializer:
+        Function reference to the initializer to use. Ex: mx.nd.random.normal, mx.nd.zeros
+    attach_grad: Boolean, default True
+        To attach a gradient for the Tensor. Default is True.
+
+    Returns
+    -------
+    MXNet NDArray Tensor.
+    """
+    if isinstance(in_tensor, int) or isinstance(in_tensor, float):
+        return in_tensor
+
+    if isinstance(in_tensor, tuple):
+        tensor = initializer(ctx=ctx, shape=in_tensor, dtype=dtype)
+    elif isinstance(in_tensor, list):
+        tensor = nd.array(in_tensor, ctx=ctx, dtype=dtype)
+    elif isinstance(in_tensor, np.ndarray):
+        tensor = nd.array(in_tensor, ctx=ctx, dtype=dtype)
+    elif isinstance(in_tensor, mx.nd.NDArray):
+        tensor = in_tensor.as_in_context(ctx).astype(dtype=dtype)
+    else:
+        raise ValueError("Invalid input type for creating input tensor. Input can be tuple() of shape or Numpy Array or"
+                         " MXNet NDArray. Given - ", in_tensor)
+
+    if attach_grad:
+        tensor.attach_grad()
+
+    tensor.wait_to_read()
+    return tensor
diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py
new file mode 100644
index 000000000000..f5b47cb7a9e4
--- /dev/null
+++ b/benchmark/opperf/utils/op_registry_utils.py
@@ -0,0 +1,331 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Utilities to interact with MXNet operator registry."""
+import ctypes
+import sys
+from mxnet.base import _LIB, check_call, py_str, OpHandle, c_str, mx_uint
+
+from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS
+
+# We will use all operators inside NDArray Module
+mx_nd_module = sys.modules["mxnet.ndarray.op"]
+
+# Operators whose parameters have special criteria that cannot be cleanly automated.
+# Example: sample_multinomial operator has a parameter 'data'.
It expects values to sum up to 1.
+unique_ops = ("sample_multinomial",)
+
+
+def _select_ops(operator_names, filters=("_contrib", "_"), merge_op_forward_backward=True):
+    """From a given list of operators, filter out all operator names starting with the given filters and prepare
+    a dictionary of operators with attributes - 'has_backward' and 'nd_op_handle = mxnet.ndarray.op'
+
+    By default, merge forward and backward operators for a given op into one operator and set the attribute
+    'has_backward' for the operator.
+
+    By default, filter out all Contrib operators that start with '_contrib' and internal operators that
+    start with '_'.
+
+    Parameters
+    ----------
+    operator_names: List[str]
+        List of operator names.
+    filters: Tuple(str)
+        Tuple of filters to apply on operator names.
+    merge_op_forward_backward: Boolean, Default - True
+        Merge forward and backward operators for a given op into one op.
+
+    Returns
+    -------
+    {"operator_name": {"has_backward", "nd_op_handle"}}
+    """
+    mx_operators = {}
+    operators_with_backward = []
+
+    if merge_op_forward_backward:
+        filters += ("_backward",)
+
+    for cur_op_name in operator_names:
+        if not cur_op_name.startswith(filters):
+            mx_operators[cur_op_name] = {"has_backward": False,
+                                         "nd_op_handle": getattr(mx_nd_module, cur_op_name)}
+
+        if cur_op_name.startswith("_backward_"):
+            operators_with_backward.append(cur_op_name)
+
+    if merge_op_forward_backward:
+        # Identify all operators that can run backward.
+        for op_with_backward in operators_with_backward:
+            op_name = op_with_backward.split("_backward_")[1]
+            if op_name in mx_operators:
+                mx_operators[op_name]["has_backward"] = True
+
+    return mx_operators
+
+
+def _get_all_registered_ops():
+    """Get all registered MXNet operator names.
+
+    Returns
+    -------
+    ["operator_name"]
+    """
+    plist = ctypes.POINTER(ctypes.c_char_p)()
+    size = ctypes.c_uint()
+
+    check_call(_LIB.MXListAllOpNames(ctypes.byref(size),
+                                     ctypes.byref(plist)))
+
+    mx_registered_operator_names = [py_str(plist[i]) for i in range(size.value)]
+    return mx_registered_operator_names
+
+
+def _get_op_handles(op_name):
+    """Get handle for an operator with given name - op_name.
+
+    Parameters
+    ----------
+    op_name: str
+        Name of operator to get handle for.
+    """
+    op_handle = OpHandle()
+    check_call(_LIB.NNGetOpHandle(c_str(op_name), ctypes.byref(op_handle)))
+    return op_handle
+
+
+def _get_op_arguments(op_handle):
+    """Given operator name and handle, fetch operator arguments - number of arguments,
+    argument names, argument types.
+
+    Parameters
+    ----------
+    op_handle: OpHandle
+        Handle for the operator
+
+    Returns
+    -------
+    (narg, arg_names, arg_types)
+    """
+    real_name = ctypes.c_char_p()
+    desc = ctypes.c_char_p()
+    num_args = mx_uint()
+    arg_names = ctypes.POINTER(ctypes.c_char_p)()
+    arg_types = ctypes.POINTER(ctypes.c_char_p)()
+    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+    key_var_num_args = ctypes.c_char_p()
+    ret_type = ctypes.c_char_p()
+
+    check_call(_LIB.MXSymbolGetAtomicSymbolInfo(
+        op_handle, ctypes.byref(real_name), ctypes.byref(desc),
+        ctypes.byref(num_args),
+        ctypes.byref(arg_names),
+        ctypes.byref(arg_types),
+        ctypes.byref(arg_descs),
+        ctypes.byref(key_var_num_args),
+        ctypes.byref(ret_type)))
+
+    narg = int(num_args.value)
+    arg_names = [py_str(arg_names[i]) for i in range(narg)]
+    arg_types = [py_str(arg_types[i]) for i in range(narg)]
+
+    return narg, arg_names, arg_types
+
+
+def _set_op_arguments(mx_operators):
+    """Fetch and set operator arguments - nargs, arg_names, arg_types
+    """
+    for op_name in mx_operators:
+        op_handle = _get_op_handles(op_name)
+        narg, arg_names, arg_types = _get_op_arguments(op_handle)
+        mx_operators[op_name]["params"] = {"narg": narg,
+                                           "arg_names": arg_names,
+                                           "arg_types": arg_types}
+
+
+def _get_all_mxnet_operators():
+    # Step 1 - Get all registered op names and filter it
+    operator_names = _get_all_registered_ops()
+    mx_operators = _select_ops(operator_names)
+
+    # Step 2 - Get all parameters for the operators
+    _set_op_arguments(mx_operators)
+    return mx_operators
+
+
+def prepare_op_inputs(arg_params):
+    inputs = []
+
+    # Prepare op to default input mapping
+    arg_values = {}
+    for arg_name, arg_type in zip(arg_params["params"]["arg_names"],
+                                  arg_params["params"]["arg_types"]):
+        if "NDArray" in arg_type and arg_name + "_nd" in DEFAULTS_INPUTS:
+            arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_nd"]
+        elif arg_name in DEFAULTS_INPUTS:
+            arg_values[arg_name] = DEFAULTS_INPUTS[arg_name]
+        elif "float" in arg_type and arg_name + "_float" in DEFAULTS_INPUTS:
+            arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_float"]
+
+    # Number of different inputs we want to use to test
+    # the operator
+    num_input_combinations = max([len(value) for value in arg_values.values()])
+
+    # Prepare key/value args for param to input value
+    for idx in range(num_input_combinations):
+        inp = {}
+        for arg_name in arg_params["params"]["arg_names"]:
+            if arg_name in arg_values:
+                if len(arg_values[arg_name]) == num_input_combinations:
+                    inp[arg_name] = arg_values[arg_name][idx]
+                else:
+                    # This is required when we want to use the same param across all
+                    # input combinations. Example: keeping low and high the same for the
+                    # random sampling operator across all the different tensor shapes.
+                    inp[arg_name] = arg_values[arg_name][0]
+
+        inputs.append(inp)
+    return inputs
+
+
+def get_all_unary_operators():
+    """Gets all Unary operators registered with MXNet.
+ + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for unary broadcast operators + unary_broadcast_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_params["params"]["narg"] == 1 and \ + "data" in op_params["params"]["arg_names"]: + unary_broadcast_mx_operators[op_name] = mx_operators[op_name] + return unary_broadcast_mx_operators + + +def get_all_broadcast_binary_operators(): + """Gets all binary broadcast operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for binary broadcast operators + binary_broadcast_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_name.startswith("broadcast_") and op_params["params"]["narg"] == 2 and \ + "lhs" in op_params["params"]["arg_names"] and \ + "rhs" in op_params["params"]["arg_names"]: + binary_broadcast_mx_operators[op_name] = mx_operators[op_name] + return binary_broadcast_mx_operators + + +def get_all_elemen_wise_binary_operators(): + """Gets all binary elemen_wise operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for binary elemen_wise operators + binary_elemen_wise_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_name.startswith("elemwise_") and op_params["params"]["narg"] == 2 and \ + "lhs" in op_params["params"]["arg_names"] and \ + "rhs" in op_params["params"]["arg_names"]: + binary_elemen_wise_mx_operators[op_name] = mx_operators[op_name] + return binary_elemen_wise_mx_operators + + +def get_all_random_sampling_operators(): + """Gets all Random Sampling operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for Random Sampling operators + random_sampling_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_name.startswith(("random_", "sample_")) and op_name not in unique_ops: + random_sampling_mx_operators[op_name] = mx_operators[op_name] + return random_sampling_mx_operators + + +def get_all_reduction_operators(): + """Gets all Reduction operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for Reduction operators + reduction_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_params["params"]["narg"] == 4 and \ + set(["data", "axis", "exclude", "keepdims"]).issubset(set(op_params["params"]["arg_names"])) \ + and op_name not in unique_ops: + reduction_mx_operators[op_name] = mx_operators[op_name] + return reduction_mx_operators + + +def get_operators_with_no_benchmark(operators_with_benchmark): + """Gets all MXNet operators with not benchmark. + + Retrieve all operators registered with MXNet and prepares a list of operators that are not part of given + operators with benchmark list. 
+
+    Parameters
+    ----------
+    operators_with_benchmark: list[str]
+        List of operator names that have benchmarks
+
+    Returns
+    -------
+    list[str]
+        List of operator names that are registered with MXNet but have no benchmarks.
+    """
+    all_mxnet_operators = _get_all_mxnet_operators().keys()
+    return list(set(all_mxnet_operators) - set(operators_with_benchmark))
diff --git a/benchmark/opperf/utils/profiler_utils.py b/benchmark/opperf/utils/profiler_utils.py
new file mode 100644
index 000000000000..a434d3be1e5c
--- /dev/null
+++ b/benchmark/opperf/utils/profiler_utils.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import functools
+
+from .common_utils import merge_map_list
+from mxnet import profiler
+
+"""
+TODO: Below we are parsing the MXNet profiler output string to fetch the
+benchmark results. Note that this is a temporary solution until we add new
+utility APIs, get_summary() and reset(), to the MXNet profiler. All the
+parsing logic below should be removed once these read APIs are available in
+Profiler.
+"""
+
+
+def _get_memory_profile(memory_profile_results):
+    memory_profile = {}
+    for line in memory_profile_results:
+        if line.startswith("Memory:"):
+            device_id = line.split()[1]
+            avg_time_memory_alloc = float(line.split()[-1])
+            memory_profile["max_storage_mem_alloc_" + device_id] = avg_time_memory_alloc
+
+    return memory_profile
+
+
+def _get_operator_profile(operator_name, operator_profile_results):
+    operator_profile = {}
+    for line in operator_profile_results:
+        if operator_name in line or operator_name[:3] + " " in line:
+            operation = line.split()[0]
+            operation_avg_time = float(line.split()[-1])
+            if "_backward" in operation:
+                operator_profile["avg_time" + operation] = operation_avg_time
+            else:
+                operator_profile["avg_time_forward_" + operation] = operation_avg_time
+
+    return operator_profile
+
+
+def parse_profiler_dump(operator_name, profiler_dump):
+    """Parse the MXNet profiler dump output, fetch Memory profile results and
+    Operator compute profiler results.
+
+    Parameters
+    ----------
+    operator_name: string
+        Name of the operator whose profile results are being parsed.
+    profiler_dump: string
+        MXNet profiler output from mx.profiler.dumps() API.
+
+    Returns
+    -------
+    map of Memory and Compute profiler results.
+
+    """
+    if not profiler_dump:
+        raise AssertionError("Invalid MXNet profiler output provided to parse!")
+
+    """
+    MXNet profiler output from mx.profiler.dumps() API looks like below. This function parses
+    this string profiler output to fetch Memory and Compute metrics.
+
+    Profile Statistics.
+        Note that counter items are counter values and not time units.
+    Device Storage
+    =================
+    Name                         Total Count   Time (ms)   Min Time (ms)   Max Time (ms)   Avg Time (ms)
+    ----                         -----------   ---------   -------------   -------------   -------------
+    Memory: cpu/0                        100   2097152.0000   1681915.8750   2097152.0000   207618.0469
+
+    MXNET_C_API
+    =================
+    Name                         Total Count   Time (ms)   Min Time (ms)   Max Time (ms)   Avg Time (ms)
+    ----                         -----------   ---------   -------------   -------------   -------------
+    MXNDArrayFree                         49      1.1220          0.0170          0.0360          0.0229
+    MXAutogradBackwardEx                  50     11.5460          0.1980          0.3360          0.2309
+    MXNet C API Calls                    399      1.9990          1.6010          1.9990          0.1990
+    MXImperativeInvokeEx                  50      4.4810          0.0700          0.1330          0.0896
+    MXNDArrayWaitAll                      50    769.0570         14.0200         24.5030         15.3811
+    MXAutogradSetIsTraining              100      0.0190          0.0000          0.0010          0.0002
+    MXAutogradSetIsRecording             100      0.0400          0.0000          0.0010          0.0004
+    MXNet C API Concurrency              798      0.0000          0.0000          0.0010          0.0005
+
+    operator
+    =================
+    Name                         Total Count   Time (ms)   Min Time (ms)   Max Time (ms)   Avg Time (ms)
+    ----                         -----------   ---------   -------------   -------------   -------------
+    DeleteVariable                       196      1.4490          0.0040          0.0250          0.0074
+    _backward_broadcast_add              100    521.2320          4.8070          8.5970          5.2123
+    SetValueOp                           100    645.8060          5.8820         10.0380          6.4581
+    broadcast_add                        100    394.8910          3.5230          5.8790          3.9489
+    """
+
+    # String Patterns to look out for when parsing
+    memory_profile_result_start = "Device Storage"  # Helps identify start of Memory profile
+    c_api_profile_result_start = "MXNET_C_API"  # Helps identify end of Memory profile
+    operator_profile_result_start = "operator"  # Helps identify start of Operator profile
+
+    memory_profile_results = []
+    operator_profile_results = []
+
+    # Parse lines corresponding to Memory and Computation profiling
+    read_memory_profile = False
+    read_operator_profile = False
+    for line in profiler_dump.splitlines():
+        if line.startswith(memory_profile_result_start):
+            read_memory_profile = True
+        elif line.startswith(operator_profile_result_start):
+            read_operator_profile = True
+        elif line.startswith(c_api_profile_result_start):
+            read_memory_profile = False
+
+        if read_memory_profile:
+            memory_profile_results.append(line)
+        elif read_operator_profile:
+            operator_profile_results.append(line)
+
+    # Prepare results
+    memory_profile = _get_memory_profile(memory_profile_results)
+    operator_profile = _get_operator_profile(operator_name, operator_profile_results)
+
+    return merge_map_list([memory_profile, operator_profile])
+
+
+def profile(func):
+    """Decorator for profiling an MXNet operation.
+    Uses the MXNet profiler to collect metrics on memory usage and execution time
+    of the operation.
+
+    Parameters
+    ----------
+    func:
+        Operation to be executed and timed.
+
+    Returns
+    -------
+    res, profiler output. res is the return value from the operator execution;
+    profiler output is a dictionary with a summary of the operation execution.
+    Example output : { "add": [{"avg_time_mem_alloc_cpu/0": 207618.0469,
+                                "avg_time_forward_broadcast_add": 4.204,
+                                "avg_time_backward_broadcast_add": 5.6288,
+                                "inputs": {
+                                    "lhs": [1024, 1024],
+                                    "rhs": [1024, 1024]
+                                }}]
+                     }
+    """
+
+    @functools.wraps(func)
+    def profile_it(*args, **kwargs):
+        # Profile the operation
+        profiler.set_config(profile_all=True, aggregate_stats=True)
+        profiler.set_state('run')
+        res = func(*args, **kwargs)
+        profiler.set_state('stop')
+
+        # Prepare the results
+        profiler_dump = profiler.dumps(reset=True)
+
+        # args[0] is assumed to be the operator name; if not found, check for the block name.
+ # NOTE: This parameter should be removed when we get away from parsing + # profiler output and start using new profiler APIs - get_summary(), reset() + if len(args) > 0: + operator_name = args[0].__name__ + elif 'block' in kwargs: + operator_name = kwargs['block']._op_name + else: + raise ValueError("Unable to identify operator name to extract profiler output!") + + # Get the MXNet profile output + profiler_output = parse_profiler_dump(operator_name, profiler_dump) + return res, profiler_output + + return profile_it diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 38cc0b927c43..b8117f39377a 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -67,7 +67,7 @@ def pack_lib(name, libs, include_gcov_data = false) { sh returnStatus: true, script: """ set +e echo "Packing ${libs} into ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +for i in \$(echo ${libs} | sed -e 's/,/ /g'); do md5sum \$i; done return 0 """ stash includes: libs, name: name @@ -86,7 +86,7 @@ def unpack_and_init(name, libs, include_gcov_data = false) { sh returnStatus: true, script: """ set +e echo "Unpacked ${libs} from ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +for i in \$(echo ${libs} | sed -e 's/,/ /g'); do md5sum \$i; done return 0 """ if (include_gcov_data) { diff --git a/ci/build_windows.py b/ci/build_windows.py index e8658995b68e..7ec24395e22e 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -44,6 +44,8 @@ class BuildFlavour(Enum): WIN_CPU = 'WIN_CPU' WIN_CPU_MKLDNN = 'WIN_CPU_MKLDNN' + WIN_CPU_MKLDNN_MKL = 'WIN_CPU_MKLDNN_MKL' + WIN_CPU_MKL = 'WIN_CPU_MKL' WIN_GPU = 'WIN_GPU' WIN_GPU_MKLDNN = 'WIN_GPU_MKLDNN' @@ -72,8 +74,34 @@ class BuildFlavour(Enum): '-DUSE_LAPACK=1 ' '-DUSE_DIST_KVSTORE=0 ' '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=1 ' '-DCMAKE_BUILD_TYPE=Release') + , 'WIN_CPU_MKLDNN_MKL': ('-DUSE_CUDA=0 ' + '-DUSE_CUDNN=0 ' + '-DUSE_NVRTC=0 ' + '-DUSE_OPENCV=1 ' + '-DUSE_OPENMP=1 ' + '-DUSE_PROFILER=1 ' + '-DUSE_BLAS=mkl ' + '-DUSE_LAPACK=1 ' + '-DUSE_DIST_KVSTORE=0 ' + '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=1 ' + '-DCMAKE_BUILD_TYPE=Release') + + , 'WIN_CPU_MKL': ('-DUSE_CUDA=0 ' + '-DUSE_CUDNN=0 ' + '-DUSE_NVRTC=0 ' + '-DUSE_OPENCV=1 ' + '-DUSE_OPENMP=1 ' + '-DUSE_PROFILER=1 ' + '-DUSE_BLAS=mkl ' + '-DUSE_LAPACK=1 ' + '-DUSE_DIST_KVSTORE=0 ' + '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=0 ' + '-DCMAKE_BUILD_TYPE=Release') , 'WIN_GPU': ('-DUSE_CUDA=1 ' '-DUSE_CUDNN=1 ' '-DUSE_NVRTC=1 ' @@ -218,6 +246,8 @@ def main(): os.environ["OpenCV_DIR"] = "C:\\Program Files\\OpenCV-v3.4.1\\build" if 'CUDA_PATH' not in os.environ: os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2" + if 'MKL_ROOT' not in os.environ: + os.environ["MKL_ROOT"] = "C:\\Program Files (x86)\\IntelSWTools\\compilers_and_libraries\\windows\\mkl" windows_build(args) elif system == 'Linux' or system == 'Darwin': diff --git a/ci/docker/Dockerfile.build.centos7_gpu b/ci/docker/Dockerfile.build.centos7_gpu index cf76f22a9f0a..1a927c4d5832 100644 --- a/ci/docker/Dockerfile.build.centos7_gpu +++ b/ci/docker/Dockerfile.build.centos7_gpu @@ -29,7 +29,7 @@ RUN /work/centos7_ccache.sh COPY install/centos7_python.sh /work/ RUN /work/centos7_python.sh -ENV CUDNN_VERSION=7.3.1.20 +ENV CUDNN_VERSION=7.6.0.64 COPY install/centos7_cudnn.sh /work/ RUN /work/centos7_cudnn.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_base_cpu b/ci/docker/Dockerfile.build.ubuntu_base_cpu index c3ad2e90fb8d..a75ed0255d82 100644 --- 
a/ci/docker/Dockerfile.build.ubuntu_base_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_base_cpu @@ -25,6 +25,7 @@ WORKDIR /work/deps RUN apt-get update && apt-get -y install sudo +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_base_gpu b/ci/docker/Dockerfile.build.ubuntu_base_gpu index 94e49b6fb297..40e1da657203 100644 --- a/ci/docker/Dockerfile.build.ubuntu_base_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_base_gpu @@ -21,12 +21,11 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDNN_VERSION=7.3.1.20 - WORKDIR /work/deps RUN apt-get update && apt-get -y install sudo +ENV CUDNN_VERSION=7.6.0.64 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda index 08c67cd660f8..0607ec1a5e75 100644 --- a/ci/docker/Dockerfile.build.ubuntu_build_cuda +++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda @@ -23,8 +23,6 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDNN_VERSION=7.3.1.20 - WORKDIR /work/deps COPY install/ubuntu_core.sh /work/ @@ -45,6 +43,8 @@ COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh COPY install/ubuntu_mklml.sh /work/ RUN /work/ubuntu_mklml.sh + +ENV CUDNN_VERSION=7.6.0.64 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh @@ -62,4 +62,3 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu index 2df9f5887f54..35dcf3ed7410 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_cpu @@ -70,6 +70,7 @@ COPY install/ubuntu_docs.sh /work/ COPY install/docs_requirements /work/ RUN /work/ubuntu_docs.sh +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ @@ -78,4 +79,3 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 index 6ec4a1fe415f..46d27e35022b 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 @@ -67,16 +67,16 @@ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ RUN /work/ubuntu_tutorials.sh +ENV CUDNN_VERSION=7.6.0.64 +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh -ENV CUDNN_VERSION=7.3.1.20 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 index 2730cc2caee1..19530a212424 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 @@ -67,15 +67,16 @@ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ RUN /work/ubuntu_tutorials.sh +ENV CUDNN_VERSION=7.6.0.64 +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh -ENV CUDNN_VERSION=7.3.1.20 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - COPY runtime_functions.sh /work/ WORKDIR 
/work/mxnet diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 index 316c81d8a6e1..f239eec4af27 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 @@ -67,15 +67,15 @@ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ RUN /work/ubuntu_tutorials.sh +ENV CUDNN_VERSION=7.6.0.64 +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh -ENV CUDNN_VERSION=7.3.1.20 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - COPY runtime_functions.sh /work/ WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu index 934aded5101d..a667f7b7a94f 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu @@ -20,8 +20,6 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDNN_VERSION=7.3.1.20 - WORKDIR /work/deps COPY install/ubuntu_core.sh /work/ @@ -72,6 +70,7 @@ RUN /work/ubuntu_tutorials.sh COPY install/ubuntu_nightly_tests.sh /work/ RUN /work/ubuntu_nightly_tests.sh +ENV CUDNN_VERSION=7.6.0.64 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh @@ -83,4 +82,3 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/install/docs_requirements b/ci/docker/install/docs_requirements index 3cfef1e33901..f78dca2bc655 100644 --- a/ci/docker/install/docs_requirements +++ b/ci/docker/install/docs_requirements @@ -26,8 +26,8 @@ h5py==2.8.0rc1 mock==2.0.0 nose==1.3.7 nose-timer==0.7.3 -numpy<=1.15.2,>=1.8.2 -pylint==1.8.3 +numpy>1.16.0,<2.0.0 +pylint==2.3.1; python_version >= '3.0' pypandoc==1.4 recommonmark==0.4.0 requests<2.19.0,>=2.18.4 diff --git a/ci/docker/install/ubuntu_mklml.sh b/ci/docker/install/ubuntu_mklml.sh index e50b6d273b8c..f97ce10e8e85 100755 --- a/ci/docker/install/ubuntu_mklml.sh +++ b/ci/docker/install/ubuntu_mklml.sh @@ -21,5 +21,5 @@ # the whole docker cache for the image set -ex -wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.18/mklml_lnx_2019.0.3.20190220.tgz +wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.19/mklml_lnx_2019.0.5.20190502.tgz tar -zxf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_* diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh index 2d8b019372c7..65982eead389 100755 --- a/ci/docker/install/ubuntu_publish.sh +++ b/ci/docker/install/ubuntu_publish.sh @@ -66,5 +66,5 @@ python2 get-pip.py apt-get remove -y python3-urllib3 -pip2 install nose cpplint==1.3.0 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 -pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip2 install nose cpplint==1.3.0 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index 23158ba4c068..2ca0cceec515 100755 --- a/ci/docker/install/ubuntu_python.sh +++ 
b/ci/docker/install/ubuntu_python.sh
@@ -30,5 +30,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py
 python3 get-pip.py
 python2 get-pip.py
 
-pip2 install nose cpplint==1.3.0 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
-pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
+pip2 install nose cpplint==1.3.0 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 Cython==0.29.7
+pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 Cython==0.29.7
diff --git a/ci/docker/install/ubuntu_tutorials.sh b/ci/docker/install/ubuntu_tutorials.sh
index 4e40426ed85c..d82763e8fd3e 100755
--- a/ci/docker/install/ubuntu_tutorials.sh
+++ b/ci/docker/install/ubuntu_tutorials.sh
@@ -25,5 +25,5 @@ apt-get update || true
 apt-get install graphviz python-opencv
 # scikit-learn past version 0.20 does not support python versions 2 and 3.4
-pip2 install jupyter matplotlib Pillow opencv-python "scikit-learn<0.21.0" graphviz tqdm mxboard scipy
-pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm mxboard scipy
+pip2 install jupyter matplotlib Pillow opencv-python "scikit-learn<0.21.0" graphviz==0.8.4 tqdm mxboard scipy gluoncv
+pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz==0.8.4 tqdm mxboard scipy gluoncv
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 091ffdf2551d..1ad67280617d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -41,6 +41,26 @@ scala_prepare() {
     export MAVEN_OPTS="-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
 }
 
+check_cython() {
+    set -ex
+    local python_ver=$1
+    local is_cython_used=$(python${python_ver} <=1.21.1
 requests<2.19.0,>=2.18.4
 graphviz<0.9.0,>=0.8.1
-numpy<=1.15.0,>=1.8.2
+numpy>1.16.0,<2.0.0
 mock
 nose
 nose-timer
diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1
index 1c4a72682ae5..df9b15ba1ec3 100644
--- a/ci/windows/test_py2_cpu.ps1
+++ b/ci/windows/test_py2_cpu.ps1
@@ -24,6 +24,10 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
 C:\Python27\Scripts\pip install -r tests\requirements.txt
 C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest
-if (! $?) { Throw ("Error running unittest") }
+if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
 C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train
-if (! $?) { Throw ("Error running train tests") }
+if (! $?) { Throw ("Error running train tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
+# Adding this extra test since it's not possible to set env var on the fly in Windows.
+$env:MXNET_SAFE_ACCUMULATION=1
+C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest\test_operator.py:test_norm
+if (! $?)
{ Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 index 8a6c8e9b44f9..f2974ff6f7b6 100644 --- a/ci/windows/test_py2_gpu.ps1 +++ b/ci/windows/test_py2_gpu.ps1 @@ -24,10 +24,14 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python27\Scripts\pip install -r tests\requirements.txt C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest -if (! $?) { Throw ("Error running unittest") } +if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error tests\python\train -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } +# Adding this extra test since it's not possible to set env var on the fly in Windows. +$env:MXNET_SAFE_ACCUMULATION=1 +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py:test_norm +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 index a7067f9f3f83..900bfd161cd0 100644 --- a/ci/windows/test_py3_cpu.ps1 +++ b/ci/windows/test_py3_cpu.ps1 @@ -24,6 +24,10 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest -if (! $?) { Throw ("Error running unittest") } +if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train -if (! $?) { Throw ("Error running train tests") } +if (! $?) { Throw ("Error running train tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } +# Adding this extra test since it's not possible to set env var on the fly in Windows. +$env:MXNET_SAFE_ACCUMULATION=1 +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest\test_operator.py:test_norm +if (! $?) 
{ Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 index 5fbc9f2f8036..b6e951b291fb 100644 --- a/ci/windows/test_py3_gpu.ps1 +++ b/ci/windows/test_py3_gpu.ps1 @@ -24,10 +24,14 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest -if (! $?) { Throw ("Error running unittest") } +if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } +# Adding this extra test since it's not possible to set env var on the fly in Windows. +$env:MXNET_SAFE_ACCUMULATION=1 +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py:test_norm +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/cmake/BuildCythonModules.cmake b/cmake/BuildCythonModules.cmake new file mode 100644 index 000000000000..d2c3a46f1a71 --- /dev/null +++ b/cmake/BuildCythonModules.cmake @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+function(add_cython_modules python_version)
+  unset(PYTHON_EXECUTABLE CACHE)
+  set(PYTHONINTERP_FOUND FALSE)
+  find_package(PythonInterp ${python_version} EXACT)
+  if(PYTHONINTERP_FOUND)
+    find_program(CYTHON_EXECUTABLE NAMES cython)
+    if(CYTHON_EXECUTABLE)
+      add_custom_command(TARGET mxnet POST_BUILD
+                         COMMAND ${CMAKE_COMMAND} -E env MXNET_LIBRARY_PATH=${CMAKE_BINARY_DIR}/libmxnet.so
+                                 ${PYTHON_EXECUTABLE} setup.py build_ext --inplace --with-cython
+                         WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/python")
+      message("-- Cython modules for python${python_version} will be built")
+      set(PYTHON${python_version}_FOUND 1 PARENT_SCOPE)
+    else()
+      message(FATAL_ERROR "-- Cython not found")
+    endif()
+  else()
+    set(PYTHON${python_version}_FOUND 0 PARENT_SCOPE)
+  endif()
+endfunction()
diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake
index 5f4af2d89c91..e16594794ae8 100644
--- a/cmake/ChooseBlas.cmake
+++ b/cmake/ChooseBlas.cmake
@@ -18,14 +18,14 @@
 set(BLAS "Open" CACHE STRING "Selected BLAS library")
 set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL")
 
-if(USE_MKL_IF_AVAILABLE)
-  if(NOT MKL_FOUND)
-    find_package(MKL)
-  endif()
-  if(MKL_FOUND)
-    if(USE_MKLDNN)
-      set(BLAS "open")
-    else()
+if(DEFINED USE_BLAS)
+  set(BLAS "${USE_BLAS}")
+else()
+  if(USE_MKL_IF_AVAILABLE)
+    if(NOT MKL_FOUND)
+      find_package(MKL)
+    endif()
+    if(MKL_FOUND)
       set(BLAS "MKL")
     endif()
   endif()
diff --git a/cmake/DownloadMKLML.cmake b/cmake/DownloadMKLML.cmake
index 7b0e5ecf7c9c..73a588fa8afe 100644
--- a/cmake/DownloadMKLML.cmake
+++ b/cmake/DownloadMKLML.cmake
@@ -19,12 +19,12 @@
 
 message(STATUS "Downloading MKLML...")
 
-set(MKLDNN_RELEASE v0.18)
-set(MKLML_RELEASE_FILE_SUFFIX 2019.0.3.20190220)
+set(MKLDNN_RELEASE v0.19)
+set(MKLML_RELEASE_FILE_SUFFIX 2019.0.5.20190502)
 
-set(MKLML_LNX_MD5 76354b74325cd293aba593d7cbe36b3f)
-set(MKLML_WIN_MD5 02286cb980f12af610c05e99dbd78755)
-set(MKLML_MAC_MD5 3b28da686a25a4cf995ca4fc5e30e514)
+set(MKLML_LNX_MD5 dfcea335652dbf3518e1d02cab2cea97)
+set(MKLML_WIN_MD5 ff8c5237570f03eea37377ccfc95a08a)
+set(MKLML_MAC_MD5 0a3d83ec1fed9ea318e8573bb5e14c24)
 
 if(MSVC)
   set(MKL_NAME "mklml_win_${MKLML_RELEASE_FILE_SUFFIX}")
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index 70405566d8ae..51fca23c1161 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -43,55 +43,6 @@ endif()
 
 # ---[ Root folders
 set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 
-if(USE_MKLDNN)
-
-  find_path(MKL_ROOT include/mkl_blas.h
-    PATHS $ENV{MKL_ROOT}
-    ${INTEL_ROOT}/mklml
-    ${DIRECT_DEPENDENCY_ROOTS}
-    DOC "Folder contains MKL"
-    )
-
-  # ---[ Find include dir
-  find_path(MKL_INCLUDE_DIR mkl_blas.h PATHS ${MKL_ROOT} PATH_SUFFIXES include)
-  set(__looked_for MKL_INCLUDE_DIR)
-
-  # ---[ Find libraries
-  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-    set(__path_suffixes lib lib/ia32)
-  else()
-    set(__path_suffixes lib lib/intel64)
-  endif()
-
-  set(__mkl_libs "")
-
-  if(WIN32)
-    list(APPEND __mkl_libs mklml_intel)
-  else()
-    list(APPEND __mkl_libs mklml_gnu)
-  endif()
-  list(APPEND __mkl_libs mkldnn)
-
-  foreach (__lib ${__mkl_libs})
-    set(__mkl_lib "${__lib}")
-    string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
-
-    if(MKL_USE_STATIC_LIBS)
-      set(__mkl_lib "lib${__mkl_lib}.a")
-    endif()
-
-    find_library(${__mkl_lib_upper}_LIBRARY
-          NAMES ${__mkl_lib}
-          PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.."
- PATH_SUFFIXES ${__path_suffixes} - DOC "The path to Intel(R) MKL ${__mkl_lib} library") - mark_as_advanced(${__mkl_lib_upper}_LIBRARY) - - list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY) - list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY}) - endforeach() - -else(USE_MKLDNN) # ---[ Options mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON) @@ -193,7 +144,7 @@ else(USE_MKLDNN) list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY}) endif() -endif(USE_MKLDNN) + include(FindPackageHandleStandardArgs) find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for}) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj index f81a35803171..68dcbfec5850 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj @@ -17,6 +17,7 @@ (ns org.apache.clojure-mxnet.image "Image API of Clojure package." + (:refer-clojure :exclude [read]) (:require [t6.from-scala.core :refer [$ $$] :as $] [org.apache.clojure-mxnet.dtype :as dtype] [org.apache.clojure-mxnet.ndarray :as ndarray] @@ -38,8 +39,10 @@ (s/def ::decode-image-opts (s/keys :opt-un [::color-flag ::to-rgb ::output])) -(defn decode-image - "Decodes an image from an input stream with OpenCV +(defn ^:deprecated decode-image + "DEPRECATED: use `decode` instead. + + Decodes an image from an input stream with OpenCV `input-stream`: `InputStream` - Contains the binary encoded image `color-flag`: 0 or 1 - Convert decoded image to grayscale (0) or color (1) `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB @@ -60,14 +63,47 @@ ([input-stream] (decode-image input-stream {}))) +(s/def ::color #{:grayscale :color}) +(s/def ::decode-image-opts-2 (s/keys :opt-un [::color ::to-rgb ::output])) + +(defn- color->int [color] + (case color + :grayscale 0 + :color 1)) + +(defn decode + "Decodes an image from an input stream with OpenCV. + `input-stream`: `InputStream` - Contains the binary encoded image + `color`: keyword in `#{:color :grayscale}` - Convert decoded image to + grayscale or color + `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB + format (instead of opencv's default BGR) + `output`: nil or `NDArray` + returns: `NDArray` with dtype uint8 + + Ex: + (decode input-stream) + (decode input-stream {:color :color}) + (decode input-stream {:color :grayscale :output nd})" + ([input-stream {:keys [color to-rgb output] + :or {color :color to-rgb true output nil} + :as opts}] + (util/validate! ::input-stream input-stream "Invalid input stream") + (util/validate! ::decode-image-opts-2 opts "Invalid options for decoding") + (Image/imDecode input-stream (color->int color) to-rgb ($/option output))) + ([input-stream] + (decode input-stream {}))) + (s/def ::filename string?) (s/def ::optional-color-flag (s/or :none nil? :some ::color-flag)) (s/def ::optional-to-rgb (s/or :none nil? :some ::to-rgb)) -(defn read-image - "Reads an image file and returns an ndarray with OpenCV. It returns image in +(defn ^:deprecated read-image + "DEPRECATED: use `read` instead. + + Reads an image file and returns an ndarray with OpenCV. It returns image in RGB by default instead of OpenCV's default BGR. 
`filename`: string - Name of the image file to be loaded
   `color-flag`: 0 or 1 - Convert decoded image to grayscale (0) or color (1)
@@ -95,11 +131,43 @@
   ([filename]
    (read-image filename {})))
 
+(defn read
+  "Reads an image file and returns an ndarray with OpenCV. It returns image in
+  RGB by default instead of OpenCV's default BGR.
+  `filename`: string - Name of the image file to be loaded
+  `color`: keyword in `#{:color :grayscale}` - Convert decoded image to
+           grayscale or color
+  `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB
+            format (instead of opencv's default BGR)
+  `output`: nil or `NDArray`
+  returns: `NDArray` with dtype uint8
+
+  Ex:
+    (read \"cat.jpg\")
+    (read \"cat.jpg\" {:color :grayscale})
+    (read \"cat.jpg\" {:color :color :output nd})"
+  ([filename {:keys [color to-rgb output]
+              :or {color :color to-rgb nil output nil}
+              :as opts}]
+   (util/validate! ::filename filename "Invalid filename")
+   (util/validate! ::color color "Invalid color")
+   (util/validate! ::optional-to-rgb to-rgb "Invalid conversion flag")
+   (util/validate! ::output output "Invalid output")
+   (Image/imRead
+    filename
+    ($/option (when color (color->int color)))
+    ($/option to-rgb)
+    ($/option output)))
+  ([filename]
+   (read filename {})))
+
 (s/def ::int int?)
 (s/def ::optional-int (s/or :none nil? :some int?))
 
-(defn resize-image
-  "Resizes the image array to (width, height)
+(defn ^:deprecated resize-image
+  "DEPRECATED: use `resize` instead.
+
+  Resizes the image array to (width, height)
   `input`: `NDArray` - source image in NDArray
   `w`: int - Width of resized image
   `h`: int - Height of resized image
@@ -122,6 +190,30 @@
   ([input w h]
    (resize-image input w h {})))
 
+(defn resize
+  "Resizes the image array to (width, height)
+  `input`: `NDArray` - source image in NDArray
+  `w`: int - Width of resized image
+  `h`: int - Height of resized image
+  `interpolation`: Interpolation method. Default is INTER_LINEAR
+  `output`: nil or `NDArray`
+  returns: `NDArray`
+
+  Ex:
+    (resize nd-img 300 300)
+    (resize nd-img 28 28 {:output nd})"
+  ([input w h {:keys [interpolation output]
+               :or {interpolation nil output nil}
+               :as opts}]
+   (util/validate! ::ndarray input "Invalid input array")
+   (util/validate! ::int w "Invalid width")
+   (util/validate! ::int h "Invalid height")
+   (util/validate! ::optional-int interpolation "Invalid interpolation")
+   (util/validate! ::output output "Invalid output")
+   (Image/imResize input w h ($/option interpolation) ($/option output)))
+  ([input w h]
+   (resize input w h {})))
+
 (defn apply-border
   "Pad image border with OpenCV.
   `input`: `NDArray` - source image in NDArray
@@ -193,7 +285,17 @@
 
 (s/def ::to-image-ndarray
   (s/and ::ndarray ::all-bytes ::rgb-array))
 
-(defn to-image
+(defn ^:deprecated to-image
+  "DEPRECATED: use `ndarray->image` instead.
+
+  Convert a NDArray image in RGB format to a real image.
+  `input`: `NDArray` - Source image in NDArray
+  returns: `BufferedImage`"
+  [input]
+  (util/validate! ::to-image-ndarray input "Invalid input array")
+  (Image/toImage input))
+
+(defn ndarray->image
   "Convert a NDArray image in RGB format to a real image.
`input`: `NDArray` - Source image in NDArray returns: `BufferedImage`" diff --git a/contrib/clojure-package/test/good-test-ndarray-api.clj b/contrib/clojure-package/test/good-test-ndarray-api.clj index 7554089d0ba0..f7f58f8f7c88 100644 --- a/contrib/clojure-package/test/good-test-ndarray-api.clj +++ b/contrib/clojure-package/test/good-test-ndarray-api.clj @@ -106,7 +106,7 @@ - Defined in src/operator/nn/batch_norm.cc:L574 + Defined in src/operator/nn/batch_norm.cc:L572 `data`: Input data to batch normalization `gamma`: gamma array diff --git a/contrib/clojure-package/test/good-test-symbol-api.clj b/contrib/clojure-package/test/good-test-symbol-api.clj index c7450f8eb5c1..3081304ebdb3 100644 --- a/contrib/clojure-package/test/good-test-symbol-api.clj +++ b/contrib/clojure-package/test/good-test-symbol-api.clj @@ -119,7 +119,7 @@ - Defined in src/operator/nn/batch_norm.cc:L574 + Defined in src/operator/nn/batch_norm.cc:L572 `data`: Input data to batch normalization (optional) `gamma`: gamma array (optional) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj index 23b88d07e896..fd200f18a78f 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj @@ -19,73 +19,102 @@ (:require [org.apache.clojure-mxnet.image :as image] [org.apache.clojure-mxnet.ndarray :as ndarray] [clojure.java.io :as io] - [clojure.test :refer :all]) + [clojure.test :refer [deftest is use-fixtures]]) (:import (javax.imageio ImageIO) (java.io File))) (def tmp-dir (System/getProperty "java.io.tmpdir")) (def image-path (.getAbsolutePath (io/file tmp-dir "Pug-Cookie.jpg"))) +(def image-src-path "test/test-images/Pug-Cookie.jpg") -(defn download-image [] - (with-open [in (io/input-stream "https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg") - out (io/output-stream (io/file image-path))] +(defn- cp + "Copy from filepath `from` to filepath `to`." + [from to] + (with-open [in (io/input-stream (io/file from)) + out (io/output-stream (io/file to))] (io/copy in out))) -(defn delete-image [] - (io/delete-file image-path)) +(defn- rm + "Removes `filepath`." + [filepath] + (io/delete-file filepath)) -(defn with-downloaded-image [f] - (download-image) - (f) - (delete-image)) +(defn- with-file + "Provides `src-path` in `dest-path` for the test function `f` to use." 
+ [src-path dest-path] + (fn [f] + (cp src-path dest-path) + (f) + (rm dest-path))) -(use-fixtures :once with-downloaded-image) +(use-fixtures :once (with-file image-src-path image-path)) (deftest test-decode-image - (let [img-arr (image/decode-image - (io/input-stream image-path)) - img-arr-2 (image/decode-image - (io/input-stream image-path) - {:color-flag image/GRAYSCALE})] + (let [img-arr (image/decode-image (io/input-stream image-path)) + img-arr-2 (image/decode-image (io/input-stream image-path) + {:color-flag image/GRAYSCALE})] + (is (= [576 1024 3] (ndarray/shape-vec img-arr))) + (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) + +(deftest test-decode + (let [img-arr (image/decode (io/input-stream image-path)) + img-arr-2 (image/decode (io/input-stream image-path) + {:color :grayscale})] (is (= [576 1024 3] (ndarray/shape-vec img-arr))) (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) (deftest test-read-image (let [img-arr (image/read-image image-path) - img-arr-2 (image/read-image - image-path - {:color-flag image/GRAYSCALE})] + img-arr-2 (image/read-image image-path {:color-flag image/GRAYSCALE})] + (is (= [576 1024 3] (ndarray/shape-vec img-arr))) + (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) + +(deftest test-read + (let [img-arr (image/read image-path) + img-arr-2 (image/read image-path {:color :grayscale})] (is (= [576 1024 3] (ndarray/shape-vec img-arr))) (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) (deftest test-resize-image - (let [img-arr (image/read-image image-path) + (let [img-arr (image/read image-path) resized-arr (image/resize-image img-arr 224 224)] (is (= [224 224 3] (ndarray/shape-vec resized-arr))))) -(deftest test-crop-image - (let [img-arr (image/read-image image-path) +(deftest test-resize + (let [img-arr (image/read image-path) + resized-arr (image/resize img-arr 224 224)] + (is (= [224 224 3] (ndarray/shape-vec resized-arr))))) + +(deftest test-fixed-crop + (let [img-arr (image/read image-path) cropped-arr (image/fixed-crop img-arr 0 0 224 224)] (is (= [224 224 3] (ndarray/shape-vec cropped-arr))))) (deftest test-apply-border - (let [img-arr (image/read-image image-path) + (let [img-arr (image/read image-path) padded-arr (image/apply-border img-arr 1 1 1 1)] (is (= [578 1026 3] (ndarray/shape-vec padded-arr))))) (deftest test-to-image - (let [img-arr (image/read-image image-path) - resized-arr (image/resize-image img-arr 224 224) + (let [img-arr (image/read image-path) + resized-arr (image/resize img-arr 224 224) new-img (image/to-image resized-arr)] (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) +(deftest test-ndarray->image + (let [img-arr (image/read image-path) + resized-arr (image/resize img-arr 224 224) + new-img (image/ndarray->image resized-arr)] + (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) + (deftest test-draw-bounding-box! (let [orig-img (ImageIO/read (new File image-path)) - new-img (-> orig-img - (image/draw-bounding-box! [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} - {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] - {:stroke 2 - :names ["pug" "cookie"] - :transparency 0.8 - :font-size-mult 2.0}))] + new-img (image/draw-bounding-box! 
+ orig-img + [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} + {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] + {:stroke 2 + :names ["pug" "cookie"] + :transparency 0.8 + :font-size-mult 2.0})] (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) diff --git a/cpp-package/example/get_data.sh b/cpp-package/example/get_data.sh index b0913bdb684d..e11077234ade 100755 --- a/cpp-package/example/get_data.sh +++ b/cpp-package/example/get_data.sh @@ -51,11 +51,12 @@ download () { (($? != 0)) && exit 1 || return 0 } +# MNIST dataset from: http://yann.lecun.com/exdb/mnist/ FILES=( - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz" - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz" - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz" - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz" "http://data.mxnet.io/data/mnist_train.csv.gz") for FILE in ${FILES[@]}; do diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp index fa5600190f95..cb952aa69f54 100644 --- a/cpp-package/example/inference/inception_inference.cpp +++ b/cpp-package/example/inference/inception_inference.cpp @@ -302,13 +302,11 @@ void Predictor::PredictImage(const std::string& image_file) { // The output is available in executor->outputs. auto array = executor->outputs[0].Copy(Context::cpu()); - /* * Find out the maximum accuracy and the index associated with that accuracy. * This is done by using the argmax operator on NDArray. */ auto predicted = array.ArgmaxChannel(); - /* * Wait until all the previous write operations on the 'predicted' * NDArray to be complete before we read it. @@ -317,7 +315,7 @@ void Predictor::PredictImage(const std::string& image_file) { */ predicted.WaitToRead(); - int best_idx = predicted.At(0, 0); + int best_idx = predicted.At(0); float best_accuracy = array.At(0, best_idx); if (output_labels.empty()) { @@ -331,9 +329,7 @@ void Predictor::PredictImage(const std::string& image_file) { Predictor::~Predictor() { - if (executor) { - delete executor; - } + delete executor; MXNotifyShutdown(); } diff --git a/cpp-package/example/test_ndarray_copy.cpp b/cpp-package/example/test_ndarray_copy.cpp new file mode 100644 index 000000000000..a3b3011993fa --- /dev/null +++ b/cpp-package/example/test_ndarray_copy.cpp @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+#include <vector>
+#include "mxnet/c_api.h"
+#include "dmlc/logging.h"
+#include "mxnet-cpp/MxNetCpp.h"
+using namespace mxnet::cpp;
+
+enum TypeFlag {
+  kFloat32 = 0,
+  kFloat64 = 1,
+  kFloat16 = 2,
+  kUint8 = 3,
+  kInt32 = 4,
+  kInt8 = 5,
+  kInt64 = 6,
+};
+
+/*
+ * The file is used for testing if there exist type inconsistency
+ * when using Copy API to create a new NDArray.
+ * By running: build/test_ndarray_copy.
+ */
+int main(int argc, char** argv) {
+  std::vector<mx_uint> shape1{128, 2, 32};
+  Shape shape2(32, 8, 64);
+
+  int gpu_count = 0;
+  if (MXGetGPUCount(&gpu_count) != 0) {
+    LOG(ERROR) << "MXGetGPUCount failed";
+    return -1;
+  }
+
+  Context context = (gpu_count > 0) ? Context::gpu() : Context::cpu();
+
+  NDArray src1(shape1, context, true, kFloat16);
+  NDArray src2(shape2, context, false, kInt8);
+  NDArray dst1, dst2;
+  dst1 = src1.Copy(context);
+  dst2 = src2.Copy(context);
+  NDArray::WaitAll();
+  CHECK_EQ(src1.GetDType(), dst1.GetDType());
+  CHECK_EQ(src2.GetDType(), dst2.GetDType());
+  return 0;
+}
diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h
index 6f37d91aa68e..c4d51c59a532 100644
--- a/cpp-package/include/mxnet-cpp/ndarray.h
+++ b/cpp-package/include/mxnet-cpp/ndarray.h
@@ -131,18 +131,21 @@ class NDArray {
   /*!
   * \brief construct a new dynamic NDArray
   * \param shape the shape of array
-  * \param constext context of NDArray
+  * \param context context of NDArray
   * \param delay_alloc whether delay the allocation
+  * \param dtype data type of NDArray
   */
   NDArray(const std::vector<mx_uint> &shape, const Context &context,
-          bool delay_alloc = true);
+          bool delay_alloc = true, int dtype = 0);
   /*!
   * \brief construct a new dynamic NDArray
   * \param shape the shape of array
   * \param constext context of NDArray
   * \param delay_alloc whether delay the allocation
+  * \param dtype data type of NDArray
   */
-  NDArray(const Shape &shape, const Context &context, bool delay_alloc = true);
+  NDArray(const Shape &shape, const Context &context,
+          bool delay_alloc = true, int dtype = 0);
   NDArray(const mx_float *data, size_t size);
   /*!
   * \brief construct a new dynamic NDArray
@@ -318,6 +321,12 @@ class NDArray {
   */
   size_t Offset(size_t c, size_t h, size_t w) const;
   /*!
+  * \brief return value of the element at (index)
+  * \param index position
+  * \return value of the one dimensional array
+  */
+  mx_float At(size_t index) const;
+  /*!
* \brief return value of the element at (h, w)
  * \param h height position
  * \param w width position
diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp
index d0438305a62e..ed23c76ddc00 100644
--- a/cpp-package/include/mxnet-cpp/ndarray.hpp
+++ b/cpp-package/include/mxnet-cpp/ndarray.hpp
@@ -47,17 +47,18 @@ inline NDArray::NDArray(const NDArrayHandle &handle) {
   blob_ptr_ = std::make_shared<NDBlob>(handle);
 }
 inline NDArray::NDArray(const std::vector<mx_uint> &shape, const Context &context,
-                        bool delay_alloc) {
+                        bool delay_alloc, int dtype) {
   NDArrayHandle handle;
-  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.size(), context.GetDeviceType(),
-                           context.GetDeviceId(), delay_alloc, &handle),
+  CHECK_EQ(MXNDArrayCreateEx(shape.data(), shape.size(), context.GetDeviceType(),
+                             context.GetDeviceId(), delay_alloc, dtype, &handle),
            0);
   blob_ptr_ = std::make_shared<NDBlob>(handle);
 }
-inline NDArray::NDArray(const Shape &shape, const Context &context, bool delay_alloc) {
+inline NDArray::NDArray(const Shape &shape, const Context &context,
+                        bool delay_alloc, int dtype) {
   NDArrayHandle handle;
-  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(),
-                           context.GetDeviceId(), delay_alloc, &handle),
+  CHECK_EQ(MXNDArrayCreateEx(shape.data(), shape.ndim(), context.GetDeviceType(),
+                             context.GetDeviceId(), delay_alloc, dtype, &handle),
            0);
   blob_ptr_ = std::make_shared<NDBlob>(handle);
 }
@@ -208,7 +209,7 @@ inline void NDArray::SyncCopyToCPU(std::vector<mx_float> *data, size_t size) {
   MXNDArraySyncCopyToCPU(blob_ptr_->handle_, data->data(), size);
 }
 inline NDArray NDArray::Copy(const Context &ctx) const {
-  NDArray ret(GetShape(), ctx);
+  NDArray ret(GetShape(), ctx, true, this->GetDType());
   Operator("_copyto")(*this).Invoke(ret);
   return ret;
 }
@@ -374,11 +375,15 @@ inline void NDArray::Save(const std::string &file_name,
 }
 
 inline size_t NDArray::Offset(size_t h, size_t w) const {
-  return (h * GetShape()[1]) + w;
+  auto const shape = GetShape();
+  CHECK_EQ(shape.size(), 2) << "The NDArray needs to be 2 dimensional.";
+
+  return (h * shape[1]) + w;
 }
 
 inline size_t NDArray::Offset(size_t c, size_t h, size_t w) const {
   auto const shape = GetShape();
+  CHECK_EQ(shape.size(), 3) << "The NDArray needs to be 3 dimensional.";
   return h * shape[0] * shape[2] + w * shape[0] + c;
 }
 
@@ -390,6 +395,13 @@ inline mx_float NDArray::At(size_t c, size_t h, size_t w) const {
   return GetData()[Offset(c, h, w)];
 }
 
+inline mx_float NDArray::At(size_t index) const {
+  auto shape = GetShape();
+  CHECK_EQ(shape.size(), 1) << "The NDArray needs to be 1 dimensional.";
+  CHECK_LT(index, shape[0]) << "Specified index is out of range.";
+  return GetData()[index];
+}
+
 inline size_t NDArray::Size() const {
   size_t ret = 1;
   for (auto &i : GetShape()) ret *= i;
diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh
index 2d1f8e4f68e6..ef7fceacfd6e 100755
--- a/cpp-package/tests/ci_test.sh
+++ b/cpp-package/tests/ci_test.sh
@@ -57,6 +57,9 @@ cp ../../build/cpp-package/example/test_kvstore .
 cp ../../build/cpp-package/example/test_score .
 ./test_score 0.93
 
+cp ../../build/cpp-package/example/test_ndarray_copy .
+./test_ndarray_copy
+
 sh unittests/unit_test_mlp_csv.sh
 
 cd inference
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index ec3977601c92..ca4ac363dc41 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -19,7 +19,7 @@
  */
 
 /* Installation page display functions for install selector */
-var versionSelect = defaultVersion = 'v1.4.0';
+var versionSelect = defaultVersion = 'v1.4.1';
 var platformSelect = 'Linux';
 var languageSelect = 'Python';
 var processorSelect = 'CPU';
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index 34f675853924..a5f0bed636e9 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -23,14 +23,14 @@
-          MXNet 1.4.0 Released
-          This release introduces the Java Inference API and Julia API, as well as Control Flow Operators, MKLDNN optimizations, and SVRG optimization.
-          Learn More
+          MXNet 1.4.1 Released
+          This patch release features bug fixes and performance improvements.
+          Learn More
           A 60-minute Gluon Crash Course
           Check out our quick overview of how to use Gluon, the imperative interface of MXNet.
-          Learn More
+          Learn More
           Get the latest news from MXNet blogs on Medium
diff --git a/docs/tutorials/tensorrt/wavenet_optimized.png b/docs/_static/tutorials/tensorrt/wavenet_optimized.png similarity index 100% rename from docs/tutorials/tensorrt/wavenet_optimized.png rename to docs/_static/tutorials/tensorrt/wavenet_optimized.png diff --git a/docs/tutorials/tensorrt/wavenet_unoptimized.png b/docs/_static/tutorials/tensorrt/wavenet_unoptimized.png similarity index 100% rename from docs/tutorials/tensorrt/wavenet_unoptimized.png rename to docs/_static/tutorials/tensorrt/wavenet_unoptimized.png diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md index f60e7f141adf..d4358ddcea22 100644 --- a/docs/api/python/ndarray/contrib.md +++ b/docs/api/python/ndarray/contrib.md @@ -75,6 +75,7 @@ In the rest of this document, we list routines provided by the `ndarray.contrib` isinf isfinite isnan + index_array index_copy getnnz edge_id diff --git a/docs/api/python/optimization/optimization.md b/docs/api/python/optimization/optimization.md index 03448123a14f..47f99f3602f8 100644 --- a/docs/api/python/optimization/optimization.md +++ b/docs/api/python/optimization/optimization.md @@ -171,8 +171,11 @@ for examples. ```eval_rst .. automodule:: mxnet.optimizer :members: + :exclude-members: NDabs + .. automodule:: mxnet.lr_scheduler :members: + .. automodule:: mxnet.initializer :members: ``` diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md index 2a6a5efe29be..38537f7487c7 100644 --- a/docs/api/python/symbol/contrib.md +++ b/docs/api/python/symbol/contrib.md @@ -72,6 +72,7 @@ In the rest of this document, we list routines provided by the `symbol.contrib` foreach while_loop cond + index_array index_copy getnnz edge_id diff --git a/docs/api/python/symbol/linalg.md b/docs/api/python/symbol/linalg.md index 5b467b501247..436bab78c451 100644 --- a/docs/api/python/symbol/linalg.md +++ b/docs/api/python/symbol/linalg.md @@ -59,6 +59,7 @@ In the rest of this document, we list routines provided by the `symbol.linalg` p makediag extracttrian maketrian + inverse ``` ## API Reference diff --git a/docs/community/contribute.md b/docs/community/contribute.md index 50aa1a607a44..2dd92648715b 100644 --- a/docs/community/contribute.md +++ b/docs/community/contribute.md @@ -74,7 +74,6 @@ Please join either or both of the MXNet mailing lists: To join the MXNet slack channel send request to the contributor mailing list. * email - * [archive](https://the-asf.slackarchive.io/mxnet) ### Social Media diff --git a/docs/community/ecosystem.md b/docs/community/ecosystem.md index 1e2bf07335d3..e7e101372115 100644 --- a/docs/community/ecosystem.md +++ b/docs/community/ecosystem.md @@ -85,4 +85,4 @@ Community contributions to MXNet have added many new valuable features and funct ## Contributions -Do you know of a project or resource in the MXNet ecosystem that should be listed here? Or would you like to get involved by providing your own contribution? Check out the [guide for contributing to MXNet](contribute.html), and browse the [design proposals](https://cwiki.apache.org/confluence/display/MXNET/Design+Proposals) to see what others are working on. You might find something you would like to help with or use those design docs as a template for your own proposal. 
Use one of the [developer communication channels](https://mxnet.incubator.apache.org/community/contribute.html#mxnet-dev-communications) if you would like to know more, or [create a GitHub issue](https://github.com/apache/incubator-mxnet/issues/new) if you would like to propose something for the MXNet ecosystem.
+Do you know of a project or resource in the MXNet ecosystem that should be listed here? Or would you like to get involved by providing your own contribution? Check out the [guide for contributing to MXNet](contribute.html), and browse the [design proposals](https://cwiki.apache.org/confluence/display/MXNET/Proposals) to see what others are working on. You might find something you would like to help with or use those design docs as a template for your own proposal. Use one of the [developer communication channels](https://mxnet.incubator.apache.org/community/contribute.html#mxnet-dev-communications) if you would like to know more, or [create a GitHub issue](https://github.com/apache/incubator-mxnet/issues/new) if you would like to propose something for the MXNet ecosystem.
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index c5ebd54c55a1..cdd528cd8c8f 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -47,7 +47,7 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
   - The maximum number of concurrent threads that do the memory copy job on each GPU.
 * MXNET_CPU_WORKER_NTHREADS
   - Values: Int ```(default=1)```
-  - The maximum number of scheduling threads on CPU. It specifies how many operators can be run in parallel.
+  - The maximum number of scheduling threads on CPU. It specifies how many operators can be run in parallel. Note that most CPU operators are parallelized by OpenMP. To change the number of threads used by individual operators, please set `OMP_NUM_THREADS` instead.
 * MXNET_CPU_PRIORITY_NTHREADS
   - Values: Int ```(default=4)```
   - The number of threads given to prioritized CPU jobs.
@@ -146,7 +146,7 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
   - Values: 0(false) or 1(true) ```(default=0)```
   - If true, MXNet tries to use tree reduction for Push and Pull communication.
   - Otherwise, MXNet uses the default Push and Pull implementation.
-  - [Tree reduction technology](http://www.sysml.cc/doc/178.pdf) has been shown to be faster than the standard ```--kv-store device``` Push/Pull and ```--kv-store nccl``` Push/Pull for small batch sizes.
+  - Tree reduction technology has been shown to be faster than the standard ```--kv-store device``` Push/Pull and ```--kv-store nccl``` Push/Pull for small batch sizes.
 
 * MXNET_KVSTORE_LOGTREE
   - Values: 0(false) or 1(true) ```(default=0)```
@@ -199,6 +199,22 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
   - If set to '0', profiler records the events of the symbolic operators.
   - If set to '1', profiler records the events of all operators.
 
+## Interface between Python and the C API
+
+* MXNET_ENABLE_CYTHON
+  - Values: 0(false), 1(true) ```(default=1)```
+  - If set to 0, MXNet uses ctypes to interface with the C API.
+  - If set to 1, MXNet tries to use the cython modules for ndarray and symbol. If that fails, ctypes is used or an error occurs, depending on MXNET_ENFORCE_CYTHON.
+
+* MXNET_ENFORCE_CYTHON
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - This has an effect only if MXNET_ENABLE_CYTHON is 1.
+  - If set to 0, MXNet falls back to ctypes if importing the cython modules fails.
+  - If set to 1, MXNet raises an error if importing the cython modules fails.
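A minimal sketch of how to check from Python which of the two interfaces is actually in use; it relies on the `mx.nd._internal.NDArrayBase` naming contract spelled out in the paragraph that follows, and the snippet itself is illustrative rather than part of the patch:

```python
# Illustrative sketch (not part of this patch): check which interface MXNet loaded.
# Per this FAQ, NDArrayBase comes from mxnet._cy2/_cy3 when the cython modules
# are in use, and from mxnet._ctypes otherwise.
import mxnet as mx

backend = mx.nd._internal.NDArrayBase.__module__
if backend.startswith('mxnet._cy'):
    print('cython modules are in use:', backend)
else:
    print('ctypes is in use:', backend)
```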
+ +If cython modules are used, `mx.nd._internal.NDArrayBase` must be `mxnet._cy3.ndarray.NDArrayBase` for python 3 or `mxnet._cy2.ndarray.NDArrayBase` for python 2. +If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. + ## Other Environment Variables * MXNET_GPU_WORKER_NSTREAMS @@ -280,6 +296,19 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - Values: Int ```(default=4)``` - This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operator. +* MXNET_SUBGRAPH_BACKEND + - Values: String ```(default="")``` + - This variable controls the subgraph partitioning in MXNet. + - This variable is used to perform MKL-DNN FP32 operator fusion and quantization. Please refer to the [MKL-DNN operator list](../tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes. + +* MXNET_SAFE_ACCUMULATION + - Values: 0(false) or 1(true) ```(default=0)``` + - If this variable is set, the accumulation will enter the safe mode, meaning accumulation is done in a data type of higher precision than + the input data type, leading to more accurate accumulation results with a possible performance loss and backward compatibility loss. + For example, when the variable is set to 1(true), if the input data type is float16, then the accumulation will be done + with float32. + - Model accuracies do not necessarily improve with this environment variable turned on. + Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` @@ -302,5 +331,5 @@ Settings for controlling OMP tuning - Set ```MXNET_USE_NUM_CORES_OPERATOR_TUNING``` to define num_cores to be used by operator tuning code. - This reduces operator tuning overhead when there are multiple instances of mxnet running in the system and we know that - each mxnet will take only partial num_cores available with system. + each mxnet process will use only part of the num_cores available in the system. - refer: https://github.com/apache/incubator-mxnet/pull/13602 diff --git a/docs/faq/float16.md b/docs/faq/float16.md index 323218ce7df6..465668610413 100644 --- a/docs/faq/float16.md +++ b/docs/faq/float16.md @@ -17,109 +17,158 @@ # Mixed precision training using float16 -In this tutorial you will walk through how one can train deep learning neural networks with mixed precision on supported hardware. You will first see how to use float16 (both with Gluon and Symbolic APIs) and then some techniques on achieving good performance and accuracy. +In this tutorial we will walk through how one can train deep learning neural networks with mixed precision on supported hardware. We will first see how to use float16 (both with Gluon and Symbolic APIs) and then some techniques on achieving good performance and accuracy. ## Background -The computational resources required for training deep neural networks have been increasing of late because of complexity of the architectures and size of models. Mixed precision training allows us to reduces the resources required by using lower precision arithmetic. In this approach you can train using 16 bit floating points (half precision) while using 32 bit floating points (single precision) for output buffers of float16 computation. This combination of single and half precision gives rise to the name mixed precision.
It allows us to achieve the same accuracy as training with single precision, while decreasing the required memory and training or inference time. -The float16 data type is a 16 bit floating point representation according to the IEEE 754 standard. It has a dynamic range where the precision can go from 0.0000000596046 (highest, for values closest to 0) to 32 (lowest, for values in the range 32768-65536). Despite the inherent reduced precision when compared to single precision float (float32), using float16 has many advantages. The most obvious advantages are that you can reduce the size of the model by half allowing the training of larger models and using larger batch sizes. The reduced memory footprint also helps in reducing the pressure on memory bandwidth and lowering communication costs. On hardware with specialized support for float16 computation you can also greatly improve the speed of training and inference. The Volta range of Graphics Processing Units (GPUs) from Nvidia have [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensorcore/) which perform efficient float16 computation. A tensor core allows accumulation of half precision products into single or half precision outputs. For the rest of this tutorial we assume that we are working with Nvidia's Tensor Cores on a Volta GPU. +The computational resources required for training deep neural networks have lately been increasing because of growing model complexity and size. Mixed precision training allows us to reduce resource utilization by using lower precision arithmetic, which is computationally cheaper and requires less memory. In this approach you can train using 16 bit floating point (half precision) while using 32 bit floating point (single precision) for output buffers of float16 computation. This allows one to achieve the same accuracy as training with single precision, while decreasing the required memory and training or inference time. + +The float16 data type is a 16 bit floating point representation according to the [IEEE 754 standard](https://ieeexplore.ieee.org/document/4610935). It has a dynamic range where the precision can go from 0.0000000596046 (highest, for values closest to 0) to 32 (lowest, for values in the range 32768-65536). Despite the inherent reduced precision when compared to single precision float (float32), using float16 has many advantages. The most obvious advantages are that you can reduce the size of the model by half allowing the training of larger models and using larger batch sizes. The reduced memory footprint also helps in reducing the pressure on memory bandwidth and lowering communication costs. On hardware with specialized support for float16 computation you can also greatly improve the speed of training and inference. The Volta range of Graphics Processing Units (GPUs) from Nvidia have [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensorcore/) which perform efficient float16 computation. A tensor core allows accumulation of half precision products into single or half precision outputs. For the rest of this tutorial we assume that we are working with Nvidia's Tensor Cores on a Volta GPU. ## Prerequisites -- Volta range of Nvidia GPUs -- Cuda 9 or higher -- CUDNN v7 or higher -This tutorial also assumes that you understand how to train a network with float32. Please refer to other tutorials [here](http://mxnet.incubator.apache.org/tutorials/index.html) to get started with MXNet and/or Gluon.
This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision. +- [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) range of Nvidia GPUs (e.g. AWS P3 instance) +- CUDA 9 or higher +- cuDNN v7 or higher + +This tutorial also assumes understanding of how to train a network with float32 (the default). Please refer to the [logistic regression tutorial](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/logistic_regression_explained.html) to get started with Apache MXNet and the Gluon API. This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision. ## Using the Gluon API ### Training or Inference -With Gluon, you need to take care of three things to convert a model to support float16. +With the Gluon API, you need to take care of three things to convert a model to support computation with float16. -1. Cast the Gluon Block, so as to cast the parameters of layers and change the type of input expected, to float16. This is as simple as calling the [cast](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.cast) method of the Block representing the network. -``` +1. Cast the Gluon `Block`'s parameters and expected input type to float16 by calling the [cast](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.cast) method of the `Block` representing the network. + +```python net = net.cast('float16') ``` -2. Ensure the data input to the network is of float16 type. If your DataLoader or Iterator produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the [`astype`](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.astype) method of ndarrays. -``` +2. Ensure the data input to the network is of float16 type. If your `DataLoader` or `Iterator` produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the [astype](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.astype) method of NDArrays. + +```python data = data.astype('float16', copy=False) ``` -If you are using images and DataLoader, you can also use a [Cast transform](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.vision.transforms.Cast) +If you are using images and DataLoader, you can also use a [Cast transform](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.vision.transforms.Cast). -3. It is preferable to use **multi_precision mode of optimizer** when training in float16. This mode of optimizer maintains a master copy of weights in float32 even when the training (i.e. forward and backward pass) is in float16. This helps increase precision of the weight updates and can lead to faster convergence for some networks. (Further discussion on this towards the end.) +3. It is preferable to use the **multi_precision** mode of the optimizer when training in float16. This mode maintains a master copy of the weights in float32 even when the training (i.e. forward and backward pass) is in float16. This helps increase precision of the weight updates and can lead to faster convergence in some scenarios.
```python optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01) ``` -You can play around with mixed precision using the image classification example [here](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py). We suggest using the Caltech101 dataset option in that example and using a Resnet50_v1 network so you can quickly see the performance improvement and how the accuracy is unaffected. Here's a starter command to run this. +You can play around with mixed precision using the image classification [example](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py). We suggest using the Caltech101 dataset option in that example and using a ResNet50V1 network so you can quickly see the performance improvement and how the accuracy is unaffected. Here's the starter command to run this example. -``` +```bash python image_classification.py --model resnet50_v1 --dataset caltech101 --gpus 0 --num-worker 30 --dtype float16 ``` - ### Fine-tuning -You can also fine-tune in float16, a model which was originally trained in float32. Here is how you would do it. As an example if you are trying to use a model pretrained on the Imagenet dataset from the ModelZoo, you would first fetch the pretrained network and then cast that network to float16. +You can also fine-tune a model, which was originally trained in float32, to use float16. Below is an example of how to fine-tune a pretrained model from the Model Zoo. You would first need to fetch the pretrained network and then cast that network to float16. -``` -pretrained_net = models.get_model(name='resnet50_v2', ctx=ctx, pretrained=True, classes=1000) +```python +import numpy as np +import mxnet as mx +from mxnet.gluon.model_zoo.vision import get_model + + +pretrained_net = get_model(name='resnet50_v2', ctx=mx.cpu(), + pretrained=True, classes=1000) pretrained_net.cast('float16') ``` -Then if you have another Resnet50_v2 model you want to fine-tune, you can just assign the features to that network and then cast it. -``` -net = models.get_model(name='resnet50_v2', ctx=ctx, pretrained=False, classes=101) -net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) +Then, if you have another Resnet50V2 model you want to fine-tune, you can just assign the features to that network and then cast it. + +```python +net = get_model(name='resnet50_v2', ctx=mx.cpu(), + pretrained=False, classes=101) +net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=mx.cpu()) net.features = pretrained_net.features -net.cast(dtype) +net.cast('float16') +``` + +You can check the parameters of the model by calling [summary](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.summary) with some fake data. Notice the provided `dtype=np.float16` in the line below. As mentioned earlier, we have to provide the data as float16 as well. + +```python +net.summary(mx.nd.uniform(shape=(1, 3, 224, 224), dtype=np.float16)) ``` ## Using the Symbolic API Training a network in float16 with the Symbolic API involves the following steps. + 1. Add a layer at the beginning of the network, to cast the data to float16. This will ensure that all the following layers compute in float16. 2. It is advisable to cast the output of the layers before softmax to float32, so that the softmax computation is done in float32. This is because softmax involves large reductions and it helps to keep that in float32 for more precise answer. -3.
It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. This is discussed in some detail below. Here's how you would enable this mode when creating an optimizer. +3. It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. Here's how you would enable this mode when creating an optimizer. ```python optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01) ``` -There are a few examples of building such networks which can handle float16 input in [examples/image-classification/symbols/](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/symbols). Specifically you could look at the [resnet](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/symbols/resnet.py) example. +For a full example, please refer to the [resnet.py](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/symbols/resnet.py) file on GitHub. A small, relevant excerpt from that file is presented below. -An illustration of the relevant section of the code is below. -``` +```python data = mx.sym.Variable(name="data") + if dtype == 'float16': data = mx.sym.Cast(data=data, dtype=np.float16) -// the rest of the network +# ... the rest of the network net_out = net(data) if dtype == 'float16': net_out = mx.sym.Cast(data=net_out, dtype=np.float32) + output = mx.sym.SoftmaxOutput(data=net_out, name='softmax') ``` -We have an example script which show how to train imagenet with resnet50 using float16 [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/train_imagenet.py) +If you would like to train a ResNet50 model on ImageNet using float16 precision, you can find the full script [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/train_imagenet.py) -Here's how you can use the above script to train Resnet50 v1 model with synthetic data using float16, so you can try it out even if you don't have the Imagenet dataset handy. -``` +If you don't have the ImageNet dataset at your disposal, you can still run the script above using synthetic float16 data by providing the following command: + +```bash python train_imagenet.py --network resnet-v1 --num-layers 50 --benchmark 1 --gpus 0 --batch-size 256 --dtype float16 ``` -There's a similar example for fine tuning [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/fine-tune.py). The following command shows how to use that script to fine tune a Resnet50 model trained on Imagenet for the Caltech 256 dataset using float16. +There's a similar example [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/fine-tune.py) for float16 fine-tuning of selected models: Inception v3, Inception v4, ResNetV1, ResNet50, ResNext, or VGG. The command below shows how to use that script to fine-tune a Resnet50 model trained on Imagenet for the Caltech 256 dataset using float16.
+ +```bash +python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet1k-resnet-50 --data-train ~/.mxnet/dataset/caltech-256/caltech256-train.rec --data-val ~/data/caltech-256/caltech256-val.rec --num-examples 15420 --num-classes 256 --gpus 0 --batch-size 64 --dtype float16 ``` -python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet1k-resnet-50 --data-train ~/data/caltech-256/caltech256-train.rec --data-val ~/data/caltech-256/caltech256-val.rec --num-examples 15420 --num-classes 256 --gpus 0 --batch-size 64 --dtype float16 + +If you don't have the `Caltech256` dataset, you can download it using the script below, and convert it into the .rec file format using the [im2rec utility](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py). + +```python +import os +from os.path import expanduser +import tarfile +import mxnet as mx + + +data_folder = expanduser("~/.mxnet/datasets/") +dataset_name = "256_ObjectCategories" +archive_file = "{}.tar".format(dataset_name) +archive_path = os.path.join(data_folder, archive_file) +data_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/" + +if not os.path.isfile(archive_path): + mx.test_utils.download("{}{}".format(data_url, archive_file), + dirname=data_folder) + print('Extracting {} in {}...'.format(archive_file, data_folder)) + tar = tarfile.open(archive_path) + tar.extractall(data_folder) + tar.close() + print('Data extracted.') ``` ## Example training results -Let us consider training a Resnet50 v1 model on the Imagenet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on a AWS p3.16x large instance. Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for optimizer. The final accuracy at 90th epoch and the time to train are tabulated below for these three scenarios. The top-1 validation errors at the end of each epoch are also plotted below. + +Let us consider training a Resnet50V1 model on the ImageNet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on an [AWS p3.16xlarge](https://aws.amazon.com/ec2/instance-types/p3/#Amazon_EC2_P3_Instance_Product_Details) instance. + +Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for the optimizer. The final accuracy at the 90th epoch and the time to train are tabulated below for these three scenarios.
The top-1 validation errors at the end of each epoch are also plotted below. Batch size | Data type | Top 1 Validation accuracy | Time to train | Speedup | --- | --- | --- | --- | --- | @@ -127,65 +176,73 @@ Batch size | Data type | Top 1 Validation accuracy | Time to train | Speedup | 1024 | float16 | 76.34% | 7.3 hrs | 1.62x | 2048 | float16 | 76.29% | 6.5 hrs | 1.82x | -![Training curves of Resnet50 v1 on Imagenet 2012](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png) +![Training curves of Resnet50V1 on Imagenet 2012](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png) -The differences in accuracies above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates training behaves similarly for these cases, even though we didn't have to change any other hyperparameters. We can also see from the table that using float16 helps train faster through faster computation with float16 as well as allowing the use of larger batch sizes. +The differences in accuracies above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates, training behaves similarly for these cases, even though we didn't have to change any other hyperparameters. We can also see from the table that using float16 helps train faster through faster computation with float16 as well as allowing the use of larger batch sizes. ## Things to keep in mind ### For performance -Typical performance gains seen for float16 typically range 1.6x-2x for convolutional networks like Resnet and even about 3x for networks with LSTMs. The performance gain you see can depend on certain things which this section will introduce you to. +Performance gains seen for float16 typically range 1.6x-2x for convolutional networks like Resnet and even about 3x for networks with LSTMs. The performance gain you see can depend on certain things which this section will introduce. -1. Nvidia Tensor Cores essentially perform the computation D = A * B + C, where A and B are half precision matrices, while C and D could be either half precision or full precision. The tensor cores are most efficient when dimensions of these matrices are multiples of 8. This means that Tensor Cores can not be used in all cases for fast float16 computation. When training models like Resnet50 on the Cifar10 dataset, the tensors involved are sometimes smaller, and Tensor Cores can not always be used. The computation in that case falls back to slower algorithms and using float16 turns out to be slower than float32 on a single GPU. Note that when using multiple GPUs, using float16 can still be faster than float32 because of reduction in communication costs. +1. Nvidia Tensor Cores essentially perform the computation `D = A * B + C`, where A and B are half precision matrices, while C and D could be either half precision or full precision. The tensor cores are most efficient when dimensions of these matrices are multiples of 8. This means that Tensor Cores can not be used in all cases for fast float16 computation. When training models like Resnet50 on the Cifar10 dataset, the tensors involved are sometimes smaller, and Tensor Cores can not always be used.
The computation in that case falls back to slower algorithms and using float16 turns out to be slower than float32 on a single GPU. Note that when using multiple GPUs, using float16 can still be faster than float32 because of reduction in communication costs. 2. When you scale up the batch size ensure that IO and data pre-processing is not your bottleneck. If you see a slowdown this would be the first thing to check. 3. It is advisable to use batch sizes that are multiples of 8 because of the above reason when training with float16. As always, batch sizes which are powers of 2 would be best when compared to those around it. -4. You can check whether your program is using Tensor cores for fast float16 computation by profiling with `nvprof`. -The operations with `s884cudnn` in their names represent the use of Tensor cores. +4. You can check whether your program is using Tensor cores for fast float16 computation by profiling with `nvprof`. The operations with `s884cudnn` in their names represent the use of Tensor cores. -5. When not limited by GPU memory, it can help to set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 2. This configures MXNet to run tuning tests and choose the fastest convolution algorithm whose memory requirements may exceed the default memory of CUDA workspace. +5. When not limited by GPU memory, it can help to set the environment variable `MXNET_CUDNN_AUTOTUNE_DEFAULT` to `2`. This configures MXNet to run tuning tests and choose the fastest convolution algorithm whose memory requirements may exceed the default memory of CUDA workspace. 6. Please note that float16 on CPU might not be supported for all operators, as in most cases float16 on CPU is much slower than float32. - ### For accuracy #### Multi precision mode + When training in float16, it is advisable to still store the master copy of the weights in float32 for better accuracy. The higher precision of float32 helps overcome cases where gradient update can become 0 if represented in float16. This mode can be activated by setting the parameter `multi_precision` of optimizer params to `True` as in the above example. It has been found that this is not required for all networks to achieve the same accuracy as with float32, but nevertheless recommended. Note that for distributed training, this is currently slightly slower than without `multi_precision`, but still much faster than using float32 for training. -#### Large reductions -Since float16 has low precision for large numbers, it is best to leave layers which perform large reductions in float32. This includes BatchNorm and Softmax. Ensuring that Batchnorm performs reduction in float32 is handled by default in both Gluon and Module APIs. While Softmax is set to use float32 even during float16 training in Gluon, in the Module API there needs to be a cast to float32 before softmax as the above symbolic example code shows. +#### Large reductions + +Since float16 has low precision for large numbers, it is best to leave layers which perform large reductions in float32. This includes BatchNorm and Softmax. Ensuring that Batchnorm performs reduction in float32 is handled by default in both Gluon and Module APIs. While Softmax is set to use float32 even during float16 training in Gluon, in the Module API there needs to be a cast to float32 before softmax as the above symbolic example code shows. #### Loss scaling -For some networks just switching the training to float16 mode was not found to be enough to reach the same accuracy as when training with float32.
This is because the activation gradients computed are too small and could not be represented in float16 representable range. Such networks can be made to achieve the accuracy reached by float32 with a couple of changes. + +For some networks just switching the training to float16 mode was not found to be enough to reach the same accuracy as when training with float32. This is because the activation gradients computed are too small and could not be represented in the float16 representable range. Such networks can be made to achieve the accuracy reached by float32 with a couple of changes. Most of the float16 representable range is not used by activation gradients generally. So you can shift the gradients into float16 range by scaling up the loss by a factor `S`. By the chain rule, this scales up the loss before backward pass, and then you can scale back the gradients before updating the weights. This ensures that training in float16 can use the same hyperparameters as used during float32 training. Here's how you can configure the loss to be scaled up by 128 and rescale the gradient down before updating the weights. -*Gluon* -``` +*Gluon API* + +```python loss = gluon.loss.SoftmaxCrossEntropyLoss(weight=128) -optimizer = mx.optimizer.create('sgd', multi_precision=True, rescale_grad=1.0/128) -``` -*Module* ``` + +optimizer = mx.optimizer.create('sgd', + multi_precision=True, + rescale_grad=1.0/128) ``` + +*Module API* + +```python mxnet.sym.SoftmaxOutput(other_args, grad_scale=128.0) -optimizer = mx.optimizer.create('sgd', multi_precision=True, rescale_grad=1.0/128) +optimizer = mx.optimizer.create('sgd', + multi_precision=True, + rescale_grad=1.0/128) ``` Networks like Multibox SSD, R-CNN, bigLSTM and Seq2seq were found to exhibit such behavior. -You can choose a constant scaling factor while ensuring that the absolute value of gradient when multiplied by this factor remains in the range of float16. Generally powers of 2 like 64,128,256,512 are chosen. Refer the linked articles below for more details on this. - -## Video Tutorial - -We also have a video tutorial for using Mixed Precision with MXNet. You can check that out [here](https://www.youtube.com/watch?v=pR4KMh1lGC0) +You can choose a constant scaling factor while ensuring that the absolute value of the gradient, when multiplied by this factor, remains in the range of float16. Generally powers of 2 like 64, 128, 256, 512 are chosen. Refer to the linked articles below for more details on this. ## References + 1. [Training with Mixed Precision User Guide](http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) 2. [Mixed Precision Training at ICLR 2018](https://arxiv.org/pdf/1710.03740.pdf) 3. [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) +## Recommended Next Steps + +* Check out our video tutorial on [Using Mixed Precision with MXNet](https://www.youtube.com/watch?v=pR4KMh1lGC0) \ No newline at end of file diff --git a/docs/faq/index.md b/docs/faq/index.md index 92f1ccde00f4..6c398d46471a 100644 --- a/docs/faq/index.md +++ b/docs/faq/index.md @@ -100,4 +100,4 @@ If you need help with using MXNet, have questions about applying it to a particu We track bugs and new feature requests in the MXNet Github repo in the issues folder: [mxnet/issues](https://github.com/apache/incubator-mxnet/issues). ## Roadmap -MXNet is evolving fast.
To see what's next and what we are working on internally, go to the [MXNet Roadmap](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Roadmap). +MXNet is evolving fast. To see what's next and what we are working on internally, go to the [MXNet Roadmap](https://cwiki.apache.org/confluence/display/MXNET/Roadmap). diff --git a/docs/faq/new_op.md b/docs/faq/new_op.md index 4c10708b944d..2395379bafc1 100644 --- a/docs/faq/new_op.md +++ b/docs/faq/new_op.md @@ -292,6 +292,28 @@ output or nothing to calculating gradient. For more complicated patterns, use `MakeGradNode(op_name, n, heads, dict)` to create gradient entries, where heads are input entries to the backward op, composed from ograds and n->inputs. +When assembling a return vector of `std::vector<nnvm::NodeEntry> ret;`, a common pattern is to +either create nodes in place, as in: + +``` +ret.emplace_back(MakeNode("zeros_like", n->attrs.name + "_xyz_backward", + {n->inputs[1]}, nullptr, &n)); +``` + +Or create the node, modify it, and then move it into NodeEntry's constructor if the node is not to be used +again. This avoids unnecessary copies of the shared_ptr. + +``` +for (size_t i = 0; i < n->inputs.size(); ++i) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = copy_op; + node->inputs = {ograds[0]}; + ret.emplace_back(std::move(node)); +} +``` + +The first case uses RVO and the second uses in-place construction. + #### FCompute\<xpu\> Simple operators can register FCompute\<xpu\> with `.set_attr<FCompute>("FCompute<cpu>", ...)` and `.set_attr<FCompute>("FCompute<gpu>", ...)` for both CPU and (optionally) GPU computation. diff --git a/docs/faq/perf.md b/docs/faq/perf.md index e1318b843a03..62b40247081c 100644 --- a/docs/faq/perf.md +++ b/docs/faq/perf.md @@ -34,8 +34,13 @@ Performance is mainly affected by the following 4 factors: ## Intel CPU -For using Intel Xeon CPUs for training and inference, we suggest enabling -`USE_MKLDNN = 1` in `config.mk`. +When using Intel Xeon CPUs for training and inference, the `mxnet-mkl` package is recommended. Adding `--pre` installs a nightly build from master. Without it you will install the latest patched release of MXNet: + +``` +$ pip install mxnet-mkl [--pre] +``` + +Or build MXNet from source code with `USE_MKLDNN=1`. For Linux users, `USE_MKLDNN=1` will be turned on by default. We also find that setting the following environment variables can help: diff --git a/docs/install/build_from_source.md b/docs/install/build_from_source.md index 7b00b03abefe..dacac09c3d11 100644 --- a/docs/install/build_from_source.md +++ b/docs/install/build_from_source.md @@ -42,14 +42,14 @@ Building from source follows this general two-step flow of building the shared l * [non-Intel CPUs](#recommended-for-Systems-with-non-Intel-CPUs) 2. [Install the language API binding(s)](#installing-mxnet-language-bindings) you would like to use for MXNet. MXNet's newest and most popular API is Gluon. Gluon is built into the Python binding. If Python isn't your preference, you still have more options.
MXNet supports several other language APIs: - - [Python (includes Gluon)](../api/python/index.html) - - [C++](../api/c++/index.html) - - [Clojure](../api/clojure/index.html) - - [Java](../api/java/index.html) - - [Julia](../api/julia/index.html) - - [Perl](../api/perl/index.html) - - [R](../api/r/index.html) - - [Scala](../api/scala/index.html) + - [Python (includes Gluon)](../api/python/index.md) + - [C++](../api/c++/index.md) + - [Clojure](../api/clojure/index.md) + - [Java](../api/java/index.md) + - [Julia](../api/julia/index.md) + - [Perl](../api/perl/index.md) + - [R](../api/r/index.md) + - [Scala](../api/scala/index.md)
@@ -58,12 +58,11 @@ MXNet's newest and most popular API is Gluon. Gluon is built into the Python bin Detailed instructions are provided per operating system. Each of these guides also covers how to install the specific [Language Bindings](#installing-mxnet-language-bindings) you require. You may jump to those, but it is recommended that you continue reading to understand more general "build from source" options. -* [Amazon Linux / CentOS / RHEL](centos_setup.html) -* [macOS](osx_setup.html) -* [Raspbian](raspian_setup.html) -* [TX2](tx2_setup.html) -* [Ubuntu](ubuntu_setup.html) -* [Windows](windows_setup.html) +* [Amazon Linux / CentOS / RHEL](centos_setup.md) +* [macOS](osx_setup.md) +* [Devices](https://mxnet.incubator.apache.org/versions/master/install/index.html?platform=Devices&language=Python&processor=CPU) +* [Ubuntu](ubuntu_setup.md) +* [Windows](windows_setup.md)
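Once the shared library and the Python binding have been built and installed following one of the guides above, a short smoke test confirms that the library loads and can run an operator end to end; a minimal sketch, assuming the Python binding is importable:

```python
import mxnet as mx

# A tiny end-to-end check: build an array, run a couple of operators,
# and copy the result back to NumPy.
a = mx.nd.ones((2, 3))
b = (a * 2 + 1).asnumpy()

print(mx.__version__)
print(b)  # expect a 2x3 array filled with 3.0
```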
@@ -231,7 +230,7 @@ For example, you can specify using all cores on Linux as follows: ```bash mkdir build && cd build -cmake -GNinja . +cmake -GNinja .. ninja -v ``` @@ -241,7 +240,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -GNinja . +cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -GNinja .. ninja -v ``` @@ -250,7 +249,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DBLAS=open -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -GNinja . +cmake -DBLAS=open -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -GNinja .. ninja -v ``` @@ -259,7 +258,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_CUDA=0 -DUSE_MKLDNN=1 -GNinja . +cmake -DUSE_CUDA=0 -DUSE_MKLDNN=1 -GNinja .. ninja -v ``` @@ -268,7 +267,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_CUDA=0 -DBLAS=open -GNinja . +cmake -DUSE_CUDA=0 -DBLAS=open -GNinja .. ninja -v ``` @@ -278,7 +277,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_OPENCV=0 -GNinja . +cmake -DUSE_OPENCV=0 -GNinja .. ninja -v ``` @@ -286,7 +285,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DBLAS=apple -DUSE_OPENCV=0 -DUSE_OPENMP=0 -GNinja . +cmake -DBLAS=apple -DUSE_OPENCV=0 -DUSE_OPENMP=0 -GNinja .. ninja -v ``` @@ -295,7 +294,7 @@ ninja -v ```bash brew install llvm mkdir build && cd build -cmake -DBLAS=apple -DUSE_OPENMP=1 -GNinja . +cmake -DBLAS=apple -DUSE_OPENMP=1 -GNinja .. ninja -v ``` diff --git a/docs/install/c_plus_plus.md b/docs/install/c_plus_plus.md index ee21014bc5f1..13c1a87cbd5f 100644 --- a/docs/install/c_plus_plus.md +++ b/docs/install/c_plus_plus.md @@ -18,12 +18,12 @@ ## Build the C++ package The C++ package has the same prerequisites as the MXNet library. -To enable C++ package, just add `USE_CPP_PACKAGE=1` in the [build from source](build_from_source.html) options when building the MXNet shared library. +To enable C++ package, just add `USE_CPP_PACKAGE=1` in the [build from source](build_from_source.md) options when building the MXNet shared library. For example to build MXNet with GPU support and the C++ package, OpenCV, and OpenBLAS, from the project root you would run: ```bash -cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -DUSE_CPP_PACKAGE=1 -GNinja . +cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -DUSE_CPP_PACKAGE=1 -GNinja .. ninja -v ``` @@ -40,7 +40,7 @@ You can find C++ code examples in the `cpp-package/example` folder of the MXNet ## Tutorials -* [MXNet C++ API Basics](https://mxnet.incubator.apache.org/tutorials/c++/basics.html) +* [MXNet C++ API Basics](../tutorials/c++/basics.md) ## Related Topics diff --git a/docs/install/download.md b/docs/install/download.md index cf95c2344f14..808b4b8a72e5 100644 --- a/docs/install/download.md +++ b/docs/install/download.md @@ -21,6 +21,7 @@ These source archives are generated from tagged releases. 
Updates and patches wi | Version | Source | PGP | SHA | |---------|-------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| 1.4.1 | [Download](https://www.apache.org/dyn/closer.cgi/incubator/mxnet/1.4.1/apache-mxnet-src-1.4.1-incubating.tar.gz) | [Download](https://apache.org/dist/incubator/mxnet/1.4.1/apache-mxnet-src-1.4.1-incubating.tar.gz.asc) | [Download](https://apache.org/dist/incubator/mxnet/1.4.1/apache-mxnet-src-1.4.1-incubating.tar.gz.sha512) | | 1.4.0 | [Download](https://www.apache.org/dyn/closer.cgi/incubator/mxnet/1.4.0/apache-mxnet-src-1.4.0-incubating.tar.gz) | [Download](https://apache.org/dist/incubator/mxnet/1.4.0/apache-mxnet-src-1.4.0-incubating.tar.gz.asc) | [Download](https://apache.org/dist/incubator/mxnet/1.4.0/apache-mxnet-src-1.4.0-incubating.tar.gz.sha512) | | 1.3.1 | [Download](https://www.apache.org/dyn/closer.cgi/incubator/mxnet/1.3.1/apache-mxnet-src-1.3.1-incubating.tar.gz) | [Download](https://apache.org/dist/incubator/mxnet/1.3.1/apache-mxnet-src-1.3.1-incubating.tar.gz.asc) | [Download](https://apache.org/dist/incubator/mxnet/1.3.1/apache-mxnet-src-1.3.1-incubating.tar.gz.sha512) | | 1.3.0 | [Download](https://archive.apache.org/dist/incubator/mxnet/1.3.0/apache-mxnet-src-1.3.0-incubating.tar.gz) | [Download](https://archive.apache.org/dist/incubator/mxnet/1.3.0/apache-mxnet-src-1.3.0-incubating.tar.gz.asc) | [Download](https://archive.apache.org/dist/incubator/mxnet/1.3.0/apache-mxnet-src-1.3.0-incubating.tar.gz.sha512) | diff --git a/docs/install/index.md b/docs/install/index.md index 10db8d95b44a..5fef5ca47e57 100644 --- a/docs/install/index.md +++ b/docs/install/index.md @@ -28,9 +28,7 @@ download.md java_setup.md osx_setup.md - raspbian_setup.md scala_setup.md - tx2_setup.md ubuntu_setup.md validate_mxnet.md windows_setup.md @@ -39,10 +37,10 @@ Indicate your preferred configuration. Then, follow the customized commands to install MXNet.
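As a closing note on the release archives listed above: every row of the download table publishes a SHA-512 checksum file next to the source tarball. Below is a minimal sketch of checking a downloaded archive against it; the file names are taken from the 1.4.1 row, and the exact layout of the published .sha512 file is an assumption here, so the comparison is kept deliberately loose:

```python
import hashlib

# Assumes the tarball and its .sha512 companion (from the 1.4.1 row of
# the table above) were already downloaded into the working directory.
archive = "apache-mxnet-src-1.4.1-incubating.tar.gz"

digest = hashlib.sha512()
with open(archive, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

# Checksum files come in several layouts ("<hash>  <name>", gpg --print-md
# output, ...), so normalize case and whitespace before comparing.
published = "".join(open(archive + ".sha512").read().lower().split())
print("OK" if digest.hexdigest() in published else "MISMATCH")
```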