From 3fb13ffe9fc57a74a0692bbbb67ab66ea8508d50 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 17 Nov 2017 16:10:58 -0800 Subject: [PATCH 1/5] Refreshed branch bc_tune --- CMakeLists.txt | 22 +- Makefile | 4 + make/config.mk | 6 + src/operator/mshadow_op.h | 94 ++- src/operator/mxnet_op.h | 119 ++- src/operator/operator_tune-inl.h | 758 ++++++++++++++++++ src/operator/operator_tune.cc | 349 ++++++++ src/operator/operator_tune.h | 331 ++++++++ .../tensor/elemwise_binary_broadcast_op.h | 137 +++- src/operator/tensor/init_op.h | 1 + tests/cpp/include/test_core_op.h | 12 +- tests/cpp/include/test_op_runner.h | 36 +- tests/cpp/include/test_tune.h | 333 ++++++++ tests/cpp/include/test_util.h | 21 +- tests/cpp/operator/broadcast_perf.cc | 114 ++- tests/cpp/operator/tune/operator_tune_test.cc | 173 ++++ tests/cpp/test_main.cc | 9 +- 17 files changed, 2425 insertions(+), 94 deletions(-) create mode 100644 src/operator/operator_tune-inl.h create mode 100644 src/operator/operator_tune.cc create mode 100644 src/operator/operator_tune.h create mode 100644 tests/cpp/include/test_tune.h create mode 100644 tests/cpp/operator/tune/operator_tune_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index af681d00aac2..f1eb2ad595f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) +mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON) mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) mxnet_option(USE_PROFILER "Build with Profiler support" OFF) @@ -143,6 +144,8 @@ if(USE_MKL_IF_AVAILABLE) if(NOT MSVC) list(APPEND mxnet_LINKER_LIBS dl) endif() + # If using MKL, use the Intel OMP libraries + list(APPEND mxnet_LINKER_LIBS iomp5) if(USE_MKL_EXPERIMENTAL) add_definitions(-DMKL_EXPERIMENTAL=1) else() @@ -260,11 +263,20 @@ endif() # ---[ OpenMP if(USE_OPENMP) find_package(OpenMP REQUIRED) - if(OPENMP_FOUND) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/openmp/CMakeLists.txt) + # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp + set(OPENMP_STANDALONE_BUILD TRUE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/openmp) + list(APPEND mxnet_LINKER_LIBS omp) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + else() + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + endif() endif() elseif(UNIX) list(APPEND mxnet_LINKER_LIBS pthread) @@ -353,6 +365,10 @@ if(USE_PLUGINS_WARPCTC) list(APPEND CUDA ${PLUGINS_CUSRC}) endif() +if(USE_OPERATOR_TUNING) + add_definitions(-DMXNET_USE_OPERATOR_TUNING=1) +endif() + if(USE_PLUGIN_CAFFE) if(NOT USE_CUDA) set(CPU_ONLY ON) diff --git a/Makefile b/Makefile index 8c7ae6e6fdca..8659482f2634 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,10 @@ ifeq ($(USE_MKL2017), 1) LDFLAGS += -liomp5 endif +ifeq ($(USE_OPERATOR_TUNING), 1) + CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1 +endif + # verify existence of separate lapack library when using blas/openblas/atlas # switch off lapack support in case it can't be found # issue covered with this diff --git a/make/config.mk b/make/config.mk index a4774f0da815..eeda36b365e5 100644 --- a/make/config.mk +++ b/make/config.mk @@ -153,6 +153,12 @@ LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server # sudo apt-get install -y libcurl4-openssl-dev USE_S3 = 0 +#---------------------------- +# performance settings +#---------------------------- +# Use operator tuning +USE_OPERATOR_TUNING = 1 + # Use gperftools if found USE_GPERFTOOLS = 1 diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index a34c11791a22..10be627ee73e 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -30,6 +30,7 @@ #include "math.h" #include "math_functions-inl.h" #include "special_functions-inl.h" +#include "./mxnet_op.h" #ifdef __CUDACC__ #include @@ -39,6 +40,24 @@ namespace mxnet { namespace op { namespace mshadow_op { +/*! + * \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD' macro outside of the mshadow_op namespace + * See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD' + * + * \note An entry for the operator must also be added in operator_tune.cc, which will register it + * for auto-tuning and also hold its workload weight + */ +#define MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(__op$) \ + } MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*) +/*! + * \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD' macro outside of the mshadow_op namespace + * See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD' + * + * \note An entry for the operator must also be added in operator_tune.cc, which will register it + * for auto-tuning and also hold its workload weight + */ +#define MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(__op$) \ + } MXNET_TUNABLE_MSHADOW_OP_BACKWARD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*) #ifdef __CUDA_ARCH__ __constant__ const float PI = 3.14159265358979323846; #else @@ -49,36 +68,41 @@ using std::enable_if; using std::is_unsigned; #define MXNET_UNARY_MATH_OP(name, expr) \ -struct name { \ - template \ - MSHADOW_XINLINE static DType Map(DType a) { \ - return DType(expr); \ - } \ -} + struct name { \ + template \ + MSHADOW_XINLINE static DType Map(DType a) { \ + return DType(expr); \ + } \ + }; \ + MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name) + #define MXNET_UNARY_MATH_OP_NC(name, expr) \ -struct name { \ - template \ - MSHADOW_XINLINE static DType Map(DType a) { \ - return (expr); \ - } \ -} + struct name { \ + template \ + MSHADOW_XINLINE static DType Map(DType a) { \ + return (expr); \ + } \ + }; \ + MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name) #define MXNET_BINARY_MATH_OP(name, expr) \ -struct name { \ - template \ - MSHADOW_XINLINE static DType Map(DType a, DType b) { \ - return DType(expr); \ - } \ -} + struct name { \ + template \ + MSHADOW_XINLINE static DType Map(DType a, DType b) { \ + return DType(expr); \ + } \ + }; \ + MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name) #define MXNET_BINARY_MATH_OP_NC(name, expr) \ -struct name { \ - template \ - MSHADOW_XINLINE static DType Map(DType a, DType b) { \ - return (expr); \ - } \ -} + struct name { \ + template \ + MSHADOW_XINLINE static DType Map(DType a, DType b) { \ + return (expr); \ + } \ + }; \ + MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name) #define MXNET_SIMPLE_UNARY_MATH_OP(name) MXNET_UNARY_MATH_OP(name, math::name(a)) @@ -134,6 +158,7 @@ struct softrelu { } } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(softrelu) MXNET_UNARY_MATH_OP(softrelu_grad, -math::expm1(-a)); @@ -154,6 +179,7 @@ struct log10_grad { return DType(0.4342944819f / static_cast(a)); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log10_grad) template<> MSHADOW_XINLINE double log10_grad::Map(double a) { @@ -169,6 +195,7 @@ struct log2_grad { return DType(1.442695041f / static_cast(a)); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log2_grad) template<> MSHADOW_XINLINE double log2_grad::Map(double a) { @@ -263,6 +290,7 @@ struct sign { return DType(0); } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(sign) MXNET_UNARY_MATH_OP_NC(sign_grad, DType(0)); @@ -333,6 +361,7 @@ struct rint { return DType((af - floor) <= (ceil - af) ? floor : ceil); } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rint) /*! \brief used to round number to integer nearest to 0 */ struct fix { @@ -343,6 +372,7 @@ struct fix { return DType((floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? floor : ceil); } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(fix) /*! \brief used for generate gradient of MAE loss*/ MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? DType(1) : -DType(1)); @@ -405,6 +435,7 @@ struct mod { } } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(mod) template<> MSHADOW_XINLINE mshadow::half::half2_t mod::Map @@ -419,6 +450,8 @@ struct mod_grad { return DType(0); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_grad) + template<> MSHADOW_XINLINE double mod_grad::Map(double a, double b) { return 1.0; @@ -454,6 +487,8 @@ struct mod_rgrad { return DType(0); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_rgrad) + template<> MSHADOW_XINLINE double mod_rgrad::Map(double a, double b) { return -::floor(a/b); @@ -517,6 +552,7 @@ struct rmod { } } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rmod) template<> MSHADOW_XINLINE mshadow::half::half2_t rmod::Map @@ -531,6 +567,8 @@ struct rmod_grad { return DType(0); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(rmod_grad) + template<> MSHADOW_XINLINE double rmod_grad::Map(double a, double b) { return -::floor(b/a); @@ -572,6 +610,7 @@ struct clip { } } }; +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(clip) /***** gamma ******/ @@ -585,6 +624,7 @@ struct gamma_grad { return DType(math::tgamma(af) * special_functions::cephes::psi(af)); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gamma_grad) template<> MSHADOW_XINLINE double gamma_grad::Map(double a) { @@ -602,6 +642,7 @@ struct gammaln_grad { return DType(special_functions::cephes::psi(a)); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gammaln_grad) template<> MSHADOW_XINLINE double gammaln_grad::Map(double a) { @@ -633,6 +674,7 @@ struct smooth_l1_loss { } } }; // struct smooth_l1_loss +MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(smooth_l1_loss) /* The derivative of smooth l1 loss is * f'(x) = sigma^2 * x, |x| < 1 / sigma^2 @@ -654,6 +696,7 @@ struct smooth_l1_gradient { } } }; // struct smooth_l1_derivative +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(smooth_l1_gradient) /*! \brief product reducer */ struct product { @@ -755,6 +798,7 @@ struct nansum_grad { return isnan_typed::IsNan(a) ? DType(0) : DType(1); } }; +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nansum_grad) /*! \brief product reducer that ignores NaN values in the input */ struct nanprod { @@ -791,7 +835,7 @@ struct nanprod_grad { return isnan_typed::IsNan(a) ? DType(0) : b / a; } }; - +MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nanprod_grad) } // namespace mshadow_op } // namespace op } // namespace mxnet diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index c34d9c90834d..7a14477f5701 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -31,6 +31,7 @@ #include #include #include +#include "./operator_tune.h" #include "../engine/openmp.h" #ifdef __CUDACC__ @@ -190,8 +191,9 @@ template MSHADOW_XINLINE int dot(const Shape& coord, const Shape& stride) { int ret = 0; #pragma unroll - for (int i = 0; i < ndim; ++i) + for (int i = 0; i < ndim; ++i) { ret += coord[i] * stride[i]; + } return ret; } @@ -346,15 +348,26 @@ struct op_with_req { template struct Kernel; +/*! + * \brief CPU Kernel launcher + * \tparam OP Operator to launch + */ template struct Kernel { - /*! \brief Launch CPU kernel */ + /*! + * \brief Launch a generic CPU kernel. + * When using this for a new kernel op, add declaration and tuning objects to + * operator_tune.cc + * \tparam Args Varargs type to eventually pass to the OP::Map() functoion + * \param N Number of iterations + * \param dest Destination pointer (used to infer DType) + * \param args Varargs to eventually pass to the OP::Map() functoion + */ template inline static void Launch(mshadow::Stream *, const int N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (omp_threads < 2) { - // Zero means not to use OMP, but don't interfere with external OMP behavior for (int i = 0; i < N; ++i) { OP::Map(i, args...); } @@ -371,14 +384,55 @@ struct Kernel { #endif } + /*! + * \brief Launch CPU kernel which has OMP tuning data available. + * When using this for a new kernel op, add declaration and tuning objects to + * operator_tune.cc + * \tparam PRIMITIVE_OP The primitive operation to use for tuning + * \tparam DType Data type + * \tparam Args Varargs type to eventually pass to the OP::Map() functoion + * \param N Number of iterations + * \param dest Destination pointer (used to infer DType) + * \param args Varargs to eventually pass to the OP::Map() functoion + */ + template + static void LaunchTuned(mshadow::Stream *, const int N, Args... args) { +#ifdef _OPENMP + const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); + if (omp_threads < 2 || !tuned_op::UseOMP( + static_cast(N), static_cast(omp_threads))) { + for (int i = 0; i < N; ++i) { + OP::Map(i, args...); + } + } else { + #pragma omp parallel for num_threads(omp_threads) + for (int i = 0; i < N; ++i) { + OP::Map(i, args...); + } + } +#else + for (int i = 0; i < N; ++i) { + OP::Map(i, args...); + } +#endif + } + + /*! + * \brief Launch custom-tuned kernel where each thread is set to + * operate on a contiguous partition + * \tparam Args Varargs type to eventually pass to the OP::Map() functoion + * \param N Number of iterations + * \param args Varargs to eventually pass to the UseOMP() and OP::Map() functions + */ template inline static void LaunchEx(mshadow::Stream *s, const int N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - if (omp_threads <= 1) { + if (omp_threads < 2 || !tuned_op::UseOMP( + N, omp_threads, args...)) { OP::Map(0, N, args...); } else { - int length = (N + omp_threads - 1) / omp_threads; + const int length = (N + omp_threads - 1) / omp_threads; #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < N; i += length) { OP::Map(i, i + length > N ? N - i : length, args...); @@ -418,7 +472,7 @@ struct Kernel { } template - inline static void LaunchEx(mshadow::Stream *s, int N, Args... args) { + inline static void LaunchEx(mshadow::Stream *s, const int N, Args... args) { using namespace mshadow::cuda; int ngrid = std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); mxnet_generic_kernel_ex @@ -428,6 +482,43 @@ struct Kernel { }; #endif // __CUDACC__ +/*! + * \brief Wrap Kernel::Launch* with some special-case helpers + */ +template +struct KernelWrapper { + /*! + * \brief Launch 'mshadow_op-type' op (i.e. DType (*)( ... ) { return } + * \tparam Args Varargs type to eventually pass to the OP::Map() function + * \param s Stream object pointer (unused) + * \param N Number of iterations + * \param args Varargs to eventually pass to the OP::Map() functoion + */ + template + MSHADOW_CINLINE static void LaunchMShadowOpEx(mshadow::Stream *s, + const int N, + DType *dest, + Args... args) { + mxnet::op::mxnet_op::Kernel::template LaunchTuned< + typename OP::Operation, DType>(s, N, dest, args...); + } + + /*! + * \brief Launch 'mxnet_op-type' op (i.e. void (*)(int N, DType *out, ... ) + * \tparam Args Varargs type to eventually pass to the OP::Map() function + * \param s Stream object pointer (unused) + * \param N Number of iterations + * \param args Varargs to eventually pass to the OP::Map() functoion + */ + template + MSHADOW_CINLINE static void LaunchMXNetOpEx(mshadow::Stream *s, + const int N, + DType *dest, + Args... args) { + mxnet::op::mxnet_op::Kernel::template LaunchTuned(s, N, dest, args...); + } +}; + /*! * \brief Set to immediate scalar value kernel * \tparam val Scalar immediate @@ -450,7 +541,23 @@ struct set_to_int { */ using set_zero = set_to_int<0>; using set_one = set_to_int<1>; +_MXNET_TUNABLE_MXNET_OP_FWD(set_zero); // _ prefix denotes "already in mxnet_op namespace" +_MXNET_TUNABLE_MXNET_OP_FWD(set_one); } // namespace mxnet_op + +/*! + * \brief Tuning specializations for the simple ops in + * Basically, this overrides mxnet::op::mxnet_op::Kernel::Launch() and + * redirects to mxnet::op::mxnet_op::KernelWrapper::Launch????OpEx(), + * which eventually leads back to mxnet::op::mxnet_op::Kernel::LaunchTuned() + */ +MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow::op::identity) +MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow::op::plus) +MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow::op::minus) +MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow::op::mul) +MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow::op::div) +MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow::op::right) + } // namespace op } // namespace mxnet diff --git a/src/operator/operator_tune-inl.h b/src/operator/operator_tune-inl.h new file mode 100644 index 000000000000..d0cf7e713958 --- /dev/null +++ b/src/operator/operator_tune-inl.h @@ -0,0 +1,758 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef MXNET_OPERATOR_OPERATOR_TUNE_INL_H_ +#define MXNET_OPERATOR_OPERATOR_TUNE_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "./mxnet_op.h" +#include "./operator_tune.h" + +#if (__GNUC__ >= 4 || (__GNUC__ >= 3 && __GNUC_MINOR__ >= 4)) && !defined(__mips__) +# define HAS_CXA_DEMANGLE 1 +#else +# define HAS_CXA_DEMANGLE 0 +#endif + +#if HAS_CXA_DEMANGLE +#include +#endif + +namespace mxnet { +namespace op { + +#ifndef MXNET_NO_INLINE +#ifdef _MSC_VER +#define MXNET_NO_INLINE __declspec(noinline) +#else +#define MXNET_NO_INLINE __attribute__((noinline)) +#endif +#endif // MXNET_NO_INLINE + +#define OUTSIDE_COUNT_SHIFT 9 + +namespace tune { + +/*! + * \brief Convert TuningMode value to a string representation + * \param tm Scalar TuningMode value + * \return Character pointer to a string representing the TuningMode value + */ +inline const char *TuningModeToString(const TuningMode tm) { + switch (tm) { + case kAuto: + return "Auto"; + case kNeverOMP: + return "NeverOMP"; + case kAlwaysOMP: + return "AlwaysOMP"; + default: + CHECK(false) << "Unknown TuningMode type: " << static_cast(tm); + return ""; + } +} +} // namespace tune + +/*! + * \brief Engine to tune kernel operations + * \tparam DType Data type to be used when tuning the kernel operations + * \remarks The basic concept here is that we time how long a trivial loop takes with and without + * OMP, subtracting the non-OMP run from the OMP run, which gives us the time + * that the OMP overhead takes. Times were found to be relatively invariant with + * regard ot the number of threads/cores on a given machine. + * Secondly, supplied operators are run and timed (for each data type) in order to determine + * their individual time cost. + * + * Knowing the following items, we can determine how long the OMP and non-OMP run + * is expected to take: + * 1) OMP overhead time + * 2) Number of iterations required + * 3) Number of threads to be used if we choose the OMP method + * 4) The data type + * + * Therefore, at Kernel::Launch() time, we can estimate whether it is faster to use OMP or not + * for the given kernel operator. + * + * Results and efficiency of the tuning is tested in the gtest OMP_TUNING test suite + */ +template +class OperatorTune : public OperatorTuneByType { + public: + using Tick = OperatorTuneBase::Tick; + using duration_t = OperatorTuneBase::duration_t; + using OperatorTuneByType::tuning_mode_; + + /*! + * \brief Constructor + */ + OperatorTune() { + TuneAll(); + } + + /*! + * \brief Initialize the OperatorTune object + * \return Whether the OperatorTune object was successfully initialized + */ + static bool Initialize() { + if (!initialized_) { + initialized_ = true; + // Generate some random data for calling the operator kernels + data_set_.reserve(0x100); + std::random_device rd; + std::mt19937 gen(rd()); + if (!std::is_integral::value) { + std::uniform_real_distribution<> dis(-1, 1); + for (int n = 0; n < 0x100; ++n) { + const auto val = static_cast(dis(gen)); + // If too close to zero, try again + if (std::fabs(static_cast(val)) < 1e-5) { + --n; + continue; + } + data_set_.emplace_back(val); + } + } else { + std::uniform_int_distribution<> dis(-128, 127); + for (int n = 0; n < 0x100; ++n) { + const auto val = static_cast(dis(gen)); + // If zero, try again + if (!val) { + --n; + continue; + } + data_set_.emplace_back(val); + } + } + // Use this environment variable to generate new tuning statistics + // In order to avoid printing too many copies, only the float32 object prints + output_tuning_data_ = mshadow::DataType::kFlag == mshadow::kFloat32 + && dmlc::GetEnv("MXNET_OUTPUT_TUNING_DATA", false); + // If outputting tuning data, then also output verbose logging info + OperatorTuneBase::verbose_tuning_info_ = dmlc::GetEnv("MXNET_VERBOSE_TUNING_INFO", false); + + OperatorTuneBase::tuning_weight_scale_ = dmlc::GetEnv("MXNET_TUNING_WEIGHT_SCALE", 0.0); + + // This isn't actually supposed to be multithreaded init, but just to be sure the change is + // seen everywhere, using atomic bool. + if (!OperatorTuneBase::calculated_.load()) { + // Not especially concerned with a race condition, since this hsould + // run when only one thread is active (static init), just don't cache this variable + OperatorTuneBase::calculated_.store(true); + OperatorTuneBase::omp_overhead_ns_ = GetOMPLoopOverhead(); + std::string config = dmlc::GetEnv("MXNET_USE_OPERATOR_TUNING", std::string()); + ParseEnablerConfig(config); + } + + if (OperatorTuneBase::verbose_tuning_info_) { + LOG(INFO) << "OMP overhead: " << OperatorTuneBase::omp_overhead_ns_ << " nanoseconds"; + } + } + return true; + } + + /*! + * \brief Schedule a tuning run + * \tparam OP Operator to tune + * \param tune_func Function to call which tunes the operator + * \return true if the tune operation was scheduled + */ + template + static bool ScheduleTune(void (*tune_func)()) { +#ifdef MXNET_USE_OPERATOR_TUNING + if (tune_func) { + GetTuningList()->push_back(tune_func); + operator_names_.insert(demangle(typeid(OP).name())); + return true; + } + return false; +#else + return true; +#endif + } + + /*! + * \brief Is the template parameter type a tuned kernel? + * \tparam OP kernel operator type + * \return true if the operator/kernel is tuned + */ + template + static bool IsTuned() { + return operator_names_.find(demangle(typeid(OP).name())) != operator_names_.end(); + } + + /*!\ + * \brief Tune all registered kernel operators that haven't already been tuned + */ + static bool TuneAll() { + Initialize(); + std::list *tl = GetTuningList(); + const size_t size_save = tl->size(); // For checking if anything asynchronous is + // adding or removing items, which is forbidden + if (output_tuning_data_ && !tl->empty()) { + // Only emit this once, use the most common case, 'float32' + if (mshadow::DataType::kFlag == mshadow::kFloat32) { + std::cout << "OperatorTuneBase::duration_t " + << "OperatorTuneBase::omp_overhead_ns_ = " << OperatorTuneBase::omp_overhead_ns_ + << ";" << std::endl << std::flush; + } + } + const Tick start = std::chrono::high_resolution_clock::now(); + for (auto i : *tl) { + (*i)(); + } + if (OperatorTuneBase::verbose_tuning_info_) { + const duration_t duration = OperatorTune::GetDurationInNanoseconds(start); + LOG(INFO) << "Op Tuning for " << type_name() + << " took " << (duration / 1000000) << " ms"; + } + CHECK_EQ(size_save, tl->size()) << "Tuning list size should not have changed while tuning"; + tl->clear(); + return true; + } + + /*! + * \brief Return set of operator names that were registered to be tuned. Does not imply + * that the operator has been tuned. + * \return Set of operator/kernel names that were registered for tuning + */ + static const std::unordered_set& TunedOperatorNames() { + return operator_names_; + } + + protected: + /*! + * \brief Get the list of tuning function calls for the operators + * \return Pointer to list of tuning function calls + */ + static std::list *GetTuningList(); + + /*! + * \brief Demangle typeid::name() in order to generate source macros + * \param name C++ Mangled name + * \return Demangled name as string + */ + static inline std::string demangle(const char *name) { +#if HAS_CXA_DEMANGLE + int status = -4; // some arbitrary value to eliminate the compiler warning + std::unique_ptr res{ + abi::__cxa_demangle(name, nullptr, nullptr, &status), + &std::free + }; + return status ? name : res.get(); +#else + return name; +#endif + } + + /*! + * \brief Type name as string + * \tparam T Type + * \return std::string representing the human-readable demangled type name + */ + template static inline std::string type_name() { + return demangle(typeid(T).name()); + } + + /*! \brief Measure OMP overhead for a trivial OMP loop using all cores + * \param omp_thread_count - Number of OMP threads to use in the timing test + * \returns Duration in nanoseconds for the OMP overhead (time to initiate and close the + * OMP session) + */ + static duration_t GetOMPLoopOverhead(const size_t omp_thread_count) { + CHECK_GT(omp_thread_count, 1); // Don't try to use OMP for one thread + int wl_count = OperatorTuneBase::WORKLOAD_COUNT; + + Tick start = std::chrono::high_resolution_clock::now(); + // Use two loops in order to simulate OMP outside timing + for (size_t i = 0; i < OUTSIDE_COUNT; ++i) { + for (int x = 0; x < wl_count; ++x) { + // trivial operation + volatile_int_ += x; + } + } + const OperatorTuneBase::duration_t no_omp_duration = + OperatorTuneBase::GetDurationInNanoseconds(start); + + // Scale OMP iterations by type calculation complexity + double factor; + + // if tuning_weight_scale_ is a number that looks valid, use it as the factor + if (OperatorTuneBase::tuning_weight_scale_ > 0.01) { + factor = OperatorTuneBase::tuning_weight_scale_; + } else { + // These are empirically-determined constants found by balancing between + // a desktop (8 & 12 cpu's) and large cloud instances (32 & 64 cpu's) + switch (mshadow::DataType::kFlag) { + case mshadow::kUint8: + case mshadow::kInt8: + factor = 8.5; + break; + case mshadow::kInt32: + factor = 4.5; + break; + case mshadow::kInt64: + factor = 2; + break; + case mshadow::kFloat64: + factor = 1.25; + break; + case mshadow::kFloat32: + default: + factor = 1.0; + break; + } + } + + wl_count = static_cast(factor * OperatorTuneBase::WORKLOAD_COUNT * omp_thread_count); + start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < OUTSIDE_COUNT; ++i) { + #pragma omp parallel for num_threads(omp_thread_count) + for (int x = 0; x < wl_count; ++x) { + // trivial operation + volatile_int_ += x; + } + } + const duration_t omp_duration = OperatorTuneBase::GetDurationInNanoseconds(start) + - no_omp_duration; + return omp_duration >> OUTSIDE_COUNT_SHIFT; + } + + /*! \brief Measure OMP overhead for a trivial OMP loop using all cores + * \returns Time in nanoseconds to initialize/cleanup when excuting an OMP block + */ + static duration_t GetOMPLoopOverhead() { + // It was found empirically that OMP times was not heavily tied to number of cores, + // so take an average across all core counts + const auto max_cores = static_cast(omp_get_num_procs()) >> 1; + if (max_cores >= 2) { + std::vector core_times; + // Take care of any OMP lazy-init with a throwaway call + for (size_t omp_threads = 2; omp_threads <= max_cores; ++omp_threads) { + GetOMPLoopOverhead(omp_threads); + } + std::vector durations; + durations.reserve(max_cores - 1); + for (size_t omp_threads = 2; omp_threads <= max_cores; ++omp_threads) { + const duration_t duration = GetOMPLoopOverhead(omp_threads); + if (OperatorTuneBase::verbose_tuning_info_) { + LOG(INFO) << "OMP Thread Count: " << omp_threads << ", overhead: " << duration << " ns"; + } + durations.emplace_back(duration); + } + // return median + std::sort(durations.begin(), durations.end()); + return durations[durations.size() >> 1]; + } + return INT_MAX; // If only one core, then never use OMP (say the overhead is huge) + } + + /*! + * \brief Some string utility functions that aren't specific to tuning + */ + struct StringUtil { + /*! + * \brief Terim whitespace from beninning and end of string + * \param s String to trimp + * \return reference to the modified string. This is the same std::string object as what was + * supplied in the parameters + */ + static std::string &trim(std::string *s) { + s->erase(s->begin(), std::find_if(s->begin(), s->end(), [](int ch) { + return !std::isspace(ch); + })); + s->erase(std::find_if(s->rbegin(), s->rend(), [](int ch) { + return !std::isspace(ch); + }).base(), s->end()); + return *s; + } + + /*! + * \brief Tokenize a string into a list of tokens + * \param s String to tokenize + * \return std::list of tokens + */ + static std::list string2list(const std::string &s) { + std::list res; + std::istringstream iss(s); + std::string token; + while (std::getline(iss, token, ',')) { + trim(&token); + if (!token.empty()) { + res.push_back(token); + } + } + return std::move(res); + } + }; + + /*! + * \brief Get data type from string representation + * \warning Do not call from a performance-sensitive area + */ + static int type_from_string(const std::string& type_string) { + if (type_string == "float32") + return mshadow::kFloat32; + if (type_string == "float64") + return mshadow::kFloat64; + if (type_string == "float16") + return mshadow::kFloat16; + if (type_string == "int8") + return mshadow::kInt8; + if (type_string == "uint8") + return mshadow::kUint8; + if (type_string == "int32") + return mshadow::kInt32; + if (type_string == "int64") + return mshadow::kInt64; + return -1; // invalid + } + + /*! + * \brief Parse MXNET_ENABLE_OPERATOR_TUNING environment variable + * \param config String representation of MXNET_ENABLE_OPERATOR_TUNING environment variable + * Values: + * 0=disable all + * 1=enable all + * float32, float16, float32=list of types to enable, and disable those not listed + */ + static void ParseEnablerConfig(std::string config) { + StringUtil::trim(&config); + if (!config.empty()) { + // First disable all + OperatorTuneByType::set_tuning_mode(tune::kAlwaysOMP); + OperatorTuneByType::set_tuning_mode(tune::kAlwaysOMP); + OperatorTuneByType::set_tuning_mode(tune::kAlwaysOMP); + OperatorTuneByType::set_tuning_mode(tune::kAlwaysOMP); + OperatorTuneByType::set_tuning_mode(tune::kAlwaysOMP); + OperatorTuneByType::set_tuning_mode(tune::kAlwaysOMP); + // See if it's a non-number (ie type or list of types) + if (!::isdigit(config[0])) { + OperatorTuneByType::set_tuning_mode(tune::kAuto); + std::list tokens = StringUtil::string2list(config); + for (const std::string& stype : tokens) { + // We don't have an enum for halt_t + const int typ = type_from_string(stype); + if (typ >= 0) { + switch (typ) { + case mshadow::kFloat32: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + case mshadow::kFloat64: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + case mshadow::kFloat16: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + case mshadow::kInt8: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + case mshadow::kUint8: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + case mshadow::kInt32: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + case mshadow::kInt64: + OperatorTuneByType::set_tuning_mode(tune::kAuto); + break; + default: + CHECK(false) << "Unsupported tuning data type: " << stype; + break; + } + } else { + // -1 is error + LOG(WARNING) << "Unknown data type to be tuned: " << stype; + } + } + } else { + if (std::atoi(config.c_str()) > 0) { + OperatorTuneByType::set_tuning_mode(tune::kAuto); + OperatorTuneByType::set_tuning_mode(tune::kAuto); + OperatorTuneByType::set_tuning_mode(tune::kAuto); + OperatorTuneByType::set_tuning_mode(tune::kAuto); + OperatorTuneByType::set_tuning_mode(tune::kAuto); + OperatorTuneByType::set_tuning_mode(tune::kAuto); + OperatorTuneByType::set_tuning_mode(tune::kAuto); + } + } + } + } + + /*! \brief Whether this object has been initialized */ + static bool initialized_; + /*! \brief Number of passes to obtain an average */ + static constexpr duration_t OUTSIDE_COUNT = (1 << OUTSIDE_COUNT_SHIFT); + /*! \brief Random data for timing operator calls */ + static std::vector data_set_; + /*! \brief Operators tuned */ + static std::unordered_set operator_names_; + /*! \brief Arbitary object to modify in OMP loop */ + static volatile int volatile_int_; + /*! \brief Output insertable (into code) instantiation+default-value macros */ + static bool output_tuning_data_; +}; + +/*! + * \brief Class that tunes unary operators + * \tparam DType Data type to be used when tuning the kernel operations + */ +template +class UnaryOpTune : public OperatorTune { + protected: + typedef OperatorTune Super; + using duration_t = typename Super::duration_t; + using Tick = typename Super::Tick; + + /*! + * \brief Determine the time it takes a kernel operator to execute WORKLOAD_COUNT iterations + * Used for kernels that take no arguments (ie set_zero) + * \tparam OP Kernel operator + * \return Duration in nanoseconds for the 'WORKLOAD_COUNT' operations + */ + template + static duration_t GetBlankWorkload() { + DType tmp; + volatile DType *res = &tmp; + const Tick start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < Super::WORKLOAD_COUNT; ++i) { + // Use a logical AND instead of mod to avoid affecting the timing result with a slow divide + *res += OP::Map(); + } + const duration_t omp_duration = Super::GetDurationInNanoseconds(start); + return omp_duration ? omp_duration : 1; + } + + /*! + * \brief Determine the time it takes a kernel operator to execute WORKLOAD_COUNT iterations + * Used for kernels that take one argument (ie sqrt()) + * \tparam OP Kernel operator + * \return Duration in nanoseconds for the 'WORKLOAD_COUNT' operations + */ + template + static duration_t GetUnaryWorkload() { + DType tmp; + volatile DType *res = &tmp; + const Tick start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < Super::WORKLOAD_COUNT; ++i) { + // Use a logical AND instead of mod to avoid affecting the timing result with a slow divide + *res = OP::Map(Super::data_set_[i & 0xFF]); + } + const duration_t omp_duration = Super::GetDurationInNanoseconds(start); + return omp_duration ? omp_duration : 1; + } + + /*! + * \brief Determine the time it takes a kernel operator to execute WORKLOAD_COUNT iterations + * Used for kernels that take two arguments (ie elemwise_add()) + * \tparam OP Kernel operator + * \return Duration in nanoseconds for the 'WORKLOAD_COUNT' operations + */ + template + static inline duration_t GetBinaryWorkload() { + DType tmp; + volatile DType *res = &tmp; + const Tick start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < Super::WORKLOAD_COUNT; ++i) { + // Use a logical AND instead of mod to avoid affecting the timing result with a slow divide + *res = OP::Map(Super::data_set_[i & 0xFF], Super::data_set_[(i + 1) & 0xFF]); + } + const duration_t omp_duration = Super::GetDurationInNanoseconds(start); + return omp_duration ? omp_duration : 1; + } + + /*! + * \brief Determine the time it takes a kernel operator to execute WORKLOAD_COUNT iterations + * Used for kernels that take three arguments (ie backwards_grad()) + * \tparam OP Kernel operator + * \return Duration in nanoseconds for the 'WORKLOAD_COUNT' operations + */ + template + static duration_t GetTertiaryWorkload() { + DType tmp; + volatile DType *res = &tmp; + const Tick start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < Super::WORKLOAD_COUNT; ++i) { + // Use a logical AND instead of mod to avoid affecting the timing result with a slow divide + *res = OP::Map(Super::data_set_[i & 0xFF], + Super::data_set_[(i + 1) & 0xFF], + Super::data_set_[i & 0xFF]); + } + const duration_t omp_duration = Super::GetDurationInNanoseconds(start); + return omp_duration ? omp_duration : 1; + } + + /*! + * \brief Determine the time it takes a kernel operator to execute WORKLOAD_COUNT iterations + * Used for mxnet-like kernels that take no arguments) + * \tparam OP Kernel operator + * \return Duration in nanoseconds for the 'WORKLOAD_COUNT' operations + */ + template + static duration_t GetBlankWorkloadEx() { + std::unique_ptr tmp(new DType[Super::WORKLOAD_COUNT]); + DType *tmp_ptr = tmp.get(); + const Tick start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < Super::WORKLOAD_COUNT; ++i) { + OP::Map(i, tmp_ptr); + } + const duration_t omp_duration = Super::GetDurationInNanoseconds(start); + return omp_duration ? omp_duration : 1; + } + + public: + /*! + * \brief Tune the specified kernel operator. Optionally print out C++ macro that defines the + * tuning data variable and the default tuned value + * This function tunes an operator which takes no arguments + * \tparam OP The kernel operator to be tuned + */ + template + static void TuneBlankOperator() { + mxnet::op::mxnet_op::tuned_op::workload_ = GetBlankWorkload(); + if (Super::output_tuning_data_) { + std::cout << "IMPLEMENT_UNARY_WORKLOAD_FWD(" + << Super::template type_name() + << "); // NOLINT()" << std::endl << std::flush; // For long lines + } + } + + /*! + * \brief Tune the specified kernel operator. Optionally print out C++ macro that defines the + * tuning data variable and the default tuned value + * This function tunes an operator which takes one argument + * \tparam OP The kernel operator to be tuned + */ + template + static void TuneUnaryOperator() { + mxnet::op::mxnet_op::tuned_op::workload_ = GetUnaryWorkload(); + if (Super::output_tuning_data_) { + std::cout << "IMPLEMENT_UNARY_WORKLOAD_FWD(" + << Super::template type_name() + << "); // NOLINT()" << std::endl << std::flush; // For long lines + } + } + + /*! + * \brief Tune the specified kernel operator. Optionally print out C++ macro that defines the + * tuning data variable and the default tuned value + * This function tunes a backward operator which takes one argument + * \tparam OP The kernel operator to be tuned + */ + template + static void TuneUnaryBackwardOperator() { + mxnet::op::mxnet_op::tuned_op, DType>::workload_ = + GetBinaryWorkload>(); + if (Super::output_tuning_data_) { + std::cout << "IMPLEMENT_UNARY_WORKLOAD_BWD(" + << Super::template type_name() + << "); // NOLINT()" << std::endl << std::flush; // For long lines + } + } + + /*! + * \brief Tune the specified "mxnet_op-type" kernel operator. + * Optionally print out C++ macro that defines the + * tuning data variable and the default tuned value + * This function tunes an operator which takes no arguments + * \tparam OP The kernel operator to be tuned + */ + template + static void TuneBlankOperatorEx() { + mxnet::op::mxnet_op::tuned_op::workload_ = GetBlankWorkloadEx(); + if (Super::output_tuning_data_) { + std::cout << "IMPLEMENT_BLANK_WORKLOAD_FWD(" + << Super::template type_name() + << "); // NOLINT()" << std::endl << std::flush; // For long lines + } + } + + /*! + * \brief Determine whether to use OMP based upon both timing and configuration using the + * given (templated) operator's workload + * \tparam OP Operator whose workload to use (tuned_op::workload_) + * \param N Number of iterations desired + * \param thread_count Number of OMP threads available to perform the iterations + * \returns Whether it's faster to use OMP for these iterations + */ + template + inline static bool UseOMP(size_t N, size_t thread_count) { + return OperatorTune::UseOMP(N, + thread_count, + static_cast(N) * OP::workload_); + } +}; + +/*! + * \brief Class that tunes binary and unary operators + * \tparam DType Data type to be used when tuning the kernel operations + */ +template +class BinaryOpTune : public UnaryOpTune { + protected: + typedef UnaryOpTune Super; + + public: + /*! + * \brief Tune a generic binary operator + * @tparam OP - Operator type + */ + template + static void TuneBinaryOperator() { + mxnet_op::tuned_op::workload_ = Super::template GetBinaryWorkload(); + if (Super::Super::output_tuning_data_) { + std::cout << "IMPLEMENT_BINARY_WORKLOAD_FWD(" + << Super::template type_name() + << "); // NOLINT()" << std::endl << std::flush; // For long lines + } + } + + /*! + * \brief Tune binary backward operator + * \tparam OP - operator + */ + template + static void TuneBinaryBackwardOperator() { + mxnet::op::mxnet_op::tuned_op, DType>::workload_ = + Super::template GetTertiaryWorkload>(); + if (Super::Super::output_tuning_data_) { + std::cout << "IMPLEMENT_BINARY_WORKLOAD_BWD(" + << Super::template type_name() + << "); // NOLINT()" << std::endl << std::flush; // For long lines + } + } +}; + +#undef OUTSIDE_COUNT_SHIFT +#undef WORKLOAD_COUNT_SHIFT + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_OPERATOR_TUNE_INL_H_ diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc new file mode 100644 index 000000000000..9b02b94e790f --- /dev/null +++ b/src/operator/operator_tune.cc @@ -0,0 +1,349 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include "./mxnet_op.h" +#include "./mshadow_op.h" +#include "./tensor/init_op.h" +#include "./operator_tune-inl.h" +#include "./tensor/elemwise_binary_broadcast_op.h" + +namespace mxnet { +namespace op { + +/*! + * \brief Shared static variables for all OperatorTune data types + */ +std::atomic OperatorTuneBase::calculated_(false); +bool OperatorTuneBase::verbose_tuning_info_ = false; +double OperatorTuneBase::tuning_weight_scale_ = 0.0; + +/*! + * \brief Instantiate static variables for OperatorTune, where 'DType' is specified + */ +#define IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(__typ$) \ + template<> bool OperatorTune<__typ$>::initialized_ = false; \ + template<> std::vector<__typ$> OperatorTune<__typ$>::data_set_ = {}; \ + template<> volatile tune::TuningMode OperatorTuneByType<__typ$>::tuning_mode_ = tune::kAuto; \ + template<> volatile int OperatorTune<__typ$>::volatile_int_ = 9; /* arbitrary number */ \ + template<> std::unordered_set OperatorTune<__typ$>::operator_names_ = {}; \ + template<> bool OperatorTune<__typ$>::output_tuning_data_ = false; \ + template<> std::list *OperatorTune<__typ$>::GetTuningList() { \ + static std::list ll; \ + return ≪ \ + } + +/*! + * \brief Static variables for different types (ie OperatorTune, OperatorTune, etc. + */ +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(float); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(double); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(mshadow::half::half_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(int8_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(uint8_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(int32_t); +IMPLEMENT_OPERATOR_TUNE_STATICS_FOR_TYPE(int64_t); + +/*! + * \brief Init variable used to facilitate registering a tunable operator during + * static initialization + * \tparam OP Operator type + * \tparam DType Data type + */ +template +struct static_init_var { + static bool init_; +}; + +/*! + * \brief Repeat the given macro and associated arguments for each data type, + * appending the data type to the end of the arguments + */ +#define MSHADOW_MACRO_FOREACH_TYPE(__macro$, ...) \ + __macro$(__VA_ARGS__, float); \ + __macro$(__VA_ARGS__, double); \ + __macro$(__VA_ARGS__, mshadow::half::half_t); \ + __macro$(__VA_ARGS__, uint8_t); \ + __macro$(__VA_ARGS__, int8_t); \ + __macro$(__VA_ARGS__, int32_t); \ + __macro$(__VA_ARGS__, int64_t); + + +#define IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(__op$, __typ$) \ + namespace mxnet_op { \ + template<> size_t mxnet::op::mxnet_op::tuned_op<__op$, __typ$>::workload_ = INT_MAX / 4; \ + template<> std::vector mxnet::op::mxnet_op::tuned_op<__op$, __typ$>::workload_ex_ = {}; \ + } /* namespace mxnet_op */ + +/*! + * \brief Implement tuning objects for a forward blank (no arguments) kernel operator + */ +#define _IMPLEMENT_BLANK_WORKLOAD_FWD(__op$, __typ$) \ + IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(__op$, __typ$); \ + namespace mxnet_op { \ + template<> bool mxnet::op::mxnet_op::tuned_op<__op$, __typ$>::UseOMP( \ + size_t N, size_t omp_threads) { \ + return mxnet::op::UnaryOpTune<__typ$>::UseOMP>( \ + N, omp_threads); \ + }} /* namespace mxnet_op */ \ + template<> bool static_init_var<__op$, __typ$>::init_ = \ + mxnet::op::OperatorTune<__typ$>::ScheduleTune<__op$>( \ + mxnet::op::UnaryOpTune<__typ$>::TuneBlankOperatorEx<__op$>) + +/*! + * \brief Implement tuning objects for a forward unary kernel operator + */ +#define _IMPLEMENT_UNARY_WORKLOAD_FWD(__op$, __typ$) \ + IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(__op$, __typ$); \ + namespace mxnet_op { \ + template<> bool mxnet::op::mxnet_op::tuned_op<__op$, __typ$>::UseOMP( \ + size_t N, size_t omp_threads) { \ + return mxnet::op::UnaryOpTune<__typ$>::UseOMP>( \ + N, omp_threads); \ + }} /* namespace mxnet_op */ \ + template<> bool static_init_var<__op$, __typ$>::init_ = \ + mxnet::op::OperatorTune<__typ$>::ScheduleTune<__op$>( \ + mxnet::op::UnaryOpTune<__typ$>::TuneUnaryOperator<__op$>) + +/*! + * \brief Implement tuning objects for a backward unary kernel operator + */ +#define _IMPLEMENT_UNARY_WORKLOAD_BWD(__op$, __typ$) \ + IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(mxnet::op::mxnet_op::backward_grad<__op$>, __typ$); \ + namespace mxnet_op { \ + template<> \ + bool mxnet::op::mxnet_op::tuned_op, __typ$>::UseOMP( \ + size_t N, size_t omp_threads) { \ + return mxnet::op::UnaryOpTune<__typ$>::UseOMP, __typ$>>(N, omp_threads); \ + }} /* namespace mxnet_op */ \ + template<> bool static_init_var, __typ$>::init_ = \ + mxnet::op::OperatorTune<__typ$>::ScheduleTune<__op$>( \ + mxnet::op::UnaryOpTune<__typ$>::TuneUnaryBackwardOperator<__op$>) + +/*! + * \brief Implement tuning objects for a forward binary kernel operator + */ +#define _IMPLEMENT_BINARY_WORKLOAD_FWD(__op$, __typ$) \ + IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(__op$, __typ$); \ + namespace mxnet_op { \ + template<> bool mxnet::op::mxnet_op::tuned_op<__op$, __typ$>::UseOMP( \ + size_t N, size_t omp_threads) { \ + return mxnet::op::BinaryOpTune<__typ$>::UseOMP>( \ + N, omp_threads); \ + }} /* namespace mxnet_op */ \ + template<> bool static_init_var<__op$, __typ$>::init_ = \ + mxnet::op::OperatorTune<__typ$>::ScheduleTune<__op$>( \ + mxnet::op::BinaryOpTune<__typ$>::TuneBinaryOperator<__op$>) + +/*! + * \brief Implement tuning objects for a backward binary kernel operator + */ +#define _IMPLEMENT_BINARY_WORKLOAD_BWD(__op$, __typ$) \ + IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(mxnet::op::mxnet_op::backward_grad<__op$>, __typ$); \ + namespace mxnet_op { \ + template<> \ + bool mxnet::op::mxnet_op::tuned_op, __typ$>::UseOMP( \ + size_t N, size_t omp_threads) { \ + return mxnet::op::BinaryOpTune<__typ$>::UseOMP, __typ$>>(N, omp_threads); \ + }} /* namespace mxnet_op */ \ + template<> bool static_init_var, __typ$>::init_ = \ + mxnet::op::OperatorTune<__typ$>::ScheduleTune<__op$>( \ + mxnet::op::BinaryOpTune<__typ$>::TuneBinaryBackwardOperator<__op$>) + +/*! + * \brief Implement tuning objects for a custom forward kernel operator + */ +#define _IMPLEMENT_CUSTOM_WORKLOAD_FWD(__op$, __typ$) \ + IMPLEMENT_WORKLOAD_VALUE_FOR_TYPE(__op$<__typ$>, __typ$); \ + template<> bool static_init_var<__op$<__typ$>, __typ$>::init_ = \ + mxnet::op::OperatorTune<__typ$>::ScheduleTune<__op$<__typ$>>(\ + __op$<__typ$>::Tune) + +/*! + * \brief Macros for manually adding new blank, unary and binary operators to the tuning set + */ +#define IMPLEMENT_UNARY_WORKLOAD_FWD(__op$) \ + MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_UNARY_WORKLOAD_FWD, __op$) + +#define IMPLEMENT_BLANK_WORKLOAD_FWD(__op$) \ + MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_BLANK_WORKLOAD_FWD, __op$) + +#define IMPLEMENT_UNARY_WORKLOAD_BWD(__op$) \ + MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_UNARY_WORKLOAD_BWD, __op$) + +#define IMPLEMENT_BINARY_WORKLOAD_FWD(__op$) \ + MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_BINARY_WORKLOAD_FWD, __op$) + +#define IMPLEMENT_BINARY_WORKLOAD_BWD(__op$) \ + MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_BINARY_WORKLOAD_BWD, __op$) + +#define IMPLEMENT_CUSTOM_WORKLOAD_FWD(__op$) \ + MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_CUSTOM_WORKLOAD_FWD, __op$) + +IMPLEMENT_CUSTOM_WORKLOAD_FWD(mxnet::op::mxnet_op::tunable_binary_broadcast_kernel); // NOLINT() + +/*! + * \brief Tuning data and default weights in the case that MXNET_ENABLE_OPERATOR_AUTOTUNE is set + * to zero (thus turning off auto-tuning) + * \note This code can be automatically generated + * by setting the environment variable MXNET_OUTPUT_TUNING_DATA to a positive + * integer value + */ +OperatorTuneBase::duration_t OperatorTuneBase::omp_overhead_ns_ = 5000; +IMPLEMENT_UNARY_WORKLOAD_FWD(mshadow::op::identity); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::identity); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::identity_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::negation); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::reciprocal); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::reciprocal_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sigmoid); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sigmoid_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::relu); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::relu_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::tanh); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::tanh_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::softrelu); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::softrelu_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::exp); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::exp); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::expm1); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::log); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::log_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::log1p); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::log1p_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::log2); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::log2_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::log10); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::log10_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sin); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sin_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sinh); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sinh_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arcsin); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arcsin_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arcsinh); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arcsinh_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::cos); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::cos_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::cosh); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::cosh_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arccos); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arccos_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arccosh); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arccosh_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::tan); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::tan_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arctan); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arctan_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::arctanh); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::arctanh_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::square); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::square_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::square_root); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::square_root_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::reciprocal_square_root); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::reciprocal_square_root_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::cube_root); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::cube_root_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::reciprocal_cube_root); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::reciprocal_cube_root_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::abs); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sign); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sign); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sign_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::round); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::floor); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::trunc); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rint); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::fix); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::gamma); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::gamma_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::gammaln); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::gammaln_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::ceil); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::degrees); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::degrees_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::radians); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::radians_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mshadow::op::plus); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mshadow::op::minus); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mshadow::op::mul); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mshadow::op::div); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mshadow::op::right); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rminus); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rdiv); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::div_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::div_rgrad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::div_rgrad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rdiv_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::mod); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::mod_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::mod_rgrad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rmod); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rmod_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::left); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::left); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::right); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::right); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::power); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rpower); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rpower_grad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_rgrad); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::maximum); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minimum); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::hypot); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::hypot_grad_left); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::hypot_grad_left); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::hypot_grad_right); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::hypot_grad_right); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::lt); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::lt); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::le); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::le); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::gt); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::gt); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::ge); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::ge); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::ne); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::ne); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::eq); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::eq); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::smooth_l1_loss); // NOLINT() +IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::smooth_l1_gradient); // NOLINT() +IMPLEMENT_BLANK_WORKLOAD_FWD(mxnet::op::mxnet_op::set_to_int<0>); // NOLINT() +IMPLEMENT_BLANK_WORKLOAD_FWD(mxnet::op::mxnet_op::set_to_int<1>); // NOLINT() +IMPLEMENT_BLANK_WORKLOAD_FWD(mxnet::op::PopulateFullIdxRspKernel); // NOLINT() +/*! + * \brief Tuner objects, *not* automatically generated + */ +#ifdef MXNET_USE_OPERATOR_TUNING +static BinaryOpTune binaryOpTuneFloat; +static BinaryOpTune binaryOpTuneDouble; +static BinaryOpTune binaryOpTuneHalf; +static BinaryOpTune binaryOpTuneInt8; +static BinaryOpTune binaryOpTuneUInt8; +static BinaryOpTune binaryOpTuneInt32; +static BinaryOpTune binaryOpTuneInt64; +#endif // MXNET_USE_OPERATOR_TUNING +} // namespace op +} // namespace mxnet diff --git a/src/operator/operator_tune.h b/src/operator/operator_tune.h new file mode 100644 index 000000000000..4f92c9d3cbb4 --- /dev/null +++ b/src/operator/operator_tune.h @@ -0,0 +1,331 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef MXNET_OPERATOR_OPERATOR_TUNE_H_ +#define MXNET_OPERATOR_OPERATOR_TUNE_H_ + +#include +#include +#include +#include +#include + +namespace mxnet { +namespace op { + +#define WORKLOAD_COUNT_SHIFT 11 + +/*! + * \brief Shared data for all data types being tuned, acts as a base class for the higher-level + * templated tunin classes + */ +class OperatorTuneBase { + public: + typedef int64_t duration_t; + + protected: + /*! \brief Have calculated omp_overhead_ yet? */ + static std::atomic calculated_; + /*! \brief Time in nanoseconds for OMP overhead */ + static duration_t omp_overhead_ns_; + /*! \brief Print debug/trace output for tuning info */ + static bool verbose_tuning_info_; + /*! \brief Tuning scale factor */ + static double tuning_weight_scale_; + + public: + typedef std::chrono::high_resolution_clock::time_point Tick; + + /*! + * \brief Get timestamp for "now" + * \return Tick object representing the current itmestamp + */ + static MSHADOW_CINLINE Tick Now() { + return std::move(std::chrono::high_resolution_clock::now()); + } + + /*! + * \brief Get duration in nanoseconds + * \param t1 Start time tick + * \param t2 End time tick + * \return duration in nanoseconds between t1 and t2 + */ + static MSHADOW_CINLINE duration_t GetDurationInNanoseconds(const Tick &t1, const Tick &t2) { + return static_cast( + std::chrono::duration_cast(t2 - t1).count()); + } + + /*! + * \brief Get duration in nanoseconds between the given 'since' value and now + * \param since Reference time which to calculate the duration + * \return Duration in nanoseconds between the given 'since' value and now + */ + static MSHADOW_CINLINE duration_t GetDurationInNanoseconds(const Tick &since) { + return GetDurationInNanoseconds(since, Now()); + } + + /*! \brief Loop size to be timed (single op nanos may be too small to store accurately) */ + static constexpr duration_t WORKLOAD_COUNT = (1 << WORKLOAD_COUNT_SHIFT); + + /*! + * \brief Timer convenience class, sets start time as "now" in the constructor + */ + struct Timer { + /*! + * \brief Constructor, sets start time + */ + MSHADOW_CINLINE Timer() + : start_(OperatorTuneBase::Now()) {} + /*! + * \brief Get duration in nanoseconds since construction + * \return Duration in nanoseconds since construction + */ + MSHADOW_CINLINE int64_t duration() const { + return OperatorTuneBase::GetDurationInNanoseconds(start_); + } + + /*! + * \brief Reference start time, set in constructor + */ + const OperatorTuneBase::Tick start_; + }; + + /*! + * \brief Estimate the time to compute with and without OMP, then return whether OMP is faster + * \param N - Number of iterations desired + * \param thread_count - Number of OMP threads available to perform the iterations + * \returns Whether it's faster to use OMP for these iterations + */ + inline static bool IsOMPFaster(size_t N, size_t thread_count, const uint64_t serial_workload) { + if (thread_count >= 2) { + // Compute serial time required + const uint64_t total_serial_time_ns = serial_workload >> WORKLOAD_COUNT_SHIFT; + + // Compute time required for OMP + # items per thread + const uint64_t omp_compute_time_ns = (serial_workload / thread_count) >> WORKLOAD_COUNT_SHIFT; + const uint64_t total_omp_time_ns = omp_overhead_ns_ + omp_compute_time_ns; + + const bool rc = total_omp_time_ns < total_serial_time_ns; + return rc; + } + return false; + } +}; + +namespace tune { +/*! + * \brief Tuning mode for registered kernel operators + */ +enum TuningMode { + kAuto, // Based upon tuning data, choose whether to use OMP for kernel CPU Launch() loops + kNeverOMP, // Don't use OMP for parallelism (legacy behavior for GPU builds) + kAlwaysOMP // Don't use OMP for parallelism (legacy behavior for CPU builds) +}; +} // namespace tune + +template +class OperatorTuneByType : public OperatorTuneBase { + public: + /*! + * \brief Set tuning mode + * \param tuning_mode The tune::TuningMode tuning mode value to set + */ + static MSHADOW_CINLINE void set_tuning_mode(const tune::TuningMode tuning_mode) { + // Use const_cast to get past "assigning non-volatile to volatile warning + const_cast(tuning_mode_) = tuning_mode; + } + + /*! + * \brief Get the current tuning mode + * \return tune::TuningMode value for the current tuning mode + */ + static MSHADOW_CINLINE volatile tune::TuningMode tuning_mode() { + return tuning_mode_; + } + + /*! + * \brief Determine whether to use OMP based upon both timing and configuration + * \param N - Number of iterations desired + * \param thread_count - Number of OMP threads available to perform the iterations + * \returns Whether it's faster to use OMP for these iterations + */ + inline static bool UseOMP(size_t N, size_t thread_count, const uint64_t serial_workload) { +#ifdef MXNET_USE_OPERATOR_TUNING + switch (tuning_mode()) { + case tune::kAuto: + return OperatorTuneBase::IsOMPFaster(N, thread_count, serial_workload); + case tune::kNeverOMP: + return false; + case tune::kAlwaysOMP: + default: + return thread_count > 1; + } +#else + return true; +#endif + } + + protected: + /*! \brief Tuning mode */ + static volatile tune::TuningMode tuning_mode_; +}; + +namespace mxnet_op { +/*! + * \brief Kernel operator wrapper used for tuning data + */ +template +struct tuned_op : public Operation { + /*! \brief nanoseconds to perform WORKLOAD_COUNT operations + * \note It is conceivable that a vector of values could be used for more complex tuning, + * but the need hasn't yet arisen + * \remarks This variable generally needs to be implemented somewhere. Currently this is mostly + * done via macros in operator_tune.cc. If you get undefined reference errors when + * linking, then try to use one of the macros in that file to instantiate the required + * data/functions + */ + static size_t workload_; + + /*! + * \brief Extra workload-calculating information (ie times for sub-portions of the calculation) + */ + static std::vector workload_ex_; + + /*! + * \brief Calls parent class (Operation)'s UseOMP + * \tparam Args Variable arguments passed + * \param N Number of iterations + * \param thread_count Number of threads available + * \param args Variable arguments passed + * \return true if OMP parallelism is recommended + */ + template + static MSHADOW_CINLINE bool UseOMP(size_t N, size_t thread_count, Args... args) { + return Operation::UseOMP(N, thread_count, args...); + } + + /*! + * \brief Call a standard UseOMP() implementation (if it exists). Currently, these + * are implemented in operator_tune.cc for standard unary, binary, + * and argumentless kernels (i.e. mshadow_op::sqrt) + * \tparam Args Variable arguments passed + * \param N Number of iterations + * \param thread_count Number of threads available + * \param args Variable arguments passed + * \return true if OMP parallelism is recommended + */ + static bool UseOMP(size_t N, size_t thread_count); +}; +} // namespace mxnet_op + +/*! + * \brief Calculate workload for a given lambda function + * \tparam Function Lambda type to time for WORKLOAD_COUNT calls + * \param function Lambda to time for WORKLOAD_COUNT calls + * \return median workload for function call (nanoseconds for WORKLOAD_COUNT calls) + */ +template +inline int64_t get_workload(Function function) { + std::multiset durations; + typename OperatorTuneBase::Timer timer; + for (int pass = 0; pass < 3; ++pass) { + for (int i = 0; i < OperatorTuneBase::WORKLOAD_COUNT; ++i) { + function(); + } + } + const OperatorTuneBase::duration_t dd = timer.duration(); + durations.insert(dd); + return *++durations.begin(); // return median value +} + +/*! + * \brief Declare a template specialization for the Kernel::Launch call for the given OP + * wrapped with mxnet_op::op_with_req, using the given OpReqType as the 'req' + * template parameter for 'op_with_req'. This is useful for the standard mshadow_op + * operators which need to be wrapped with op_with_req in order to be used with the + * Kernel::Launch command. + * + * \note Expects to be used within the mxnet::op namespace + * + * For example: + * + * namespace mxnet_op { + * template <> + * template + * inline void Kernel, cpu> + * ::Launch(mshadow::Stream* s, const int N, Args... args) { + * ::mxnet::op::mxnet_op::Kernel, + * cpu>::LaunchMShadowOpEx(s, N, args...); + * } + * } + * + */ +#define MXNET_TUNABLE_MSHADOW_OP_WITH_REQ(__op$, __req$) \ + namespace mxnet_op { \ + template<> template \ + inline void Kernel, ::mshadow::cpu>:: \ + Launch(mshadow::Stream<::mshadow::cpu> *s, const int N, Args... args) { \ + /* Launch via LaunchMShadowOpEx() */ \ + KernelWrapper, ::mshadow::cpu>:: \ + LaunchMShadowOpEx(s, N, args...); \ + } \ + } /* namespace mxnet_op */ + +/*! + * \brief Declare template specializations for the Kernel::Launch call for the given OP + * wrapped with mxnet_op::op_with_req, using the all supported OpReqType as the 'req' + * template parameter for 'op_with_req'. This is useful for the standard mshadow_op + * operators which need to be wrapped with op_with_req in order to be used with the + * Kernel::Launch command. + * \note Expects to be used within the mxnet::op namespace + */ +#define MXNET_TUNABLE_MSHADOW_OP(__op$) \ + MXNET_TUNABLE_MSHADOW_OP_WITH_REQ(__op$, kNullOp); \ + MXNET_TUNABLE_MSHADOW_OP_WITH_REQ(__op$, kWriteTo); \ + MXNET_TUNABLE_MSHADOW_OP_WITH_REQ(__op$, kWriteInplace); \ + MXNET_TUNABLE_MSHADOW_OP_WITH_REQ(__op$, kAddTo); + +#define MXNET_TUNABLE_MSHADOW_OP_BACKWARD(__op$) \ + MXNET_TUNABLE_MSHADOW_OP(mxnet::op::mxnet_op::backward_grad<__op$>) + +#define MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(__op$) \ + MXNET_TUNABLE_MSHADOW_OP(__op$) \ + MXNET_TUNABLE_MSHADOW_OP_BACKWARD(__op$) + +/*! + * \brief mxnet::op::mxnet_op format ops (work directly with Kernel<>::Launch() + * Used from within mxnet::op::mxnet_op namespace + */ +#define _MXNET_TUNABLE_MXNET_OP_FWD(__op$) \ + template<> template inline void Kernel<__op$, ::mshadow::cpu>::Launch( \ + mshadow::Stream<::mshadow::cpu> *s, const int N, Args... args) { \ + /* Launch via LaunchMXNetOpEx() */ \ + KernelWrapper<__op$, ::mshadow::cpu>::LaunchMXNetOpEx(s, N, args...); \ + } + +/*! + * \brief mxnet::op::mxnet_op format ops (work directly with Kernel<>::Launch() + * Used from within mxnet::op + */ +#define MXNET_TUNABLE_MXNET_OP_FWD(__op$) \ + namespace mxnet_op { _MXNET_TUNABLE_MXNET_OP_FWD(__op$) } /* namespace mxnet_op */ + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_OPERATOR_TUNE_H_ diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 211b567ba1bc..75e3a4d610ad 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -25,16 +25,19 @@ #ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_H_ #define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_H_ +#include #include #include #include #include #include +#include #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "./elemwise_binary_op.h" #include "../operator_common.h" #include "broadcast_reduce-inl.h" +#include "../mxnet_op.h" namespace mxnet { namespace op { @@ -135,24 +138,122 @@ inline int BinaryBroadcastShapeCompact(const TShape& lshape, const TShape& rshap } namespace mxnet_op { -template -struct binary_broadcast_kernel { + +/*! + * \brief Type-level specialization base class for binary_broadcast_kernel + * \tparam DType + */ +template struct tunable_binary_broadcast_kernel; + +template +struct binary_broadcast_kernel : public tunable_binary_broadcast_kernel { + /*! \brief Map function for binary_broadcast_kernel */ MSHADOW_XINLINE static void Map(int base, int length, OpReqType req, const Shape& lstride, const Shape& rstride, const Shape& oshape, DType* lhs, DType* rhs, - DType* out, int lsize, int rsize) { - Shape coord = unravel(base, oshape); - index_t lidx = dot(coord, lstride); - index_t ridx = dot(coord, rstride); - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - // starts from 1 to avoid extra inc at end of loop - for (int i = 1; i < length; ++i) { - inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - KERNEL_ASSIGN(out[base+i], req, OP::Map(lhs[lidx], rhs[ridx])); + DType* out) { + if (req != kNullOp) { + Shape coord = unravel(base, oshape); + auto lidx = static_cast(dot(coord, lstride)); + auto ridx = static_cast(dot(coord, rstride)); + if (!is_tuning) { + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); + } + // starts from 1 to avoid extra inc at end of loop + for (int i = 1; i < length; ++i) { + inc(&coord, oshape, &lidx, lstride, &ridx, rstride); + // When tuning, don't actually run the op, since it's not going to be tuned against + // the actual op we'll eventually be using + if (!is_tuning) { + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); + } + } } } + + /*! + * \brief Decide whether to use OpenMP parallelization + * \tparam Args Variable number and types of arguments (passed same args as Map()) + * \param N Number of iterations + * \param thread_count Number of OMP threads available + * \param req Req type (i.e. kNullOp, kWriteTo, kAddTo, etc) + * \param args remaining (unused) arguments + * \return true if OMP parallelization should be used for the N iterations + */ + template + static bool UseOMP(const size_t N, const size_t thread_count, OpReqType req, Args... args) { + if (req != kNullOp) { + switch (OperatorTuneByType::tuning_mode()) { + case tune::kAuto: { + CHECK_GT(thread_count, 0) << "Invalid thread count: " << thread_count; + const uint64_t length = (N + thread_count - 1) / thread_count; + + float wl = tuned_op, DType>::workload_ex_[0] + + tuned_op, DType>::workload_ex_[1] + * length; + // OP::Map() is called 'length' times for each map call + // Get actual price per OP by removing overhead, such as subtracting workload of + // a trivial operation such as set_zero + int64_t subop_actual_workload = + tuned_op::workload_ - tuned_op::workload_; + if (subop_actual_workload < 0) { + subop_actual_workload = 1; + } + wl += 0.75f * subop_actual_workload * length; + return OperatorTuneByType::IsOMPFaster(N, thread_count, static_cast(wl)); + } + case tune::kAlwaysOMP: + return true; + case tune::kNeverOMP: + default: + return false; + } + } + return false; + } }; +/*! + * \brief Type-specific tuning + * \tparam DType + */ +template +struct tunable_binary_broadcast_kernel { + /*! \brief Allows LaunchEx to know the data type for tuning_op<> selection */ + typedef DType DataType; + /*! + * \brief Run-time tuning of sub-op-independent binary_broadcast_kernel + */ + static void Tune() { + constexpr int dim = 2; // Have to pick one to represent all + Shape oshape, lstride, rstride; + for (index_t i = 0; i < dim; ++i) { + oshape[i] = 28U * (i + 1); + lstride[i] = 2U * (i + 1); + rstride[i] = 3U * (i + 1); + } + const int base = 28; + const size_t data_size = lstride.Size() * rstride.Size() * oshape.Size(); + std::unique_ptr data(new DType[data_size]); + memset(data.get(), 0, data_size); // get into cache + tuned_op, DType>::workload_ex_.push_back( + get_workload([&]() { binary_broadcast_kernel<2, DType, mshadow_op::left, true>::Map( + base, 1, kWriteTo, lstride, rstride, oshape, data.get(), data.get(), nullptr); + })); + tuned_op, DType>::workload_ex_.push_back( + get_workload([&]() { binary_broadcast_kernel<2, DType, mshadow_op::left, true>::Map( + base, 1000, kWriteTo, lstride, rstride, oshape, data.get(), data.get(), nullptr); + })); + // Record base time for function + tuned_op, DType>::workload_ex_[1] -= + tuned_op, DType>::workload_ex_[0]; + // Record per-length item adder + tuned_op, DType>::workload_ex_[1] /= 1000; + if (tuned_op, DType>::workload_ex_[1] <= 0) { + tuned_op, DType>::workload_ex_[1] = 1; + } + } +}; } // namespace mxnet_op template @@ -161,7 +262,6 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - using namespace mxnet_op; TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); @@ -171,13 +271,12 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - Shape oshape = new_oshape.get(); - Shape lstride = calc_stride(new_lshape.get()); - Shape rstride = calc_stride(new_rshape.get()); - Kernel, xpu>::LaunchEx( - s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr(), - inputs[0].Size(), inputs[1].Size()); + mshadow::Shape oshape = new_oshape.get(); + mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); + mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); + mxnet_op::Kernel, xpu>:: + template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, + inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); }); }); } diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 1d30c88011c3..95e8184f8a05 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -272,6 +272,7 @@ struct PopulateFullIdxRspKernel { KERNEL_ASSIGN(out[i], kWriteTo, i); } }; +MXNET_TUNABLE_MXNET_OP_FWD(PopulateFullIdxRspKernel); // Fill in the indices and values of a RowSparse NDArray to represent a zeros NDArray, // instead of the usual compact representation. diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 1bcd0e2df42f..51cbcd71a3b9 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -36,15 +36,15 @@ namespace op { #define COREOP_BWD_OP_NAME_VALUE_NONE "[none]" enum TimingDirection { - Forward, - Backward + kForward, + kBackward }; inline const char *TimingDirectionAsString(const TimingDirection td) { switch (td) { - case Forward: + case kForward: return "Forward"; - case Backward: + case kBackward: return "Backward"; default: CHECK(false) << "Unknown timing direction: " << static_cast(td); @@ -426,7 +426,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inline bool initBackward(const OpProp &opProp, std::vector *in_type) { return true; } inline void forward(const size_t count) { - perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, "Forward", count); + perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), kForward, "Forward", count); VTuneResume profile; for (size_t i = 0; i < count; ++i) { Execute(); @@ -435,7 +435,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inline void backward(const size_t count) { CHECK(HasBackward()); - perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count); + perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), kBackward, "Backward", count); VTuneResume profile; for (size_t i = 0; i < count; ++i) { ExecuteBackward(); diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h index 3b06b1a29f06..0992c41f760e 100644 --- a/tests/cpp/include/test_op_runner.h +++ b/tests/cpp/include/test_op_runner.h @@ -44,6 +44,14 @@ class OperatorRunner { public: typedef typename OperatorExecutor::DataType DType; + OperatorRunner() { +#ifdef NDEBUG + total_iterations_ = 50; +#else + total_iterations_ = 5; +#endif + } + /*! * \brief Test operator forward pass * \param isGPU Whether this test is for GPU @@ -130,33 +138,34 @@ class OperatorRunner { int dim = 0, size_t count = 1, const std::vector& timing_shapes = {}) { -#ifdef NDEBUG - size_t COUNT = 50; -#else - size_t COUNT = 5; -#endif if (mxnet::test::quick_test) { - COUNT = 2; + total_iterations_ = 2; count = 1; } test::perf::TimingInstrument timing; std::stringstream ss; - ss << "Timing: " << COUNT << " iterations of " << count << " calls"; + ss << "Timing: " << total_iterations_ << " iterations of " << count << " calls"; if (timing_shapes[0].ndim()) { + size_t lhs_total = 0; ss << ", shape = "; for (size_t i = 0, n = timing_shapes.size(); i < n; ++i) { if (i) { ss << ", "; } ss << timing_shapes[i]; + if (!i) { + lhs_total = timing_shapes[i].Size(); + } } - ss << std::endl << std::flush; + ss << " = " << test::pretty_num(lhs_total) << " items " << std::endl << std::flush; + } + if (!mxnet::test::csv) { + std::cout << ss.str(); } - std::cout << ss.str(); - for (size_t i = 0; i < COUNT; ++i) { + for (size_t i = 0; i < total_iterations_; ++i) { index_t batchSize = 1; index_t channels = 1; index_t depth = 1; @@ -223,16 +232,17 @@ class OperatorRunner { } } - if (verbose_) { + if (verbose_ && !mxnet::test::csv) { timing.print(&std::cout, label); std::cout << std::endl << std::flush; } - return timing.data(); } void set_verbose(bool verbose) { verbose_ = verbose; } + void set_total_iterations(size_t iterations) { total_iterations_ = iterations; } + protected: static constexpr int TEST_BATCH_SIZE = 5; static constexpr int TEST_CHANNELS = 3; @@ -247,6 +257,8 @@ class OperatorRunner { static constexpr int TIMING_DW = 64; /*! \brief verbose output */ bool verbose_ = true; + /*! \brief Tital iterations */ + size_t total_iterations_ = 10; }; } // namespace test diff --git a/tests/cpp/include/test_tune.h b/tests/cpp/include/test_tune.h new file mode 100644 index 000000000000..725aa90a10a7 --- /dev/null +++ b/tests/cpp/include/test_tune.h @@ -0,0 +1,333 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file test_tune.h + * \brief operator tuning tester + * \author Chris Olivier +*/ + +#ifndef TEST_TUNE_H_ +#define TEST_TUNE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../src/operator/operator_tune-inl.h" +#include "./test_util.h" +#include "./test_op.h" +#include "./test_core_op.h" + +namespace mxnet { +namespace test { +namespace tune { + +/*! + * \brief Tuning tests, which whether the correct tuning mode is selected by Auto + * \note This class makes no attempt at being performant (i.e. it does all sorts of slow + * deep copies and that sort of thing), so don't insert any of thios code in the main + * trunk unless you've verified the performance characteristics for that chunk of code + * \tparam DType Data type to test + */ +template +class TuningTester { + public: + using kwargs_t = test::op::kwargs_t; + + using bool_mode_pair = std::pair; + + using shape_vect = std::vector; + using shape_vec_to_bool_map = std::map; + + private: + using ShapesToPerfTimingMap = + std::map; + + /*! + * \brief Run timing test on various data shapes and sizes + * \param isGPU true if the GPU should be used for the timing test + * \param op_kwargs operator parameters + * \param op_name The operator's registered name (with nnvm) + * \param backward_op_name The backward operator's registered name (with nnvm) + * \return ShapesToPerfTimingMap map holsing timing data for shapes + */ + ShapesToPerfTimingMap RunCoreOpTimingTest(const bool isGPU, + const kwargs_t &op_kwargs, + const std::vector& shapes, + const char *op_name, + const char *backward_op_name = "") { + ShapesToPerfTimingMap res; + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( + op_kwargs, op_name, backward_op_name); + + // prime code and cache before the performance runs + test::op::CoreOperatorRunner runner; + runner.set_total_iterations(total_iterations_); + runner.set_verbose(false); + runner.RunBidirectional(false, {{10, 3, 18, 128}}, kwargs, 1); + + // Do the performance runs + const char *pu = isGPU ? "GPU" : "CPU"; + for (const std::vector &this_run_shapes : shapes) { + test::perf::timing_map_t tmap = runner.TimingTest(std::string(op_name) + " Operator " + pu, + isGPU, false, kwargs, + 0, calls_per_iteration_, + this_run_shapes); + CHECK(res.find(this_run_shapes) == res.end()); + res[this_run_shapes] = tmap; + } + return std::move(res); + } + + using tuned_timing_t = std::map< + shape_vect, + std::map<::mxnet::op::tune::TuningMode, test::perf::timing_map_t>, test::less_shapevect>; + + using modesort_t = std::multimap; + + /*! + * \brief Check if the tuning succeeded + * \param mode_sort modesort_t structure produced by 'CalculateModeSort' + * \param closeness_factor fraction of largest standard time (omp, no omp) which is an acceptable + * range + * \return a pair consisting of true or false signifying if the test appears to + * have made the correct decision, and the TuningMode which was closest in timing to + * the Auto mode. + */ + static bool_mode_pair CheckCorrectTuning(const modesort_t &mode_sort, + const double closeness_factor = 0.25) { + CHECK_EQ(mode_sort.size(), 3U); + + // Determine fastest normal mode + ::mxnet::op::tune::TuningMode fastest_standard_mode = ::mxnet::op::tune::kAuto; + for (auto i = mode_sort.begin(), e = mode_sort.end(); i != e; ++i) { + if (i->second != ::mxnet::op::tune::kAuto) { + fastest_standard_mode = i->second; + break; + } + } + CHECK_NE(fastest_standard_mode, ::mxnet::op::tune::kAuto); + + // We should be closest to the faster of kNeverOMP and kAlwaysOMP + // Take into account some variance, especially if kNeverOMP and kAlwaysOMP are close together + std::map<::mxnet::op::tune::TuningMode, double> mode2time; + for (auto i = mode_sort.begin(), e = mode_sort.end(); i != e; ++i) { + mode2time[i->second] = i->first; + } + const double time_auto = mode2time[::mxnet::op::tune::kAuto]; + const double time_no_omp = mode2time[::mxnet::op::tune::kNeverOMP]; + const double time_omp = mode2time[::mxnet::op::tune::kAlwaysOMP]; + + // Figure out which one we are closest to and return that to help in the analysis + ::mxnet::op::tune::TuningMode closest_to; + if (fabs(time_auto - time_no_omp) < fabs(time_auto - time_omp)) { + closest_to = ::mxnet::op::tune::kNeverOMP; + } else { + closest_to = ::mxnet::op::tune::kAlwaysOMP; + } + + // If difference between OMP and no OMP is < closeness_factor of largest of the two, + // then we just want to make sure we are close to both of these + const double fastest_standard_time = std::min(time_no_omp, time_omp); + const double allowed_difference = closeness_factor * fastest_standard_time; + const double mustbe_asfast = fastest_standard_time + allowed_difference; + + return { time_auto <= mustbe_asfast || closest_to == fastest_standard_mode, + closest_to }; + } + + public: + /*! + * \brief Given timing statistics, determine if 'Auto' mode made the correct choice. + * \param direction Compute direction for which to check (Forward or Backward) + * \param verbose If true, print the statistical info + * \return A map of shape vectors to a pair consisting of true or false + * signifying if the test appears to have made the correct decision, and the TuningMode + * which was closest in timing to the Auto mode. + */ + shape_vec_to_bool_map CalculateModeSort(const test::op::TimingDirection direction, + bool verbose = true) const { + if (test::csv) { + verbose = false; + } + shape_vec_to_bool_map results; + // Incredibly inefficient method of grouping the results + for (const auto &i : timing_) { + // print shapes + const shape_vect &shapes = i.first; + if (verbose || test::csv) { + if (!test::csv) { + for (size_t x = 0, n = shapes.size(); x < n; ++x) { + const TShape &shape = shapes[x]; + if (x) { + std::cout << ", "; + } + std::cout << shape; + } + const TShape &lhs_shape = shapes[0]; + std::cout << " lhs=" << test::pretty_num(lhs_shape.Size()) << " items"; + std::cout << "\t(" << TimingDirectionAsString(direction) << ")" << std::endl; + } else { + std::cout << test::pretty_num(shapes[0].Size()) << ","; + } + } + const auto &mode2timing = i.second; + modesort_t mode_sort; + for (const auto &j : mode2timing) { + const ::mxnet::op::tune::TuningMode mode = j.first; + const test::perf::timing_map_t &tm = j.second; + if (tm.find(direction) != tm.end()) { + const test::perf::TimingInstrument::Info &info = tm.find(direction)->second; + double duration = info.TimeEach(); + mode_sort.insert({duration, mode}); + if (test::csv) { + std::cout << TimingDirectionAsString(direction) << "," + << ::mxnet::op::tune::TuningModeToString(mode) << "," + << duration << ","; + } + } + } + if (test::csv) { + std::cout << std::endl << std::flush; + } + if (!mode_sort.empty()) { + // Now we have modes sorted by performance, fastest to slowest + const bool_mode_pair result = CheckCorrectTuning(mode_sort); + if (verbose && !test::csv) { + for (const auto &k : mode_sort) { + std::cout << "\t" << ::mxnet::op::tune::TuningModeToString(k.second) + << ": " << k.first << " ms"; + if (k.second == ::mxnet::op::tune::kAuto) { + std::cout << " (" << ::mxnet::op::tune::TuningModeToString(result.second) << ")"; + } + std::cout << std::endl; + } + std::cout << std::flush; + if (!result.first) { + std::cout << "*** WARNING: Wrong OMP state selected ***" << std::endl << std::flush; + } + } + CHECK(results.find(shapes) == results.end()) << "Duplicate entry for set of shapes"; + results[shapes] = result; + } + } + return std::move(results); + } + + /*! + * \brief Perform execution runs for a given forward (and optionally backward) operator + * \param kwargs Parameters for the operator + * \param op_name Name by which the operator is registered with nnvm + * \param backward_op_name Backward operator name + */ + void TestTunedOperator(const kwargs_t &kwargs, + const bool verbose, + const std::vector& shapevec_vectors, + const char *op_name, + const char *backward_op_name = COREOP_BWD_OP_NAME_VALUE_NONE) { + timing_.clear(); + using namespace mxnet::op; + tuned_timing_t timing; + for (int x = 0; x < 1; ++x) { + for (auto mode : {::mxnet::op::tune::kNeverOMP, + ::mxnet::op::tune::kAuto, + ::mxnet::op::tune::kAlwaysOMP + }) { + if (verbose && !test::csv) { + std::cout << std::endl << ::mxnet::op::tune::TuningModeToString(mode) + << std::endl << std::flush; + } + + mxnet::op::OperatorTune::set_tuning_mode(mode); + const ShapesToPerfTimingMap shapes2perfmap = RunCoreOpTimingTest(false, + kwargs, + shapevec_vectors, + op_name, + backward_op_name); + for (const auto &item : shapes2perfmap) { + const shape_vect &shapes = item.first; + const test::perf::timing_map_t &tm = item.second; + timing_[shapes][mode] = tm; + } + } + } + } + + /*! + * \brief Calculate the success rate of the run based upon Auto being close to the faster + * OMP/non-OMP attempt + * \param modes List of directions to use in calculation (Forward, Backward). Empty list means all + * \param verbose Whether to print info + * \return Success rate ratio (#success/#TOTAL) (0.0-1.0) + */ + float CalculateSuccessRate(std::vector directions = {}, + bool verbose = true) const { + size_t count = 0, success = 0; + if (directions.empty()) { + directions = {test::op::kForward, test::op::kBackward}; + } + for (const test::op::TimingDirection direction : directions) { + typename test::tune::TuningTester::shape_vec_to_bool_map res_fwd = + CalculateModeSort(direction, verbose); + for (auto iter = res_fwd.begin(), e = res_fwd.end(); iter != e; ++iter) { + ++count; + if (iter->second.first) { + ++success; + } + } + } + if (count) { + return static_cast(success) / static_cast(count); + } + return 1.0f; // nothing ventured, nothing failed (glass-is-half-full angle) + } + + void set_calls_per_iteration(size_t calls_per_iterations) { + calls_per_iteration_ = calls_per_iterations; + } + size_t calls_per_iteration(size_t calls_per_iterations) const { + return calls_per_iteration_; + } + void set_total_iterations(size_t iterations) { total_iterations_ = iterations; } + size_t total_iterations(size_t iterations) const { return total_iterations_; } + + private: + /*! \brief Number of iterations */ + size_t total_iterations_ = 10; + /*! \brief Calls per iteration */ + size_t calls_per_iteration_ = 50; + /*! \brief Raw timing data */ + tuned_timing_t timing_; +}; + +} // namespace tune +} // namespace test +} // namespace mxnet + +#endif // TEST_TUNE_H_ diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index edfa2d0660d6..8347a8a9d7b3 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -44,6 +44,7 @@ extern bool unitTestsWithCuda; extern bool debug_output; extern bool quick_test; extern bool performance_run; +extern bool csv; /*! \brief Pause VTune analysis */ struct VTunePause { @@ -672,16 +673,20 @@ struct less_shapevect { }; inline std::string pretty_num(uint64_t val) { - std::string res, s = std::to_string(val); - size_t ctr = 0; - for (int i = static_cast(s.size()) - 1; i >= 0; --i, ++ctr) { - if (ctr && (ctr % 3) == 0) { - res += ","; + if (!test::csv) { + std::string res, s = std::to_string(val); + size_t ctr = 0; + for (int i = static_cast(s.size()) - 1; i >= 0; --i, ++ctr) { + if (ctr && (ctr % 3) == 0) { + res += ","; + } + res.push_back(s[i]); } - res.push_back(s[i]); + std::reverse(res.begin(), res.end()); + return res; + } else { + return std::to_string(val); } - std::reverse(res.begin(), res.end()); - return res; } /*! \brief Change a value during the scope of this declaration */ diff --git a/tests/cpp/operator/broadcast_perf.cc b/tests/cpp/operator/broadcast_perf.cc index 5edba0b6d15f..57b700af93b6 100644 --- a/tests/cpp/operator/broadcast_perf.cc +++ b/tests/cpp/operator/broadcast_perf.cc @@ -26,20 +26,12 @@ #include #include "../include/test_op_runner.h" #include "../include/test_core_op.h" +#include "../include/test_tune.h" using namespace mxnet; using kwargs_t = test::op::kwargs_t; -/*! - * \brief Generic bidirectional sanity test - */ -TEST(BROADCAST_PERF, ExecuteBidirectional) { - test::op::BasicRunCoreOpBidirectional(false, true, {}, - { {2, 3}, {2, 1} }, - "broadcast_add", "_backward_broadcast_add"); -} - static const std::vector> broadcast_shapes() { std::vector> shapes; if (test::performance_run) { @@ -47,9 +39,10 @@ static const std::vector> broadcast_shapes() { { {28, 28}, {28, 1} }, { {64, 28}, {1, 28} }, { {28, 28, 28}, {28, 28, 1} }, - { {128, 128}, {1, 128} }, + { {128, 256}, {1, 256} }, + { {1024, 256}, {1, 256} }, { {1024, 12, 256}, {1024, 1, 1} }, - { {2560, 1280}, {2560, 1} } + { {2560, 1280, 128}, {2560, 1, 128} } }; } else { shapes = { @@ -62,6 +55,15 @@ static const std::vector> broadcast_shapes() { return std::move(shapes); } +/*! + * \brief Generic bidirectional sanity test + */ +TEST(BROADCAST_PERF, ExecuteBidirectional) { + test::op::BasicRunCoreOpBidirectional(false, true, {}, + { broadcast_shapes()[0] }, + "broadcast_add", "_backward_broadcast_add"); +} + template static void RunCoreOpTimingTest(const bool isGPU, const kwargs_t& op_kwargs, @@ -72,7 +74,7 @@ static void RunCoreOpTimingTest(const bool isGPU, // prime code and cache before the performance runs test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { {2, 3}, {2, 1} }, kwargs, 1); + runner.RunBidirectional(false, { broadcast_shapes()[0] }, kwargs, 1); // Do the performance runs std::vector> shapes = broadcast_shapes(); @@ -87,7 +89,11 @@ static void RunCoreOpTimingTest(const bool isGPU, * \brief ActivationOp timing test for CPU */ TEST(BROADCAST_PERF, TimingCPU) { - RunCoreOpTimingTest(false, {}, "broadcast_add", "_backward_broadcast_add"); + if (!test::csv) { + RunCoreOpTimingTest(false, {}, "broadcast_add", "_backward_broadcast_add"); + } else { + RunCoreOpTimingTest(false, {}, "broadcast_add", COREOP_BWD_OP_NAME_VALUE_NONE); + } } #if MXNET_USE_CUDA == 1 @@ -99,3 +105,85 @@ TEST(BROADCAST_PERF, TimingGPU) { } #endif // MXNET_USE_CUDA == 1 +/*! + * \brief Rune a tuning evaluation + * \tparam DType Data type for which to evaluate tuning + */ +template +static float EvaluateTune(bool verbose = true) { + std::vector> binary_operators; + if (test::performance_run) { + binary_operators = { + {"broadcast_add", COREOP_BWD_OP_NAME_VALUE_NONE}, + {"broadcast_mul", COREOP_BWD_OP_NAME_VALUE_NONE}, + {"broadcast_div", COREOP_BWD_OP_NAME_VALUE_NONE} + }; + } else { + binary_operators = { + {"broadcast_add", COREOP_BWD_OP_NAME_VALUE_NONE} + }; + } + std::vector rates; + for (size_t i = 0, n = binary_operators.size(); i < n; ++i) { + test::tune::TuningTester tuningTester; + tuningTester.set_calls_per_iteration(10); + tuningTester.set_total_iterations(5); + std::cout << "******************************" << std::endl; + std::cout << "Operators: " << binary_operators[i].first << ", " + << binary_operators[i].second << " for type: " << test::type_name() + << std::endl; + std::cout << "******************************" << std::endl; + + // Prime code and cache + test::op::BasicRunCoreOpBidirectional(false, false, {}, + { broadcast_shapes()[0] }, + binary_operators[i].first.c_str(), + binary_operators[i].second.c_str()); + + // Do the performance runs + std::vector> shapes = broadcast_shapes(); + + tuningTester.TestTunedOperator({}, true, shapes, + binary_operators[i].first.c_str(), + binary_operators[i].second.c_str()); + rates.push_back(tuningTester.CalculateSuccessRate({}, verbose)); + } + return std::accumulate(rates.begin(), rates.end(), 0.0f) / rates.size(); +} + +/*! \brief ActivationOp timing test for CPU for float */ +TEST(BROADCAST_PERF, EvaluateTuneTestFloat) { + typedef float DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for double */ +TEST(BROADCAST_PERF, EvaluateTuneTestDouble) { + typedef double DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +TEST(BROADCAST_PERF, EvaluateTuneTestFloat16) { + typedef mshadow::half::half_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for int8_t */ +TEST(BROADCAST_PERF, EvaluateTuneTestInt8) { + typedef uint8_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for int32_t */ +TEST(BROADCAST_PERF, EvaluateTuneTestInt32) { + typedef int32_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for int64_t */ +TEST(BROADCAST_PERF, EvaluateTuneTestInt64) { + typedef int64_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} + diff --git a/tests/cpp/operator/tune/operator_tune_test.cc b/tests/cpp/operator/tune/operator_tune_test.cc new file mode 100644 index 000000000000..5ecb03cc5b53 --- /dev/null +++ b/tests/cpp/operator/tune/operator_tune_test.cc @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include "../../src/operator/activation-inl.h" +#include "../../src/operator/operator_tune-inl.h" +#include "../include/test_op_runner.h" +#include "../include/test_core_op.h" +#include "../include/test_tune.h" + +using namespace mxnet; + +/*! + * \brief ActivationOp timing test for CPU + */ +TEST(OMP_TUNING, ShowAllTunedOps) { + const std::unordered_set& op_names = op::OperatorTune::TunedOperatorNames(); + for (auto iter = op_names.begin(), e_iter = op_names.end(); iter != e_iter; ++iter) { + std::cout << *iter << std::endl; + } +} + +using kwargs_t = test::op::kwargs_t; + +static std::vector> tuning_shapes() { + std::vector> shapes; + if (test::performance_run || test::csv) { + shapes = { + {{1, 1, 28, 28}}, + {{1, 3, 28, 28}}, + {{50, 1, 18, 32}}, + {{25, 3, 64, 64}}, + {{10, 3, 128, 128}}, + {{20, 3, 128, 128}}, + {{30, 3, 128, 128}}, + {{30, 3, 256, 128}}, + }; + } else { + shapes = { + // Non-performance dataset acts as a sanity test + {{1, 1, 28, 28}}, + {{50, 3, 18, 32}} + }; + } + return std::move(shapes); +} + +/*! + * \brief Generic bidirectional sanity test + */ +TEST(OMP_TUNING, ExecuteBidirectional) { + test::op::BasicRunCoreOpBidirectional(false, true, {}, {tuning_shapes()[0]}, + "elemwise_add", "_backward_add"); +} + +/* Some test results: + * AWS c4.8xlarge: + Success rate for type float: 0.90278 + Success rate for type double: 0.88889 + Success rate for type mshadow::half::half_t: 0.83333 + Success rate for type unsigned char: 0.86111 + Success rate for type int: 0.95833 + Success rate for type long: 0.88889 + * desktop: 12-core (6 real CPU cores + hyperthreading) + Success rate for type float: 0.78125 + Success rate for type double: 0.85417 + Success rate for type mshadow::half::half_t: 0.84375 + Success rate for type unsigned char: 0.80208 + Success rate for type int: 0.94444 + Success rate for type long: 1.00000 + */ + +/*! + * \brief Rune a tuning evaluation + * \tparam DType Data type for which to evaluate tuning + */ +template +static float EvaluateTune(const bool verbose = true) { + std::vector> binary_operators; + if (test::csv) { + binary_operators = { + {"elemwise_add", COREOP_BWD_OP_NAME_VALUE_NONE} + }; + } else if (test::performance_run) { + binary_operators = { + {"relu", ""}, // Code can figure out what the backward op is for some + {"sigmoid", ""}, + {"sqrt", ""}, + {"elemwise_add", "_backward_add"}, + {"elemwise_mul", "_backward_mul"}, + {"elemwise_div", "_backward_div"} + }; + } else { + binary_operators = { + {"elemwise_add", "_backward_add"} + }; + } + std::vector rates; + for (size_t i = 0, n = binary_operators.size(); i < n; ++i) { + test::tune::TuningTester tuningTester; + tuningTester.set_calls_per_iteration(10); + tuningTester.set_total_iterations(5); + std::cout << "******************************" << std::endl; + std::cout << "Operators: " << binary_operators[i].first + << ", " << binary_operators[i].second + << " for type: " << test::type_name() + << std::endl; + std::cout << "******************************" << std::endl; + + // Do the performance runs + std::vector> shapes = tuning_shapes(); + + tuningTester.TestTunedOperator({}, verbose, shapes, + binary_operators[i].first.c_str(), + binary_operators[i].second.c_str()); + rates.push_back(tuningTester.CalculateSuccessRate()); + } + return std::accumulate(rates.begin(), rates.end(), 0.0f) / rates.size(); +} + +/*! \brief ActivationOp timing test for CPU for float */ +TEST(OMP_TUNING, EvaluateTuneTestFloat) { + typedef float DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for double */ +TEST(OMP_TUNING, EvaluateTuneTestDouble) { + typedef double DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for float16 */ +TEST(OMP_TUNING, EvaluateTuneTestFloat16) { + typedef mshadow::half::half_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for int8_t */ +TEST(OMP_TUNING, EvaluateTuneTestInt8) { + typedef uint8_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for int32_t */ +TEST(OMP_TUNING, EvaluateTuneTestInt32) { + typedef int32_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} +/*! \brief ActivationOp timing test for CPU for int64_t */ +TEST(OMP_TUNING, EvaluateTuneTestInt64) { + typedef int64_t DType; + const float result = EvaluateTune(); + std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; +} + diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index fff1ca2ebd66..a882b0b21f2c 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -36,7 +36,8 @@ static bool dumpCallback(const google_breakpad::MinidumpDescriptor& descriptor, } #endif -namespace mxnet { namespace test { +namespace mxnet { +namespace test { bool unitTestsWithCuda = false; #ifdef NDEBUG bool debug_output = false; @@ -45,7 +46,9 @@ bool debug_output = false; #endif bool quick_test = false; bool performance_run = false; -}} +bool csv = false; +} // namespace test +} // namespace mxnet #if MXNET_USE_CUDA @@ -90,6 +93,8 @@ int main(int argc, char ** argv) { mxnet::test::debug_output = true; } else if (!strcmp(argv[x], "--perf")) { mxnet::test::performance_run = true; + } else if (!strcmp(argv[x], "--csv")) { + mxnet::test::csv = true; } else if (!strcmp(argv[x], "--quick") || !strcmp(argv[x], "-q")) { mxnet::test::quick_test = true; } From 38a767d5a3fa8d2f54067a855eb08d6d5641feb8 Mon Sep 17 00:00:00 2001 From: Olivier Date: Fri, 17 Nov 2017 16:48:06 -0800 Subject: [PATCH 2/5] local-build openmp as static --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1eb2ad595f9..8548eabf43ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -266,7 +266,9 @@ if(USE_OPENMP) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/openmp/CMakeLists.txt) # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp set(OPENMP_STANDALONE_BUILD TRUE) + set(LIBOMP_ENABLE_SHARED FALSE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/openmp) + list(REMOVE_ITEM mxnet_LINKER_LIBS iomp5) list(APPEND mxnet_LINKER_LIBS omp) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") From 73f3236d274cd9904f4a30c8d319190f119b6831 Mon Sep 17 00:00:00 2001 From: cjolivier01 Date: Mon, 20 Nov 2017 19:07:59 -0800 Subject: [PATCH 3/5] trigger --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8548eabf43ce..2b7aba9d8d8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) -mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON) +mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON AND NOT MSVC) mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) mxnet_option(USE_PROFILER "Build with Profiler support" OFF) From f6e60f98439544fed06d1ad35f0c2c5370971655 Mon Sep 17 00:00:00 2001 From: cjolivier01 Date: Mon, 20 Nov 2017 20:47:01 -0800 Subject: [PATCH 4/5] Somehow broadcast found its way back in, removed again --- src/operator/mxnet_op.h | 3 +- src/operator/operator_tune.cc | 2 - .../tensor/elemwise_binary_broadcast_op.h | 129 ++---------- tests/cpp/operator/broadcast_perf.cc | 189 ------------------ 4 files changed, 16 insertions(+), 307 deletions(-) delete mode 100644 tests/cpp/operator/broadcast_perf.cc diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 7a14477f5701..1d47943082e2 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -428,8 +428,7 @@ struct Kernel { inline static void LaunchEx(mshadow::Stream *s, const int N, Args... args) { #ifdef _OPENMP const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - if (omp_threads < 2 || !tuned_op::UseOMP( - N, omp_threads, args...)) { + if (omp_threads < 2) { OP::Map(0, N, args...); } else { const int length = (N + omp_threads - 1) / omp_threads; diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index 9b02b94e790f..525a66b6f824 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -198,8 +198,6 @@ struct static_init_var { #define IMPLEMENT_CUSTOM_WORKLOAD_FWD(__op$) \ MSHADOW_MACRO_FOREACH_TYPE(_IMPLEMENT_CUSTOM_WORKLOAD_FWD, __op$) -IMPLEMENT_CUSTOM_WORKLOAD_FWD(mxnet::op::mxnet_op::tunable_binary_broadcast_kernel); // NOLINT() - /*! * \brief Tuning data and default weights in the case that MXNET_ENABLE_OPERATOR_AUTOTUNE is set * to zero (thus turning off auto-tuning) diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 75e3a4d610ad..2317c982853d 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -25,19 +25,16 @@ #ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_H_ #define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_BROADCAST_OP_H_ -#include #include #include #include #include #include -#include #include "../mshadow_op.h" #include "../elemwise_op_common.h" #include "./elemwise_binary_op.h" #include "../operator_common.h" #include "broadcast_reduce-inl.h" -#include "../mxnet_op.h" namespace mxnet { namespace op { @@ -138,122 +135,24 @@ inline int BinaryBroadcastShapeCompact(const TShape& lshape, const TShape& rshap } namespace mxnet_op { - -/*! - * \brief Type-level specialization base class for binary_broadcast_kernel - * \tparam DType - */ -template struct tunable_binary_broadcast_kernel; - -template -struct binary_broadcast_kernel : public tunable_binary_broadcast_kernel { - /*! \brief Map function for binary_broadcast_kernel */ +template +struct binary_broadcast_kernel { MSHADOW_XINLINE static void Map(int base, int length, OpReqType req, const Shape& lstride, const Shape& rstride, const Shape& oshape, DType* lhs, DType* rhs, - DType* out) { - if (req != kNullOp) { + DType* out, int lsize, int rsize) { Shape coord = unravel(base, oshape); - auto lidx = static_cast(dot(coord, lstride)); - auto ridx = static_cast(dot(coord, rstride)); - if (!is_tuning) { - KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); - } + index_t lidx = dot(coord, lstride); + index_t ridx = dot(coord, rstride); + KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx])); // starts from 1 to avoid extra inc at end of loop for (int i = 1; i < length; ++i) { inc(&coord, oshape, &lidx, lstride, &ridx, rstride); - // When tuning, don't actually run the op, since it's not going to be tuned against - // the actual op we'll eventually be using - if (!is_tuning) { - KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); - } + KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs[lidx], rhs[ridx])); } } - } - - /*! - * \brief Decide whether to use OpenMP parallelization - * \tparam Args Variable number and types of arguments (passed same args as Map()) - * \param N Number of iterations - * \param thread_count Number of OMP threads available - * \param req Req type (i.e. kNullOp, kWriteTo, kAddTo, etc) - * \param args remaining (unused) arguments - * \return true if OMP parallelization should be used for the N iterations - */ - template - static bool UseOMP(const size_t N, const size_t thread_count, OpReqType req, Args... args) { - if (req != kNullOp) { - switch (OperatorTuneByType::tuning_mode()) { - case tune::kAuto: { - CHECK_GT(thread_count, 0) << "Invalid thread count: " << thread_count; - const uint64_t length = (N + thread_count - 1) / thread_count; - - float wl = tuned_op, DType>::workload_ex_[0] - + tuned_op, DType>::workload_ex_[1] - * length; - // OP::Map() is called 'length' times for each map call - // Get actual price per OP by removing overhead, such as subtracting workload of - // a trivial operation such as set_zero - int64_t subop_actual_workload = - tuned_op::workload_ - tuned_op::workload_; - if (subop_actual_workload < 0) { - subop_actual_workload = 1; - } - wl += 0.75f * subop_actual_workload * length; - return OperatorTuneByType::IsOMPFaster(N, thread_count, static_cast(wl)); - } - case tune::kAlwaysOMP: - return true; - case tune::kNeverOMP: - default: - return false; - } - } - return false; - } }; -/*! - * \brief Type-specific tuning - * \tparam DType - */ -template -struct tunable_binary_broadcast_kernel { - /*! \brief Allows LaunchEx to know the data type for tuning_op<> selection */ - typedef DType DataType; - /*! - * \brief Run-time tuning of sub-op-independent binary_broadcast_kernel - */ - static void Tune() { - constexpr int dim = 2; // Have to pick one to represent all - Shape oshape, lstride, rstride; - for (index_t i = 0; i < dim; ++i) { - oshape[i] = 28U * (i + 1); - lstride[i] = 2U * (i + 1); - rstride[i] = 3U * (i + 1); - } - const int base = 28; - const size_t data_size = lstride.Size() * rstride.Size() * oshape.Size(); - std::unique_ptr data(new DType[data_size]); - memset(data.get(), 0, data_size); // get into cache - tuned_op, DType>::workload_ex_.push_back( - get_workload([&]() { binary_broadcast_kernel<2, DType, mshadow_op::left, true>::Map( - base, 1, kWriteTo, lstride, rstride, oshape, data.get(), data.get(), nullptr); - })); - tuned_op, DType>::workload_ex_.push_back( - get_workload([&]() { binary_broadcast_kernel<2, DType, mshadow_op::left, true>::Map( - base, 1000, kWriteTo, lstride, rstride, oshape, data.get(), data.get(), nullptr); - })); - // Record base time for function - tuned_op, DType>::workload_ex_[1] -= - tuned_op, DType>::workload_ex_[0]; - // Record per-length item adder - tuned_op, DType>::workload_ex_[1] /= 1000; - if (tuned_op, DType>::workload_ex_[1] <= 0) { - tuned_op, DType>::workload_ex_[1] = 1; - } - } -}; } // namespace mxnet_op template @@ -262,6 +161,7 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + using namespace mxnet_op; TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_, &new_lshape, &new_rshape, &new_oshape); @@ -271,12 +171,13 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs, mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { BROADCAST_NDIM_SWITCH(ndim, NDim, { - mshadow::Shape oshape = new_oshape.get(); - mshadow::Shape lstride = mxnet_op::calc_stride(new_lshape.get()); - mshadow::Shape rstride = mxnet_op::calc_stride(new_rshape.get()); - mxnet_op::Kernel, xpu>:: - template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape, - inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr()); + Shape oshape = new_oshape.get(); + Shape lstride = calc_stride(new_lshape.get()); + Shape rstride = calc_stride(new_rshape.get()); + Kernel, xpu>::LaunchEx( + s, new_oshape.Size(), req[0], lstride, rstride, oshape, + inputs[0].dptr(), inputs[1].dptr(), outputs[0].dptr(), + inputs[0].Size(), inputs[1].Size()); }); }); } diff --git a/tests/cpp/operator/broadcast_perf.cc b/tests/cpp/operator/broadcast_perf.cc deleted file mode 100644 index 57b700af93b6..000000000000 --- a/tests/cpp/operator/broadcast_perf.cc +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file broadcast_perf.cc - * \brief Perf/profile run of broadcast kernel - * \author Chris Olivier - */ -#include -#include -#include "../include/test_op_runner.h" -#include "../include/test_core_op.h" -#include "../include/test_tune.h" - -using namespace mxnet; - -using kwargs_t = test::op::kwargs_t; - -static const std::vector> broadcast_shapes() { - std::vector> shapes; - if (test::performance_run) { - shapes = { - { {28, 28}, {28, 1} }, - { {64, 28}, {1, 28} }, - { {28, 28, 28}, {28, 28, 1} }, - { {128, 256}, {1, 256} }, - { {1024, 256}, {1, 256} }, - { {1024, 12, 256}, {1024, 1, 1} }, - { {2560, 1280, 128}, {2560, 1, 128} } - }; - } else { - shapes = { - // Non-performance dataset acts as a sanity test - { {28, 28}, {28, 1} }, - { {128, 128}, {128, 1} }, - { {28, 28, 28}, {28, 28, 1} } - }; - } - return std::move(shapes); -} - -/*! - * \brief Generic bidirectional sanity test - */ -TEST(BROADCAST_PERF, ExecuteBidirectional) { - test::op::BasicRunCoreOpBidirectional(false, true, {}, - { broadcast_shapes()[0] }, - "broadcast_add", "_backward_broadcast_add"); -} - -template -static void RunCoreOpTimingTest(const bool isGPU, - const kwargs_t& op_kwargs, - const char *op_name, - const char *backward_op_name = "") { - const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( - op_kwargs, op_name, backward_op_name); - - // prime code and cache before the performance runs - test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { broadcast_shapes()[0] }, kwargs, 1); - - // Do the performance runs - std::vector> shapes = broadcast_shapes(); - const char *pu = isGPU ? "GPU" : "CPU"; - for (const std::vector &shape : shapes) { - runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, - 2, 10, shape); - } -} - -/*! - * \brief ActivationOp timing test for CPU - */ -TEST(BROADCAST_PERF, TimingCPU) { - if (!test::csv) { - RunCoreOpTimingTest(false, {}, "broadcast_add", "_backward_broadcast_add"); - } else { - RunCoreOpTimingTest(false, {}, "broadcast_add", COREOP_BWD_OP_NAME_VALUE_NONE); - } -} - -#if MXNET_USE_CUDA == 1 -/*! - * \brief ActivationOp timing test for GPU - */ -TEST(BROADCAST_PERF, TimingGPU) { - RunCoreOpTimingTest(true, {}, "broadcast_add", "_backward_broadcast_add"); -} -#endif // MXNET_USE_CUDA == 1 - -/*! - * \brief Rune a tuning evaluation - * \tparam DType Data type for which to evaluate tuning - */ -template -static float EvaluateTune(bool verbose = true) { - std::vector> binary_operators; - if (test::performance_run) { - binary_operators = { - {"broadcast_add", COREOP_BWD_OP_NAME_VALUE_NONE}, - {"broadcast_mul", COREOP_BWD_OP_NAME_VALUE_NONE}, - {"broadcast_div", COREOP_BWD_OP_NAME_VALUE_NONE} - }; - } else { - binary_operators = { - {"broadcast_add", COREOP_BWD_OP_NAME_VALUE_NONE} - }; - } - std::vector rates; - for (size_t i = 0, n = binary_operators.size(); i < n; ++i) { - test::tune::TuningTester tuningTester; - tuningTester.set_calls_per_iteration(10); - tuningTester.set_total_iterations(5); - std::cout << "******************************" << std::endl; - std::cout << "Operators: " << binary_operators[i].first << ", " - << binary_operators[i].second << " for type: " << test::type_name() - << std::endl; - std::cout << "******************************" << std::endl; - - // Prime code and cache - test::op::BasicRunCoreOpBidirectional(false, false, {}, - { broadcast_shapes()[0] }, - binary_operators[i].first.c_str(), - binary_operators[i].second.c_str()); - - // Do the performance runs - std::vector> shapes = broadcast_shapes(); - - tuningTester.TestTunedOperator({}, true, shapes, - binary_operators[i].first.c_str(), - binary_operators[i].second.c_str()); - rates.push_back(tuningTester.CalculateSuccessRate({}, verbose)); - } - return std::accumulate(rates.begin(), rates.end(), 0.0f) / rates.size(); -} - -/*! \brief ActivationOp timing test for CPU for float */ -TEST(BROADCAST_PERF, EvaluateTuneTestFloat) { - typedef float DType; - const float result = EvaluateTune(); - std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; -} -/*! \brief ActivationOp timing test for CPU for double */ -TEST(BROADCAST_PERF, EvaluateTuneTestDouble) { - typedef double DType; - const float result = EvaluateTune(); - std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; -} -TEST(BROADCAST_PERF, EvaluateTuneTestFloat16) { - typedef mshadow::half::half_t DType; - const float result = EvaluateTune(); - std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; -} -/*! \brief ActivationOp timing test for CPU for int8_t */ -TEST(BROADCAST_PERF, EvaluateTuneTestInt8) { - typedef uint8_t DType; - const float result = EvaluateTune(); - std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; -} -/*! \brief ActivationOp timing test for CPU for int32_t */ -TEST(BROADCAST_PERF, EvaluateTuneTestInt32) { - typedef int32_t DType; - const float result = EvaluateTune(); - std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; -} -/*! \brief ActivationOp timing test for CPU for int64_t */ -TEST(BROADCAST_PERF, EvaluateTuneTestInt64) { - typedef int64_t DType; - const float result = EvaluateTune(); - std::cout << "Success rate for type " << test::type_name() << ": " << result << std::endl; -} - From 8c90e39066c9d150801d5523e1b0965012c32cd2 Mon Sep 17 00:00:00 2001 From: cjolivier01 Date: Mon, 20 Nov 2017 22:27:37 -0800 Subject: [PATCH 5/5] Trigger rebuild