Kernel operator tuning (apache#8686)
* Refreshed branch bc_tune

* local-build openmp as static

* trigger

* Somehow broadcast found its way back in, removed again

* Trigger rebuild
cjolivier01 authored and eric-haibin-lin committed Dec 3, 2017
1 parent 6b29e6f commit 84c29d9
Showing 17 changed files with 2,212 additions and 170 deletions.
24 changes: 21 additions & 3 deletions CMakeLists.txt
@@ -35,6 +35,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC)
mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF)
mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON AND NOT MSVC)
mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON)
mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON)
mxnet_option(USE_PROFILER "Build with Profiler support" OFF)
@@ -143,6 +144,8 @@ if(USE_MKL_IF_AVAILABLE)
if(NOT MSVC)
list(APPEND mxnet_LINKER_LIBS dl)
endif()
# If using MKL, use the Intel OMP libraries
list(APPEND mxnet_LINKER_LIBS iomp5)
if(USE_MKL_EXPERIMENTAL)
add_definitions(-DMKL_EXPERIMENTAL=1)
else()
@@ -260,11 +263,22 @@ endif()
# ---[ OpenMP
if(USE_OPENMP)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/openmp/CMakeLists.txt)
# Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp
set(OPENMP_STANDALONE_BUILD TRUE)
set(LIBOMP_ENABLE_SHARED FALSE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/openmp)
list(REMOVE_ITEM mxnet_LINKER_LIBS iomp5)
list(APPEND mxnet_LINKER_LIBS omp)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
else()
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
endif()
elseif(UNIX)
list(APPEND mxnet_LINKER_LIBS pthread)
@@ -353,6 +367,10 @@ if(USE_PLUGINS_WARPCTC)
list(APPEND CUDA ${PLUGINS_CUSRC})
endif()

if(USE_OPERATOR_TUNING)
add_definitions(-DMXNET_USE_OPERATOR_TUNING=1)
endif()

if(USE_PLUGIN_CAFFE)
if(NOT USE_CUDA)
set(CPU_ONLY ON)
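Note: the USE_OPERATOR_TUNING option added above only turns into a compile definition (MXNET_USE_OPERATOR_TUNING=1); the tuning logic itself lives in the C++ sources (mxnet_op.h and operator_tune.cc in this commit). As a minimal sketch of the kind of dispatch such a guard enables — every name here (LaunchCPU, use_omp_for, tuned_omp_threshold, square) is hypothetical and not taken from this commit — a CPU element-wise kernel could choose between an OpenMP loop and a serial loop based on a tuned workload threshold:

#include <cstddef>
#include <vector>

#ifndef MXNET_USE_OPERATOR_TUNING
#define MXNET_USE_OPERATOR_TUNING 0   // off unless the build defines it
#endif

// Hypothetical stand-in for the auto-tuned "is OpenMP worth it for n elements?" decision.
inline bool use_omp_for(std::size_t n) {
  const std::size_t tuned_omp_threshold = 1 << 14;  // placeholder for a tuned value
  return n >= tuned_omp_threshold;
}

// Hypothetical CPU launcher for an element-wise op in the style of the mshadow_op structs.
template <typename DType, typename OP>
void LaunchCPU(DType *out, const DType *in, std::size_t n) {
#if MXNET_USE_OPERATOR_TUNING
  if (use_omp_for(n)) {
    #pragma omp parallel for                     // ignored when built without OpenMP
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(n); ++i) {
      out[i] = OP::Map(in[i]);
    }
    return;
  }
#endif
  // Serial fallback when tuning is disabled or predicts OpenMP will not pay off.
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = OP::Map(in[i]);
  }
}

struct square {  // toy op with the same Map() shape as the mshadow_op operators
  template <typename DType>
  static DType Map(DType a) { return a * a; }
};

int main() {
  std::vector<float> in(1 << 16, 2.0f), out(in.size());
  LaunchCPU<float, square>(out.data(), in.data(), in.size());
  return out.front() == 4.0f ? 0 : 1;
}

Built with -DMXNET_USE_OPERATOR_TUNING=1 and an OpenMP-enabled compiler, the guarded branch is compiled in; otherwise only the serial path remains, which mirrors what the new CMake and Makefile switches control.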
4 changes: 4 additions & 0 deletions Makefile
@@ -131,6 +131,10 @@ ifeq ($(USE_MKL2017), 1)
LDFLAGS += -liomp5
endif

ifeq ($(USE_OPERATOR_TUNING), 1)
CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1
endif

# verify existence of separate lapack library when using blas/openblas/atlas
# switch off lapack support in case it can't be found
# issue covered with this
6 changes: 6 additions & 0 deletions make/config.mk
@@ -153,6 +153,12 @@ LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
# sudo apt-get install -y libcurl4-openssl-dev
USE_S3 = 0

#----------------------------
# performance settings
#----------------------------
# Use operator tuning
USE_OPERATOR_TUNING = 1

# Use gperftools if found
USE_GPERFTOOLS = 1

94 changes: 69 additions & 25 deletions src/operator/mshadow_op.h
@@ -30,6 +30,7 @@
#include "math.h"
#include "math_functions-inl.h"
#include "special_functions-inl.h"
#include "./mxnet_op.h"

#ifdef __CUDACC__
#include <cuda_fp16.h>
@@ -39,6 +40,24 @@ namespace mxnet {
namespace op {
namespace mshadow_op {

/*!
* \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD' macro outside of the mshadow_op namespace
* See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD'
*
* \note An entry for the operator must also be added in operator_tune.cc, which will register it
* for auto-tuning and also hold its workload weight
*/
#define MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(__op$) \
} MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*)
/*!
* \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD' macro outside of the mshadow_op namespace
* See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD'
*
* \note An entry for the operator must also be added in operator_tune.cc, which will register it
* for auto-tuning and also hold its workload weight
*/
#define MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(__op$) \
} MXNET_TUNABLE_MSHADOW_OP_BACKWARD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*)
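// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: how a new operator would use the
// macros above. The operator name 'cube' and the exact registration macro shown
// for operator_tune.cc are assumptions for illustration only.
//
//   // In mshadow_op.h, the math-op macros below expand to the struct plus the
//   // matching MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(cube):
//   MXNET_UNARY_MATH_OP_NC(cube, a * a * a);
//
//   // In operator_tune.cc, an entry registers the operator for auto-tuning and
//   // holds its workload weight (registration macro name assumed):
//   IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::cube);
//
// Note that the declare-tunable macros close the mshadow_op namespace, invoke
// the registration at the enclosing mxnet::op scope, and immediately reopen
// mshadow_op, which is why they must be used at namespace scope.
// ---------------------------------------------------------------------------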
#ifdef __CUDA_ARCH__
__constant__ const float PI = 3.14159265358979323846;
#else
@@ -49,36 +68,41 @@ using std::enable_if;
using std::is_unsigned;

#define MXNET_UNARY_MATH_OP(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return DType(expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return DType(expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)


#define MXNET_UNARY_MATH_OP_NC(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return (expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return (expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_BINARY_MATH_OP(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return DType(expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return DType(expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_BINARY_MATH_OP_NC(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return (expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return (expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_SIMPLE_UNARY_MATH_OP(name) MXNET_UNARY_MATH_OP(name, math::name(a))

@@ -134,6 +158,7 @@ struct softrelu {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(softrelu)

MXNET_UNARY_MATH_OP(softrelu_grad, -math::expm1(-a));

@@ -154,6 +179,7 @@ struct log10_grad {
return DType(0.4342944819f / static_cast<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log10_grad)

template<>
MSHADOW_XINLINE double log10_grad::Map<double>(double a) {
@@ -169,6 +195,7 @@ struct log2_grad {
return DType(1.442695041f / static_cast<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log2_grad)

template<>
MSHADOW_XINLINE double log2_grad::Map<double>(double a) {
@@ -263,6 +290,7 @@ struct sign {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(sign)

MXNET_UNARY_MATH_OP_NC(sign_grad, DType(0));

@@ -333,6 +361,7 @@ struct rint {
return DType((af - floor) <= (ceil - af) ? floor : ceil);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rint)

/*! \brief used to round number to integer nearest to 0 */
struct fix {
@@ -343,6 +372,7 @@ struct fix {
return DType((floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? floor : ceil);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(fix)

/*! \brief used for generate gradient of MAE loss*/
MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? DType(1) : -DType(1));
@@ -405,6 +435,7 @@ struct mod {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(mod)

template<>
MSHADOW_XINLINE mshadow::half::half2_t mod::Map<mshadow::half::half2_t>
@@ -419,6 +450,8 @@ struct mod_grad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_grad)

template<>
MSHADOW_XINLINE double mod_grad::Map<double>(double a, double b) {
return 1.0;
@@ -454,6 +487,8 @@ struct mod_rgrad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_rgrad)

template<>
MSHADOW_XINLINE double mod_rgrad::Map<double>(double a, double b) {
return -::floor(a/b);
@@ -517,6 +552,7 @@ struct rmod {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rmod)

template<>
MSHADOW_XINLINE mshadow::half::half2_t rmod::Map<mshadow::half::half2_t>
@@ -531,6 +567,8 @@ struct rmod_grad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(rmod_grad)

template<>
MSHADOW_XINLINE double rmod_grad::Map<double>(double a, double b) {
return -::floor(b/a);
@@ -572,6 +610,7 @@ struct clip {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(clip)

/***** gamma ******/

@@ -585,6 +624,7 @@ struct gamma_grad {
return DType(math::tgamma(af) * special_functions::cephes::psi<float>(af));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gamma_grad)

template<>
MSHADOW_XINLINE double gamma_grad::Map<double>(double a) {
@@ -602,6 +642,7 @@ struct gammaln_grad {
return DType(special_functions::cephes::psi<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gammaln_grad)

template<>
MSHADOW_XINLINE double gammaln_grad::Map<double>(double a) {
@@ -633,6 +674,7 @@ struct smooth_l1_loss {
}
}
}; // struct smooth_l1_loss
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(smooth_l1_loss)

/* The derivative of smooth l1 loss is
* f'(x) = sigma^2 * x, |x| < 1 / sigma^2
@@ -654,6 +696,7 @@ struct smooth_l1_gradient {
}
}
}; // struct smooth_l1_derivative
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(smooth_l1_gradient)

/*! \brief product reducer */
struct product {
@@ -755,6 +798,7 @@ struct nansum_grad {
return isnan_typed::IsNan(a) ? DType(0) : DType(1);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nansum_grad)

/*! \brief product reducer that ignores NaN values in the input */
struct nanprod {
@@ -791,7 +835,7 @@ struct nanprod_grad {
return isnan_typed::IsNan(a) ? DType(0) : b / a;
}
};

MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nanprod_grad)
} // namespace mshadow_op
} // namespace op
} // namespace mxnet