
Commit

Refreshed branch bc_tune
Olivier committed Nov 18, 2017
1 parent 3107326 commit c8446f1
Showing 21 changed files with 2,429 additions and 100 deletions.
22 changes: 19 additions & 3 deletions CMakeLists.txt
@@ -35,6 +35,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC)
mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF)
mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON)
mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON)
mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON)
mxnet_option(USE_PROFILER "Build with Profiler support" OFF)
@@ -143,6 +144,8 @@ if(USE_MKL_IF_AVAILABLE)
if(NOT MSVC)
list(APPEND mxnet_LINKER_LIBS dl)
endif()
# If using MKL, use the Intel OMP libraries
list(APPEND mxnet_LINKER_LIBS iomp5)
if(USE_MKL_EXPERIMENTAL)
add_definitions(-DMKL_EXPERIMENTAL=1)
else()
@@ -260,11 +263,20 @@ endif()
# ---[ OpenMP
if(USE_OPENMP)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/openmp/CMakeLists.txt)
# Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp
set(OPENMP_STANDALONE_BUILD TRUE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/openmp)
list(APPEND mxnet_LINKER_LIBS omp)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
else()
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
endif()
elseif(UNIX)
list(APPEND mxnet_LINKER_LIBS pthread)
@@ -353,6 +365,10 @@ if(USE_PLUGINS_WARPCTC)
list(APPEND CUDA ${PLUGINS_CUSRC})
endif()

if(USE_OPERATOR_TUNING)
add_definitions(-DMXNET_USE_OPERATOR_TUNING=1)
endif()

if(USE_PLUGIN_CAFFE)
if(NOT USE_CUDA)
set(CPU_ONLY ON)
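The USE_OPERATOR_TUNING CMake option (and the matching Makefile flag below) does one thing at build time: it defines MXNET_USE_OPERATOR_TUNING=1. A minimal sketch of how C++ code can gate on that definition, assuming a hypothetical ShouldUseOmp() helper; the real tuning entry points live in the operator-tuning sources (operator_tune.cc is referenced later in this diff), not here:

#if MXNET_USE_OPERATOR_TUNING
  // Tuning enabled: ask the (hypothetical) tuner whether OpenMP pays off
  // for this element count before spawning threads.
  const bool use_omp = ShouldUseOmp(element_count);
#else
  // Tuning disabled: keep the previous behaviour of always parallelizing
  // eligible loops.
  const bool use_omp = true;
#endif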
4 changes: 4 additions & 0 deletions Makefile
@@ -131,6 +131,10 @@ ifeq ($(USE_MKL2017), 1)
LDFLAGS += -liomp5
endif

ifeq ($(USE_OPERATOR_TUNING), 1)
CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1
endif

# verify existence of separate lapack library when using blas/openblas/atlas
# switch off lapack support in case it can't be found
# issue covered with this
6 changes: 6 additions & 0 deletions make/config.mk
@@ -153,6 +153,12 @@ LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
# sudo apt-get install -y libcurl4-openssl-dev
USE_S3 = 0

#----------------------------
# performance settings
#----------------------------
# Use operator tuning
USE_OPERATOR_TUNING = 1

# Use gperftools if found
USE_GPERFTOOLS = 1

4 changes: 1 addition & 3 deletions src/engine/openmp.cc
@@ -30,7 +30,7 @@ namespace engine {
#endif

static inline bool is_env_set(const char *var) {
return dmlc::GetEnv(var, INT_MIN) == INT_MIN;
return dmlc::GetEnv(var, INT_MIN) != INT_MIN;
}

OpenMP *OpenMP::Get() {
@@ -55,8 +55,6 @@ OpenMP::OpenMP()
omp_thread_max_ = omp_get_max_threads();
}
}
omp_set_nested(dmlc::GetEnv("OMP_NESTED", false));
omp_set_dynamic(dmlc::GetEnv("OMP_DYNAMIC", false));
#else
enabled_ = false;
omp_thread_max_ = 1;
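The one-character fix above corrects an inverted test: dmlc::GetEnv returns the supplied default (INT_MIN here) only when the variable is absent, so a result different from the default means the variable was set. A standalone sketch of the intended semantics, written against plain getenv instead of dmlc::GetEnv:

#include <cstdlib>

// The helper answers "did the user set this environment variable?",
// regardless of the value it was set to.
static inline bool is_env_set(const char *var) {
  return std::getenv(var) != nullptr;
}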
2 changes: 1 addition & 1 deletion src/engine/threaded_engine_perdevice.cc
@@ -129,7 +129,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
}
}
} else {
CHECK_EQ(ctx.dev_mask(), gpu::kDevMask);
CHECK_EQ(ctx.dev_mask(), Context::kGPU);
// GPU execution.
FnProperty prop = opr_block->opr->prop;
bool is_copy = (prop == FnProperty::kCopyFromGPU ||
2 changes: 1 addition & 1 deletion src/io/image_io.cc
@@ -178,7 +178,7 @@ void Imdecode(const nnvm::NodeAttrs& attrs,
#if MXNET_USE_OPENCV
const auto& param = nnvm::get<ImdecodeParam>(attrs.parsed);

CHECK_EQ(inputs[0].ctx().dev_mask(), cpu::kDevMask) << "Only supports cpu input";
CHECK_EQ(inputs[0].ctx().dev_mask(), Context::kCPU) << "Only supports cpu input";
CHECK_EQ(inputs[0].dtype(), mshadow::kUint8) << "Input needs to be uint8 buffer";
inputs[0].WaitToRead();

94 changes: 69 additions & 25 deletions src/operator/mshadow_op.h
@@ -29,6 +29,7 @@
#include "math.h"
#include "math_functions-inl.h"
#include "special_functions-inl.h"
#include "./mxnet_op.h"

#ifdef __CUDACC__
#include <cuda_fp16.h>
@@ -38,6 +39,24 @@ namespace mxnet {
namespace op {
namespace mshadow_op {

/*!
* \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD' macro outside of the mshadow_op namespace
* See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD'
*
* \note An entry for the operator must also be added in operator_tune.cc, which will register it
* for auto-tuning and also hold its workload weight
*/
#define MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(__op$) \
} MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*)
/*!
* \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD' macro outside of the mshadow_op namespace
* See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD'
*
* \note An entry for the operator must also be added in operator_tune.cc, which will register it
* for auto-tuning and also hold its workload weight
*/
#define MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(__op$) \
} MXNET_TUNABLE_MSHADOW_OP_BACKWARD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*)
#ifdef __CUDA_ARCH__
__constant__ const float PI = 3.14159265358979323846;
#else
@@ -48,36 +67,41 @@ using std::enable_if;
using std::is_unsigned;

#define MXNET_UNARY_MATH_OP(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return DType(expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return DType(expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)


#define MXNET_UNARY_MATH_OP_NC(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return (expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return (expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_BINARY_MATH_OP(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return DType(expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return DType(expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_BINARY_MATH_OP_NC(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return (expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return (expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_SIMPLE_UNARY_MATH_OP(name) MXNET_UNARY_MATH_OP(name, math::name(a))
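With MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name) appended to each of these macros, every operator declared through them is automatically marked as tunable. A hedged illustration with a made-up operator (the name and expression are hypothetical, and a real operator would still need its operator_tune.cc entry as noted above):

// Hypothetical unary op: declares struct scaled_square whose Map() returns
// DType(2.0f * a * a), then marks it tunable for forward and backward passes.
MXNET_UNARY_MATH_OP(scaled_square, 2.0f * a * a);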

Expand Down Expand Up @@ -133,6 +157,7 @@ struct softrelu {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(softrelu)

MXNET_UNARY_MATH_OP(softrelu_grad, -math::expm1(-a));

Expand All @@ -153,6 +178,7 @@ struct log10_grad {
return DType(0.4342944819f / static_cast<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log10_grad)

template<>
MSHADOW_XINLINE double log10_grad::Map<double>(double a) {
@@ -168,6 +194,7 @@ struct log2_grad {
return DType(1.442695041f / static_cast<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log2_grad)

template<>
MSHADOW_XINLINE double log2_grad::Map<double>(double a) {
@@ -262,6 +289,7 @@ struct sign {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(sign)

MXNET_UNARY_MATH_OP_NC(sign_grad, DType(0));

@@ -332,6 +360,7 @@ struct rint {
return DType((af - floor) <= (ceil - af) ? floor : ceil);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rint)

/*! \brief used to round number to integer nearest to 0 */
struct fix {
@@ -342,6 +371,7 @@ struct fix {
return DType((floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? floor : ceil);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(fix)

/*! \brief used for generate gradient of MAE loss*/
MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? DType(1) : -DType(1));
@@ -404,6 +434,7 @@ struct mod {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(mod)

template<>
MSHADOW_XINLINE mshadow::half::half2_t mod::Map<mshadow::half::half2_t>
@@ -418,6 +449,8 @@ struct mod_grad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_grad)

template<>
MSHADOW_XINLINE double mod_grad::Map<double>(double a, double b) {
return 1.0;
@@ -453,6 +486,8 @@ struct mod_rgrad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_rgrad)

template<>
MSHADOW_XINLINE double mod_rgrad::Map<double>(double a, double b) {
return -::floor(a/b);
@@ -516,6 +551,7 @@ struct rmod {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rmod)

template<>
MSHADOW_XINLINE mshadow::half::half2_t rmod::Map<mshadow::half::half2_t>
@@ -530,6 +566,8 @@ struct rmod_grad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(rmod_grad)

template<>
MSHADOW_XINLINE double rmod_grad::Map<double>(double a, double b) {
return -::floor(b/a);
@@ -571,6 +609,7 @@ struct clip {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(clip)

/***** gamma ******/

@@ -584,6 +623,7 @@ struct gamma_grad {
return DType(math::tgamma(af) * special_functions::cephes::psi<float>(af));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gamma_grad)

template<>
MSHADOW_XINLINE double gamma_grad::Map<double>(double a) {
@@ -601,6 +641,7 @@ struct gammaln_grad {
return DType(special_functions::cephes::psi<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gammaln_grad)

template<>
MSHADOW_XINLINE double gammaln_grad::Map<double>(double a) {
@@ -632,6 +673,7 @@ struct smooth_l1_loss {
}
}
}; // struct smooth_l1_loss
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(smooth_l1_loss)

/* The derivative of smooth l1 loss is
* f'(x) = sigma^2 * x, |x| < 1 / sigma^2
@@ -653,6 +695,7 @@ struct smooth_l1_gradient {
}
}
}; // struct smooth_l1_derivative
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(smooth_l1_gradient)

/*! \brief product reducer */
struct product {
@@ -754,6 +797,7 @@ struct nansum_grad {
return isnan_typed::IsNan(a) ? DType(0) : DType(1);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nansum_grad)

/*! \brief product reducer that ignores NaN values in the input */
struct nanprod {
@@ -790,7 +834,7 @@ struct nanprod_grad {
return isnan_typed::IsNan(a) ? DType(0) : b / a;
}
};

MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nanprod_grad)
} // namespace mshadow_op
} // namespace op
} // namespace mxnet
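The comments at the top of mshadow_op.h note that each tunable operator also needs an entry in operator_tune.cc, which registers it for auto-tuning and holds its workload weight. That file's diff is not among the sections shown above; purely as an illustration, such an entry might look like the following (the macro name is an assumption, not taken from this diff):

// Hypothetical registration: ties mshadow_op::softrelu to a measured workload
// weight that the tuner weighs against the cost of spinning up OpenMP threads.
IMPLEMENT_UNARY_WORKLOAD_FWD(mshadow_op::softrelu);  // macro name assumed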
(Diffs for the remaining changed files in this commit are not shown.)
