Kernel operator tuning (apache#8686)
* Refreshed branch bc_tune

* local-build openmp as static

* trigger

* Somehow broadcast found its way back in, removed again

* Trigger rebuild
cjolivier01 authored and eric-haibin-lin committed Dec 3, 2017
1 parent 6b29e6f commit 84c29d9
Showing 17 changed files with 2,212 additions and 170 deletions.
24 changes: 21 additions & 3 deletions CMakeLists.txt
@@ -35,6 +35,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC)
mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF)
mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON AND NOT MSVC)
mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON)
mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON)
mxnet_option(USE_PROFILER "Build with Profiler support" OFF)
@@ -143,6 +144,8 @@ if(USE_MKL_IF_AVAILABLE)
if(NOT MSVC)
list(APPEND mxnet_LINKER_LIBS dl)
endif()
# If using MKL, use the Intel OMP libraries
list(APPEND mxnet_LINKER_LIBS iomp5)
if(USE_MKL_EXPERIMENTAL)
add_definitions(-DMKL_EXPERIMENTAL=1)
else()
@@ -260,11 +263,22 @@ endif()
# ---[ OpenMP
if(USE_OPENMP)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/openmp/CMakeLists.txt)
# Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp
set(OPENMP_STANDALONE_BUILD TRUE)
set(LIBOMP_ENABLE_SHARED FALSE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/openmp)
list(REMOVE_ITEM mxnet_LINKER_LIBS iomp5)
list(APPEND mxnet_LINKER_LIBS omp)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
else()
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
endif()
elseif(UNIX)
list(APPEND mxnet_LINKER_LIBS pthread)
@@ -353,6 +367,10 @@ if(USE_PLUGINS_WARPCTC)
list(APPEND CUDA ${PLUGINS_CUSRC})
endif()

if(USE_OPERATOR_TUNING)
add_definitions(-DMXNET_USE_OPERATOR_TUNING=1)
endif()

if(USE_PLUGIN_CAFFE)
if(NOT USE_CUDA)
set(CPU_ONLY ON)
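Note: the USE_OPERATOR_TUNING option added above only turns into a compile definition (MXNET_USE_OPERATOR_TUNING=1); the tuning logic itself lives in the C++ sources (mxnet_op.h and operator_tune.cc in this commit). As a minimal sketch of the kind of dispatch such a guard enables — every name here (LaunchCPU, use_omp_for, tuned_omp_threshold, square) is hypothetical and not taken from this commit — a CPU element-wise kernel could choose between an OpenMP loop and a serial loop based on a tuned workload threshold:

#include <cstddef>
#include <vector>

#ifndef MXNET_USE_OPERATOR_TUNING
#define MXNET_USE_OPERATOR_TUNING 0   // off unless the build defines it
#endif

// Hypothetical stand-in for the auto-tuned "is OpenMP worth it for n elements?" decision.
inline bool use_omp_for(std::size_t n) {
  const std::size_t tuned_omp_threshold = 1 << 14;  // placeholder for a tuned value
  return n >= tuned_omp_threshold;
}

// Hypothetical CPU launcher for an element-wise op in the style of the mshadow_op structs.
template <typename DType, typename OP>
void LaunchCPU(DType *out, const DType *in, std::size_t n) {
#if MXNET_USE_OPERATOR_TUNING
  if (use_omp_for(n)) {
    #pragma omp parallel for                     // ignored when built without OpenMP
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(n); ++i) {
      out[i] = OP::Map(in[i]);
    }
    return;
  }
#endif
  // Serial fallback when tuning is disabled or predicts OpenMP will not pay off.
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = OP::Map(in[i]);
  }
}

struct square {  // toy op with the same Map() shape as the mshadow_op operators
  template <typename DType>
  static DType Map(DType a) { return a * a; }
};

int main() {
  std::vector<float> in(1 << 16, 2.0f), out(in.size());
  LaunchCPU<float, square>(out.data(), in.data(), in.size());
  return out.front() == 4.0f ? 0 : 1;
}

Built with -DMXNET_USE_OPERATOR_TUNING=1 and an OpenMP-enabled compiler, the guarded branch is compiled in; otherwise only the serial path remains, which mirrors what the new CMake and Makefile switches control.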
4 changes: 4 additions & 0 deletions Makefile
@@ -131,6 +131,10 @@ ifeq ($(USE_MKL2017), 1)
LDFLAGS += -liomp5
endif

ifeq ($(USE_OPERATOR_TUNING), 1)
CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1
endif

# verify existence of separate lapack library when using blas/openblas/atlas
# switch off lapack support in case it can't be found
# issue covered with this
6 changes: 6 additions & 0 deletions make/config.mk
@@ -153,6 +153,12 @@ LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
# sudo apt-get install -y libcurl4-openssl-dev
USE_S3 = 0

#----------------------------
# performance settings
#----------------------------
# Use operator tuning
USE_OPERATOR_TUNING = 1

# Use gperftools if found
USE_GPERFTOOLS = 1

94 changes: 69 additions & 25 deletions src/operator/mshadow_op.h
@@ -30,6 +30,7 @@
#include "math.h"
#include "math_functions-inl.h"
#include "special_functions-inl.h"
#include "./mxnet_op.h"

#ifdef __CUDACC__
#include <cuda_fp16.h>
@@ -39,6 +40,24 @@ namespace mxnet {
namespace op {
namespace mshadow_op {

/*!
* \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD' macro outside of the mshadow_op namespace
* See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD'
*
* \note An entry for the operator must also be added in operator_tune.cc, which will register it
* for auto-tuning and also hold its workload weight
*/
#define MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(__op$) \
} MXNET_TUNABLE_MSHADOW_OP_FWD_AND_BWD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*)
/*!
* \brief Use the 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD' macro outside of the mshadow_op namespace
* See mxnet_op.h for a description of 'MXNET_TUNABLE_MSHADOW_OP_BACKWARD'
*
* \note An entry for the operator must also be added in operator_tune.cc, which will register it
* for auto-tuning and also hold its workload weight
*/
#define MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(__op$) \
} MXNET_TUNABLE_MSHADOW_OP_BACKWARD(mshadow_op::__op$) namespace mshadow_op { // NOLINT(*)
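// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this diff: how a new operator would use the
// macros above. The operator name 'cube' and the exact registration macro shown
// for operator_tune.cc are assumptions for illustration only.
//
//   // In mshadow_op.h, the math-op macros below expand to the struct plus the
//   // matching MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(cube):
//   MXNET_UNARY_MATH_OP_NC(cube, a * a * a);
//
//   // In operator_tune.cc, an entry registers the operator for auto-tuning and
//   // holds its workload weight (registration macro name assumed):
//   IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::cube);
//
// Note that the declare-tunable macros close the mshadow_op namespace, invoke
// the registration at the enclosing mxnet::op scope, and immediately reopen
// mshadow_op, which is why they must be used at namespace scope.
// ---------------------------------------------------------------------------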
#ifdef __CUDA_ARCH__
__constant__ const float PI = 3.14159265358979323846;
#else
@@ -49,36 +68,41 @@ using std::enable_if;
using std::is_unsigned;

#define MXNET_UNARY_MATH_OP(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return DType(expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return DType(expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)


#define MXNET_UNARY_MATH_OP_NC(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return (expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a) { \
return (expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_BINARY_MATH_OP(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return DType(expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return DType(expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_BINARY_MATH_OP_NC(name, expr) \
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return (expr); \
} \
}
struct name { \
template<typename DType> \
MSHADOW_XINLINE static DType Map(DType a, DType b) { \
return (expr); \
} \
}; \
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(name)

#define MXNET_SIMPLE_UNARY_MATH_OP(name) MXNET_UNARY_MATH_OP(name, math::name(a))

@@ -134,6 +158,7 @@ struct softrelu {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(softrelu)

MXNET_UNARY_MATH_OP(softrelu_grad, -math::expm1(-a));

@@ -154,6 +179,7 @@ struct log10_grad {
return DType(0.4342944819f / static_cast<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log10_grad)

template<>
MSHADOW_XINLINE double log10_grad::Map<double>(double a) {
@@ -169,6 +195,7 @@ struct log2_grad {
return DType(1.442695041f / static_cast<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(log2_grad)

template<>
MSHADOW_XINLINE double log2_grad::Map<double>(double a) {
@@ -263,6 +290,7 @@ struct sign {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(sign)

MXNET_UNARY_MATH_OP_NC(sign_grad, DType(0));

@@ -333,6 +361,7 @@ struct rint {
return DType((af - floor) <= (ceil - af) ? floor : ceil);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rint)

/*! \brief used to round number to integer nearest to 0 */
struct fix {
@@ -343,6 +372,7 @@ struct fix {
return DType((floor > 0 ? floor : -floor) < (ceil > 0 ? ceil : -ceil) ? floor : ceil);
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(fix)

/*! \brief used for generate gradient of MAE loss*/
MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? DType(1) : -DType(1));
@@ -405,6 +435,7 @@ struct mod {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(mod)

template<>
MSHADOW_XINLINE mshadow::half::half2_t mod::Map<mshadow::half::half2_t>
@@ -419,6 +450,8 @@ struct mod_grad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_grad)

template<>
MSHADOW_XINLINE double mod_grad::Map<double>(double a, double b) {
return 1.0;
@@ -454,6 +487,8 @@ struct mod_rgrad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(mod_rgrad)

template<>
MSHADOW_XINLINE double mod_rgrad::Map<double>(double a, double b) {
return -::floor(a/b);
@@ -517,6 +552,7 @@ struct rmod {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(rmod)

template<>
MSHADOW_XINLINE mshadow::half::half2_t rmod::Map<mshadow::half::half2_t>
@@ -531,6 +567,8 @@ struct rmod_grad {
return DType(0);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(rmod_grad)

template<>
MSHADOW_XINLINE double rmod_grad::Map<double>(double a, double b) {
return -::floor(b/a);
@@ -572,6 +610,7 @@ struct clip {
}
}
};
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(clip)

/***** gamma ******/

@@ -585,6 +624,7 @@ struct gamma_grad {
return DType(math::tgamma(af) * special_functions::cephes::psi<float>(af));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gamma_grad)

template<>
MSHADOW_XINLINE double gamma_grad::Map<double>(double a) {
@@ -602,6 +642,7 @@ struct gammaln_grad {
return DType(special_functions::cephes::psi<float>(a));
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(gammaln_grad)

template<>
MSHADOW_XINLINE double gammaln_grad::Map<double>(double a) {
@@ -633,6 +674,7 @@ struct smooth_l1_loss {
}
}
}; // struct smooth_l1_loss
MSHADOW_OP_DECLARE_TUNABLE_FWD_AND_BWD(smooth_l1_loss)

/* The derivative of smooth l1 loss is
* f'(x) = sigma^2 * x, |x| < 1 / sigma^2
@@ -654,6 +696,7 @@ struct smooth_l1_gradient {
}
}
}; // struct smooth_l1_derivative
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(smooth_l1_gradient)

/*! \brief product reducer */
struct product {
@@ -755,6 +798,7 @@ struct nansum_grad {
return isnan_typed::IsNan(a) ? DType(0) : DType(1);
}
};
MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nansum_grad)

/*! \brief product reducer that ignores NaN values in the input */
struct nanprod {
@@ -791,7 +835,7 @@ struct nanprod_grad {
return isnan_typed::IsNan(a) ? DType(0) : b / a;
}
};

MSHADOW_OP_DECLARE_TUNABLE_BACKWARD(nanprod_grad)
} // namespace mshadow_op
} // namespace op
} // namespace mxnet