apache · anirudh2290 · Dec 13, 2018 · Dec 10, 2018 · Dec 11, 2018 · Dec 11, 2018
@@ -222,3 +222,17 @@ Settings for More GPU Parallelism
 - Set ```MXNET_GPU_WORKER_NTHREADS``` to a larger number (e.g., 2)
   - To reduce memory usage, consider setting ```MXNET_EXEC_NUM_TEMP```.
   - This might not speed things up, especially for image applications, because GPU is usually fully utilized even with serialized jobs.
+
+Settings for controlling OMP tuning
+---------------------------------
+- Set ```MXNET_USE_OPERATOR_TUNING=0``` to disable the operator tuning code, which decides whether or not to use OMP for each operator.
+   - Values: String representation of the MXNET_USE_OPERATOR_TUNING environment variable
+   -            0=disable all
+   -            1=enable all
+   -            float32, float16, ...=list of types to enable; types not listed are disabled
+   - Refer to: https://github.com/apache/incubator-mxnet/blob/master/src/operator/operator_tune-inl.h#L444
+
+- Set ```MXNET_USE_NUM_CORES_OPERATOR_TUNING``` to define the number of cores (num_cores) to be used by the operator tuning code.
+  - This reduces operator tuning overhead when multiple instances of MXNet are running on the system and each instance is known to
+    use only a portion of the cores available on the system.
+  - refer: https://github.com/apache/incubator-mxnet/pull/13602
diff --git a/src/operator/operator_tune-inl.h b/src/operator/operator_tune-inl.h
@@ -56,7 +56,7 @@ namespace op {
 #endif
 #endif  // MXNET_NO_INLINE
 
-#define OUTSIDE_COUNT_SHIFT    9
+#define OUTSIDE_COUNT_SHIFT  3
 
 namespace tune {
 
@@ -356,7 +356,8 @@ class OperatorTune : public OperatorTuneByType<DType> {
   static duration_t GetOMPLoopOverhead() {
     // It was found empirically that OMP times was not heavily tied to number of cores,
     // so take an average across all core counts
-    const auto max_cores = static_cast<size_t>(omp_get_num_procs()) >> 1;
+    const auto max_cores_default = static_cast<size_t>(omp_get_num_procs()) >> 1;
+    const auto max_cores = dmlc::GetEnv("MXNET_USE_NUM_CORES_OPERATOR_TUNING", max_cores_default);
     if (max_cores >= 2) {
       std::vector<duration_t> core_times;
       // Take care of any OMP lazy-init with a throwaway call