Support both row-wise and col-wise multi-threading #2699

Merged Feb 2, 2020 · 33 commits (changes shown from 26 commits)

Commits:
c8883fc  commit (guolinke, Jan 20, 2020)
281dd32  fix a bug (guolinke, Jan 20, 2020)
ea718c2  fix bug (guolinke, Jan 21, 2020)
2ad4af5  reset to track changes (guolinke, Jan 30, 2020)
748c95a  refine the auto choose logic (guolinke, Jan 30, 2020)
0340ffd  sort the time stats output (guolinke, Jan 30, 2020)
d3434c7  fix include (guolinke, Jan 30, 2020)
8c4ea1a  change multi_val_bin_sparse_threshold (guolinke, Jan 30, 2020)
6cac288  add cmake (guolinke, Jan 30, 2020)
afdbf3c  add _mm_malloc and _mm_free for cross platform (guolinke, Jan 30, 2020)
210ac4b  fix cmake bug (guolinke, Jan 30, 2020)
ad2865d  timer for split (guolinke, Jan 30, 2020)
4c4a33b  try to fix cmake (guolinke, Jan 30, 2020)
2a33dcb  fix tests (guolinke, Jan 30, 2020)
256e6d9  refactor DataPartition::Split (guolinke, Jan 30, 2020)
a722b38  Merge remote-tracking branch 'origin/master' into sparse_bin_clean (guolinke, Jan 30, 2020)
7a59f19  fix test (guolinke, Jan 30, 2020)
1ac8283  typo (guolinke, Jan 30, 2020)
5b8de4f  formating (guolinke, Jan 30, 2020)
106c081  Revert "formating" (guolinke, Jan 31, 2020)
382e13e  add document (guolinke, Jan 31, 2020)
dec3d79  [R-package] Added tests on use of force_col_wise and force_row_wise i… (jameslamb, Jan 31, 2020)
d2fb9b3  naming (guolinke, Jan 31, 2020)
5db5d74  fix gpu code (guolinke, Jan 31, 2020)
7fda05a  Update include/LightGBM/bin.h (guolinke, Jan 31, 2020)
27a7209  Update src/treelearner/ocl/histogram16.cl (guolinke, Jan 31, 2020)
4623cd4  test: swap compilers for CI (StrikerRUS, Jan 31, 2020)
38d1e57  fix omp (guolinke, Feb 1, 2020)
8e27631  not avx2 (guolinke, Feb 1, 2020)
c86a479  no aligned for feature histogram (guolinke, Feb 1, 2020)
737e9c9  Revert "refactor DataPartition::Split" (guolinke, Feb 1, 2020)
ce5f66b  slightly refactor data partition (guolinke, Feb 1, 2020)
a123c47  reduce the memory cost (guolinke, Feb 2, 2020)
24 changes: 23 additions & 1 deletion CMakeLists.txt
@@ -68,6 +68,10 @@ if(USE_R35)
ADD_DEFINITIONS(-DR_VER_ABOVE_35)
endif(USE_R35)

if(USE_TIMETAG)
ADD_DEFINITIONS(-DTIMETAG)
endif(USE_TIMETAG)

if(USE_MPI)
find_package(MPI REQUIRED)
ADD_DEFINITIONS(-DUSE_MPI)
@@ -130,6 +134,21 @@ if(${MM_PREFETCH})
ADD_DEFINITIONS(-DMM_PREFETCH)
endif()

include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <mm_malloc.h>
int main() {
char *a = (char*)_mm_malloc(8, 16);
_mm_free(a);
return 0;
}
" MM_MALLOC)

if(${MM_MALLOC})
message(STATUS "Use _mm_malloc")
ADD_DEFINITIONS(-DMM_MALLOC)
endif()

if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
if(USE_SWIG)
@@ -152,10 +171,13 @@ if(MSVC)
CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_RELWITHDEBINFO
)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /O2 /Ob2 /Oi /Ot /Oy /GL /MP")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /O2 /Ob2 /Oi /Ot /Oy /GL /MP /arch:AVX2")
else()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops")
if (NOT APPLE)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
endif()
endif(MSVC)

SET(LightGBM_HEADER_DIR ${PROJECT_SOURCE_DIR}/include)
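The MM_MALLOC check above only defines a preprocessor symbol; the C++ side is expected to branch on it. Below is a rough sketch of the kind of guarded aligned-allocation helper this enables — the helper names here are hypothetical, not LightGBM's actual API — matching the commit message "add _mm_malloc and _mm_free for cross platform".

#include <cstdlib>
#ifdef MM_MALLOC
#include <mm_malloc.h>
#endif

// Hypothetical wrappers: use _mm_malloc/_mm_free when the CMake check
// defined MM_MALLOC, otherwise fall back to plain malloc/free.
inline void* AlignedAlloc(std::size_t size, std::size_t alignment) {
#ifdef MM_MALLOC
  return _mm_malloc(size, alignment);
#else
  (void)alignment;  // fallback: alignment is not guaranteed here
  return std::malloc(size);
#endif
}

inline void AlignedFree(void* ptr) {
#ifdef MM_MALLOC
  _mm_free(ptr);
#else
  std::free(ptr);
#endif
}

Platforms whose <mm_malloc.h> compiles get aligned allocations; others keep working through the fallback.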
43 changes: 43 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -252,3 +252,46 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data
)
}, regexp = "each element of valids must have a name")
})

test_that("lgb.train() works with force_col_wise and force_row_wise", {
set.seed(1234L)
nrounds <- 10L
dtrain <- lgb.Dataset(
train$data
, label = train$label
)
params <- list(
objective = "binary"
, metric = "binary_error"
, force_col_wise = TRUE
)
bst_colwise <- lgb.train(
params = params
, data = dtrain
, nrounds = nrounds
)

params <- list(
objective = "binary"
, metric = "binary_error"
, force_row_wise = TRUE
)
bst_row_wise <- lgb.train(
params = params
, data = dtrain
, nrounds = nrounds
)

expected_error <- 0.003070782
expect_equal(bst_colwise$eval_train()[[1L]][["value"]], expected_error)
expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error)

# check some basic details of the boosters just to be sure force_col_wise
# and force_row_wise are not causing any weird side effects
for (bst in list(bst_row_wise, bst_colwise)) {
expect_equal(bst$current_iter(), nrounds)
parsed_model <- jsonlite::fromJSON(bst$dump_model())
expect_equal(parsed_model$objective, "binary sigmoid:1")
expect_false(parsed_model$average_output)
}
})
4 changes: 2 additions & 2 deletions R-package/tests/testthat/test_learning_to_rank.R
@@ -47,8 +47,8 @@ test_that("learning-to-rank with lgb.train() works as expected", {
}
expect_identical(sapply(eval_results, function(x) {x$name}), eval_names)
expect_equal(eval_results[[1L]][["value"]], 0.825)
expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE)
expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE)
})

test_that("learning-to-rank with lgb.cv() works as expected", {
48 changes: 32 additions & 16 deletions docs/Parameters.rst
@@ -190,6 +190,38 @@ Core Parameters
Learning Control Parameters
---------------------------

- ``force_col_wise`` :raw-html:`<a id="force_col_wise" title="Permalink to this parameter" href="#force_col_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- setting ``force_col_wise=true`` will force LightGBM to use the col-wise histogram build

- ``force_col_wise=true`` is recommended when:

  - the number of columns is large, or the total number of bins is large

  - ``num_threads`` is large, e.g. ``> 20``

  - you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training

  - you want to reduce memory cost

- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both and use the faster one

- ``force_row_wise`` :raw-html:`<a id="force_row_wise" title="Permalink to this parameter" href="#force_row_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- setting ``force_row_wise=true`` will force LightGBM to use the row-wise histogram build

- ``force_row_wise=true`` is recommended when:

  - the number of data points is large, and the total number of bins is relatively small

  - you want to use small ``bagging``, or ``goss``, to speed up training

  - ``num_threads`` is relatively small, e.g. ``<= 16``

- setting ``force_row_wise=true`` will double the memory cost for the Dataset object; if you run out of memory, try ``force_col_wise=true`` instead

- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both and use the faster one

- ``max_depth`` :raw-html:`<a id="max_depth" title="Permalink to this parameter" href="#max_depth">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

- limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
@@ -559,22 +591,6 @@ IO Parameters

- **Note**: disabling this may cause slow training speed for sparse datasets

- ``max_conflict_rate`` :raw-html:`<a id="max_conflict_rate" title="Permalink to this parameter" href="#max_conflict_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``0.0 <= max_conflict_rate < 1.0``

- max conflict rate for bundles in EFB

- set this to ``0.0`` to disallow the conflict and provide more accurate results

- set this to a larger value to achieve faster speed

- ``is_enable_sparse`` :raw-html:`<a id="is_enable_sparse" title="Permalink to this parameter" href="#is_enable_sparse">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``

- used to enable/disable sparse optimization

- ``sparse_threshold`` :raw-html:`<a id="sparse_threshold" title="Permalink to this parameter" href="#sparse_threshold">&#x1F517;&#xFE0E;</a>`, default = ``0.8``, type = double, constraints: ``0.0 < sparse_threshold <= 1.0``

- the threshold of zero elements percentage for treating a feature as a sparse one

- ``use_missing`` :raw-html:`<a id="use_missing" title="Permalink to this parameter" href="#use_missing">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool

- set this to ``false`` to disable the special handle of missing value
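Both parameter descriptions end with the same rule: when neither flag is forced, LightGBM times the two histogram-build strategies and keeps the faster one. Here is a minimal sketch of that selection idea, assuming one timed pass per strategy — illustrative only, not LightGBM's internal code.

#include <chrono>
#include <functional>

// Time a single histogram-build pass.
inline double TimeOnce(const std::function<void()>& build) {
  const auto start = std::chrono::steady_clock::now();
  build();
  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double>(end - start).count();
}

// Returns true if the col-wise pass was at least as fast as the row-wise
// pass; the caller would then stick with col-wise builds afterwards.
inline bool PreferColWise(const std::function<void()>& col_wise_build,
                          const std::function<void()>& row_wise_build) {
  return TimeOnce(col_wise_build) <= TimeOnce(row_wise_build);
}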
122 changes: 68 additions & 54 deletions include/LightGBM/bin.h
@@ -29,36 +29,29 @@ enum MissingType {
NaN
};

/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
typedef double hist_t;

const size_t KHistEntrySize = 2 * sizeof(hist_t);
const int KHistOffset = 2;
const double kSparseThreshold = 0.7;

#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const hist_t* p1;
hist_t* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const hist_t*>(src);
p2 = reinterpret_cast<hist_t*>(dst);
*p2 += *p1;
src += type_size;
dst += type_size;
used_size += type_size;
}
};
}

/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
@@ -252,7 +245,7 @@ class OrderedBin {
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
const score_t* hessians, hist_t* out) const = 0;

/*!
* \brief Construct histogram by using this bin
Expand All @@ -262,7 +255,7 @@ class OrderedBin {
* \param gradients Gradients, Note:non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;

/*!
* \brief Split current bin, and perform re-order by leaf
@@ -360,11 +353,11 @@ class Bin {
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
hist_t* out) const = 0;

/*!
* \brief Construct histogram of this feature,
Expand All @@ -380,10 +373,10 @@ class Bin {
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
const score_t* ordered_gradients, hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
const score_t* ordered_gradients, hist_t* out) const = 0;

/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
@@ -423,30 +416,11 @@ class Bin {
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;

/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;

/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual void FinishLoad() = 0;

/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);

/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
@@ -469,6 +443,46 @@ class Bin {
virtual Bin* Clone() = 0;
};


class MultiValBin {
public:

virtual ~MultiValBin() {}

virtual data_size_t num_data() const = 0;

virtual int32_t num_bin() const = 0;

virtual void ReSize(data_size_t num_data) = 0;

virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;

virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;

virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;

virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;

virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;

virtual void FinishLoad() = 0;

virtual bool IsSparse() = 0;

static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);

virtual MultiValBin* Clone() = 0;
};

inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
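The replacement of HistogramBinEntry with a flat hist_t buffer stores each bin's gradient and hessian sums interleaved, which is exactly what the GET_GRAD/GET_HESS macros index into and why KHistEntrySize is 2 * sizeof(hist_t). A self-contained sketch of histogram construction over that layout (illustrative; the real kernels above are virtual methods with several specializations):

#include <cstddef>
#include <vector>

typedef double hist_t;

#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]

// Bin i occupies hist[2*i] (gradient sum) and hist[2*i + 1] (hessian sum),
// so a histogram with num_bin bins needs a buffer of 2 * num_bin doubles.
inline void ConstructHistogramSketch(const std::vector<int>& bin_of_row,
                                     const std::vector<double>& gradients,
                                     const std::vector<double>& hessians,
                                     std::vector<hist_t>* acc) {
  for (std::size_t row = 0; row < bin_of_row.size(); ++row) {
    const int bin = bin_of_row[row];
    GET_GRAD((*acc), bin) += gradients[row];
    GET_HESS((*acc), bin) += hessians[row];
  }
}

Using one flat array of doubles is also what lets HistogramSumReducer above treat the whole histogram as a contiguous buffer and add entries element-wise during network reduction.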
34 changes: 18 additions & 16 deletions include/LightGBM/config.h
@@ -214,6 +214,24 @@ struct Config {

#pragma region Learning Control Parameters

// desc = setting ``force_col_wise=true`` will force LightGBM to use the col-wise histogram build
// desc = ``force_col_wise=true`` is recommended when:
// descl2 = the number of columns is large, or the total number of bins is large
// descl2 = ``num_threads`` is large, e.g. ``> 20``
// descl2 = you want to use a small ``feature_fraction``, e.g. ``0.5``, to speed up training
// descl2 = you want to reduce memory cost
// desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both and use the faster one
bool force_col_wise = false;

// desc = setting ``force_row_wise=true`` will force LightGBM to use the row-wise histogram build
// desc = ``force_row_wise=true`` is recommended when:
// descl2 = the number of data points is large, and the total number of bins is relatively small
// descl2 = you want to use small ``bagging``, or ``goss``, to speed up training
// descl2 = ``num_threads`` is relatively small, e.g. ``<= 16``
// desc = setting ``force_row_wise=true`` will double the memory cost for the Dataset object; if you run out of memory, try ``force_col_wise=true`` instead
// desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both and use the faster one
bool force_row_wise = false;

// desc = limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
// desc = ``<= 0`` means no limit
int max_depth = -1;
@@ -534,22 +552,6 @@ struct Config {
// desc = **Note**: disabling this may cause slow training speed for sparse datasets
bool enable_bundle = true;

// check = >=0.0
// check = <1.0
// desc = max conflict rate for bundles in EFB
// desc = set this to ``0.0`` to disallow the conflict and provide more accurate results
// desc = set this to a larger value to achieve faster speed
double max_conflict_rate = 0.0;

// alias = is_sparse, enable_sparse, sparse
// desc = used to enable/disable sparse optimization
bool is_enable_sparse = true;

// check = >0.0
// check = <=1.0
// desc = the threshold of zero elements percentage for treating a feature as a sparse one
double sparse_threshold = 0.8;

// desc = set this to ``false`` to disable the special handle of missing value
bool use_missing = true;
