Minimal Variance Sampling (MVS) booster #5091

Closed · wants to merge 26 commits

Commits (26)
86822d6
Minimal Variance Sampling booster (#4266)
kruda Mar 12, 2022
85d1f24
Merge branch 'master' into mvs_dev
StrikerRUS Mar 22, 2022
83412cb
Merge branch 'master' into mvs_dev
StrikerRUS May 3, 2022
5a05b24
merge master into mvs_dev
shiyu1994 Jan 5, 2023
22cf2f3
Update include/LightGBM/config.h
shiyu1994 Jan 17, 2023
258cce1
Update include/LightGBM/config.h
shiyu1994 Jan 17, 2023
8308757
Update tests/cpp_tests/test_mvs_threshold_search.cpp
shiyu1994 Jan 17, 2023
bf7c2c6
Update src/boosting/mvs.hpp
shiyu1994 Jan 17, 2023
6c1785f
Update src/boosting/mvs.cpp
shiyu1994 Jan 17, 2023
9054c97
Update include/LightGBM/utils/array_args.h
shiyu1994 Jan 19, 2023
facc57f
Update include/LightGBM/config.h
shiyu1994 Jan 19, 2023
a32b5c6
fix lint issues
shiyu1994 Jan 19, 2023
38f8473
Merge branch 'mvs_dev' of https://github.com/Microsoft/LightGBM into …
shiyu1994 Jan 19, 2023
cda410d
fix docs
shiyu1994 Jan 19, 2023
a0f9f98
add test cases for mvs
shiyu1994 Jan 27, 2023
62b5ce0
enable test cases for mvs
shiyu1994 Jan 30, 2023
fc55232
use cuda_bag_data_indices_ in mvs
shiyu1994 Feb 14, 2023
eccd2ba
add #ifdef USE_CUDA_EXP in mvs.cpp
shiyu1994 Feb 14, 2023
6cba151
correct test case name
shiyu1994 Mar 1, 2023
a63ebb9
add debug code
shiyu1994 Mar 15, 2023
3ff7f1a
add debug code
shiyu1994 Mar 15, 2023
9374f74
Merge remote-tracking branch 'origin/master' into mvs_dev
shiyu1994 Mar 15, 2023
e624ea7
set verbose=2 in test case
shiyu1994 Mar 15, 2023
6a84534
change cuda_exp to cuda and remove debug code
shiyu1994 Mar 15, 2023
09686b8
recover main.cpp
shiyu1994 Mar 16, 2023
9eab4c8
set verbose = -1 in test_boosting_and_sample_strategy_equivalent
shiyu1994 Mar 17, 2023
Files changed
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -26,6 +26,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
boosting/sample_strategy.o \
io/bin.o \
io/config.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
@@ -27,6 +27,7 @@ OBJECTS = \
boosting/gbdt_model_text.o \
boosting/gbdt_prediction.o \
boosting/prediction_early_stop.o \
boosting/mvs.o \
boosting/sample_strategy.o \
io/bin.o \
io/config.o \
24 changes: 24 additions & 0 deletions docs/Parameters.rst
@@ -137,6 +137,8 @@ Core Parameters

- **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations

- ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__

- ``data_sample_strategy`` :raw-html:`<a id="data_sample_strategy" title="Permalink to this parameter" href="#data_sample_strategy">&#x1F517;&#xFE0E;</a>`, default = ``bagging``, type = enum, options: ``bagging``, ``goss``

- ``bagging``, Randomly Bagging Sampling
@@ -346,6 +348,28 @@ Learning Control Parameters

- **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored

- ``mvs_lambda`` :raw-html:`<a id="mvs_lambda" title="Permalink to this parameter" href="#mvs_lambda">&#x1F517;&#xFE0E;</a>`, default = ``1e-4``, type = double, constraints: ``mvs_lambda > 0.0``

- used in MVS boosting; this value is ignored if ``mvs_adaptive == true``

- used only in ``mvs``

- ``mvs_adaptive`` :raw-html:`<a id="mvs_adaptive" title="Permalink to this parameter" href="#mvs_adaptive">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- use adaptive variant of mvs boosting

- used only in ``mvs``

- ``mvs_max_sequential_size`` :raw-html:`<a id="mvs_max_sequential_size" title="Permalink to this parameter" href="#mvs_max_sequential_size">&#x1F517;&#xFE0E;</a>`, default = ``256000``, type = int, constraints: ``mvs_max_sequential_size > 0``

- used in MVS boosting. If the dataset size is greater than ``mvs_max_sequential_size``, then the threshold for MVS is chosen for each thread independently

- used only in ``mvs``

- **Note**: on small datasets, setting this parameter to a value smaller than the dataset size may produce results that depend on the number of threads

- ``bagging_freq`` :raw-html:`<a id="bagging_freq" title="Permalink to this parameter" href="#bagging_freq">&#x1F517;&#xFE0E;</a>`, default = ``0``, type = int, aliases: ``subsample_freq``

- frequency for bagging
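To make the new parameters concrete, here is a minimal training sketch against this PR's branch. It is an illustration only: the ``mvs_*`` names and defaults come from the diff above, while the assumption that MVS takes its target sample size from ``bagging_fraction`` is not visible in this excerpt and may differ in the PR's ``mvs.cpp``.

```python
# Hypothetical usage of this PR's branch -- not a confirmed API.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.random((1_000, 10))
y = (X[:, 0] + rng.normal(scale=0.1, size=1_000) > 0.5).astype(int)

params = {
    "boosting": "mvs",                # new boosting mode added by this PR
    "objective": "binary",
    "bagging_fraction": 0.3,          # assumed to set the MVS sample size
    "mvs_lambda": 1e-4,               # ignored when mvs_adaptive is true
    "mvs_adaptive": False,
    "mvs_max_sequential_size": 256_000,
    "verbose": -1,
}
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=50)
```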
20 changes: 20 additions & 0 deletions include/LightGBM/config.h
@@ -158,6 +158,7 @@ struct Config {
// desc = ``rf``, Random Forest, aliases: ``random_forest``
// desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees <https://arxiv.org/abs/1505.01866>`__
// descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations
// desc = ``mvs``, `Minimal Variance Sampling <https://arxiv.org/abs/1910.13204>`__
std::string boosting = "gbdt";

// [doc-only]
@@ -329,6 +330,25 @@ struct Config {
// desc = **Note**: if balanced bagging is enabled, ``bagging_fraction`` will be ignored
double neg_bagging_fraction = 1.0;

// default = 1e-4
// check = >0.0
// desc = used in MVS boosting; this value is ignored if ``mvs_adaptive == true``
// desc = used only in ``mvs``
double mvs_lambda = 1e-4;

// default = false
// desc = use adaptive variant of mvs boosting
// desc = used only in ``mvs``
bool mvs_adaptive = false;

// default = 256000
// check = >0
// desc = used in MVS boosting. If the dataset size is greater than ``mvs_max_sequential_size``, then the threshold for MVS is chosen for each thread independently
// desc = used only in ``mvs``
// desc = **Note**: on small datasets, setting this parameter to a value smaller than the dataset size may produce results that depend on the number of threads
int mvs_max_sequential_size = 256000;

// alias = subsample_freq
// desc = frequency for bagging
// desc = ``0`` means disable bagging; ``k`` means perform bagging at every ``k`` iteration. Every ``k``-th iteration, LightGBM will randomly select ``bagging_fraction * 100 %`` of the data to use for the next ``k`` iterations
6 changes: 4 additions & 2 deletions include/LightGBM/sample_strategy.h
@@ -22,13 +22,13 @@ namespace LightGBM {

class SampleStrategy {
public:
SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_), need_resize_gradients_(false) {}
SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_), need_resize_gradients_(false), need_re_bagging_(false) {}

virtual ~SampleStrategy() {}

static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration);

virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0;
virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians, const std::vector<std::unique_ptr<Tree>>& models) = 0;

virtual void ResetSampleConfig(const Config* config, bool is_change_dataset) = 0;

@@ -71,6 +71,8 @@ class SampleStrategy {
ParallelPartitionRunner<data_size_t, false> bagging_runner_;
/*! \brief whether need to resize the gradient vectors */
bool need_resize_gradients_;
/*! \brief whether bagging needs to be restarted in continued training */
bool need_re_bagging_;

#ifdef USE_CUDA_EXP
/*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */
31 changes: 31 additions & 0 deletions include/LightGBM/utils/array_args.h
@@ -9,6 +9,7 @@
#include <LightGBM/utils/threading.h>

#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

@@ -185,6 +186,36 @@ class ArrayArgs {
}
return true;
}

static double CalculateThresholdMVS(std::vector<VAL_T>* gradients, data_size_t begin, data_size_t end,
const double sample_size) {
double current_sum_small = 0.0;
data_size_t big_grad_size = 0;

while (begin != end) {
data_size_t middle_begin = begin - 1, middle_end = end;
ArrayArgs<VAL_T>::Partition(gradients, begin, end, &middle_begin, &middle_end);
++middle_begin; // for half intervals
const data_size_t n_middle = middle_end - middle_begin;
const data_size_t large_size = middle_begin - begin;

const double sum_small = std::accumulate(gradients->begin() + middle_end, gradients->begin() + end, 0.0);
const double sum_middle = (*gradients)[middle_begin] * n_middle;

// expected sample size when the threshold equals (*gradients)[middle_begin]
const double current_sampling_rate = (current_sum_small + sum_small) / (*gradients)[middle_begin] + big_grad_size + n_middle + large_size;

if (current_sampling_rate > sample_size) {
current_sum_small += sum_small + sum_middle;
end = middle_begin;
} else {
big_grad_size += n_middle + large_size;
begin = middle_end;
}
}

return current_sum_small / (sample_size - big_grad_size + kEpsilon);
}
};

} // namespace LightGBM
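For readers following the search above: MVS keeps example ``i`` with probability ``p_i = min(1, g_i / mu)``, and ``CalculateThresholdMVS`` finds the threshold ``mu`` at which the expected sample size ``sum_i p_i`` equals ``sample_size`` -- elements at or above ``mu`` are always kept, the rest with probability proportional to their gradient magnitude. Below is an equivalent sort-based sketch in Python (illustrative names, not part of the PR); the C++ code reaches the same fixed point in expected O(n) via the quickselect-style ``Partition`` instead of a full sort.

```python
import numpy as np

def mvs_threshold(grads, sample_size, eps=1e-20):
    """Find mu such that sum(min(1, g / mu)) == sample_size.

    Assumes non-negative gradient magnitudes and
    0 < sample_size <= len(grads).
    """
    g = np.sort(np.asarray(grads, dtype=np.float64))[::-1]  # descending
    small_sum = float(g.sum())  # mass of elements still treated as "small"
    for k in range(len(g)):
        # With k "big" elements kept with probability 1, the threshold
        # that hits the budget exactly is small_sum / (sample_size - k).
        mu = small_sum / (sample_size - k + eps)
        if mu >= g[k]:  # consistent: every remaining element is below mu
            return mu
        small_sum -= g[k]  # g[k] crosses the threshold; count it as "big"
    return small_sum / (sample_size - len(g) + eps)

# Example: for grads [10, 1, 1, 1, 1] and a budget of 2, mu == 4.0:
# the large gradient is always kept, each small one with p = 1/4.
```

The final ``return`` mirrors the C++ expression ``current_sum_small / (sample_size - big_grad_size + kEpsilon)``.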
1 change: 1 addition & 0 deletions python-package/lightgbm/sklearn.py
@@ -383,6 +383,7 @@ def __init__(
'gbdt', traditional Gradient Boosting Decision Tree.
'dart', Dropouts meet Multiple Additive Regression Trees.
'rf', Random Forest.
'mvs', Minimal Variance Sampling.
num_leaves : int, optional (default=31)
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
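The sklearn wrapper would then be used as below; only the docstring entry is confirmed by this diff, and passing ``mvs_*`` keywords through ``**kwargs`` is assumed to behave like other LightGBM parameters.

```python
from lightgbm import LGBMClassifier

# Hypothetical sketch on top of this PR's branch.
clf = LGBMClassifier(boosting_type="mvs", mvs_adaptive=True, n_estimators=100)
# clf.fit(X, y)  # any sklearn-style binary classification dataset
```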
8 changes: 2 additions & 6 deletions src/boosting/bagging.hpp
@@ -12,8 +12,7 @@ namespace LightGBM {

class BaggingSampleStrategy : public SampleStrategy {
public:
BaggingSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration)
: need_re_bagging_(false) {
BaggingSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) {
config_ = config;
train_data_ = train_data;
num_data_ = train_data->num_data();
@@ -23,7 +22,7 @@

~BaggingSampleStrategy() {}

void Bagging(int iter, TreeLearner* tree_learner, score_t* /*gradients*/, score_t* /*hessians*/) override {
void Bagging(int iter, TreeLearner* tree_learner, score_t* /*gradients*/, score_t* /*hessians*/, const std::vector<std::unique_ptr<Tree>>& /*models*/) override {
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer);
// if need bagging
if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) ||
@@ -199,9 +198,6 @@
}
return cur_left_cnt;
}

/*! \brief whether need restart bagging in continued training */
bool need_re_bagging_;
};

} // namespace LightGBM
1 change: 1 addition & 0 deletions src/boosting/boosting.cpp
@@ -7,6 +7,7 @@
#include "dart.hpp"
#include "gbdt.h"
#include "rf.hpp"
#include "mvs.hpp"

namespace LightGBM {

2 changes: 1 addition & 1 deletion src/boosting/gbdt.cpp
@@ -337,7 +337,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
}

// bagging logic
data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data());
data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data(), models_);
const bool is_use_subset = data_sample_strategy_->is_use_subset();
const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
const std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>>& bag_data_indices = data_sample_strategy_->bag_data_indices();
9 changes: 7 additions & 2 deletions src/boosting/goss.hpp
@@ -27,7 +27,7 @@ class GOSSStrategy : public SampleStrategy {
~GOSSStrategy() {
}

void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override {
void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians, const std::vector<std::unique_ptr<Tree>>& /*models*/) override {
bag_data_cnt_ = num_data_;
// not subsample for first iterations
if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
@@ -90,14 +90,19 @@
Log::Info("Using GOSS");
balanced_bagging_ = false;
bag_data_indices_.resize(num_data_);
#ifdef USE_CUDA_EXP
[review comment, Collaborator] I'm surprised to see new commits referencing cuda_exp. cuda_exp was completely removed in #5677. Please update this branch to latest master and replace any cuda_exp references with cuda.

[reply, Collaborator] Thanks for the reminder. Will merge with the master branch.

if (config_->device_type == std::string("cuda_exp")) {
cuda_bag_data_indices_.Resize(num_data_);
}
#endif // USE_CUDA_EXP
bagging_runner_.ReSize(num_data_);
bagging_rands_.clear();
for (int i = 0;
i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) {
bagging_rands_.emplace_back(config_->bagging_seed + i);
}
is_use_subset_ = false;
if (config_->top_rate + config_->other_rate <= 0.5) {
if (config_->device_type != std::string("cuda_exp") && config_->top_rate + config_->other_rate <= 0.5) {
auto bag_data_cnt = static_cast<data_size_t>((config_->top_rate + config_->other_rate) * num_data_);
bag_data_cnt = std::max(1, bag_data_cnt);
tmp_subset_.reset(new Dataset(bag_data_cnt));