Skip to content

Commit

Permalink
introduce random assignment prediction(microsoft#5177)
Browse files Browse the repository at this point in the history
* Introduce a struct of parameters PredictionControlParameter

* Add a new field random_assign_features to class Config

* Add new GetLeaf functions that apply random assign mechanism to class Tree
  • Loading branch information
suncanghuai authored and suncanghuai committed Aug 28, 2023
1 parent 858eeb5 commit 3821952
Show file tree
Hide file tree
Showing 9 changed files with 205 additions and 35 deletions.
13 changes: 7 additions & 6 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <LightGBM/config.h>
#include <LightGBM/meta.h>
#include <LightGBM/prediction_control_parameter.h>

#include <string>
#include <map>
Expand Down Expand Up @@ -131,11 +132,11 @@ class LIGHTGBM_EXPORT Boosting {
* \param output Prediction result for this record
* \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
*/
virtual void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
virtual void PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* predict_params) const = 0;

virtual void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
const PredictionEarlyStopInstance* early_stop, const PredictionControlParameter* predict_params) const = 0;


/*!
Expand All @@ -144,11 +145,11 @@ class LIGHTGBM_EXPORT Boosting {
* \param output Prediction result for this record
* \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
*/
virtual void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
virtual void Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* predict_params) const = 0;

virtual void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
const PredictionEarlyStopInstance* early_stop, const PredictionControlParameter* predict_params) const = 0;


/*!
Expand Down
6 changes: 6 additions & 0 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,12 @@ struct Config {
// desc = **Note**: can be used only in CLI version
std::string output_result = "LightGBM_predict_result.txt";

// [no-save]
// desc = used only in ``prediction`` task
// desc = Split features of internal nodes that enable the left-right random assignment mechanism.
// desc = See section "A surrogate VIMP" of the paper https://doi.org/10.1214/07-EJS039 for more details.
std::vector<int> random_assign_features;

#ifndef __NVCC__
#pragma endregion

Expand Down
40 changes: 40 additions & 0 deletions include/LightGBM/prediction_control_parameter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
#define LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_


#include <algorithm>
#include <utility>
#include <vector>

namespace LightGBM {

/*!
* \brief Control parameters for prediction, used to implement variants of prediction algorithm
*/
struct PredictionControlParameter {
 public:
  /*! \brief Create parameters with an empty random-assign feature list */
  PredictionControlParameter() {}

  /*!
  * \brief Create parameters by taking over a list of feature indices
  * \param ra_features Feature indices for which random assignment is enabled; its storage is moved from
  */
  PredictionControlParameter(std::vector<int> &&ra_features)
      // A named rvalue reference is an lvalue: without std::move the vector would be copied.
      : random_assign_features(std::move(ra_features)) {}

  /*!
  * \brief try to enable random assignment mechanism with the split feature of current node
  * \param split_feat_idx real index of the split feature
  * \return true if split_feat_idx is listed in random_assign_features
  */
  inline bool EnableRandomAssign(int split_feat_idx) const {
    // Linear scan: the list is stored as given and is not kept sorted.
    return std::find(random_assign_features.begin(), random_assign_features.end(), split_feat_idx)
           != random_assign_features.end();
  }

  /*! \brief Feature indices checked by EnableRandomAssign */
  std::vector<int> random_assign_features;
};

} // namespace LightGBM

#endif // LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
120 changes: 112 additions & 8 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@

#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>
#include <LightGBM/prediction_control_parameter.h>

#include <string>
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
#include <random>

namespace LightGBM {

Expand Down Expand Up @@ -128,10 +130,11 @@ class Tree {
/*!
* \brief Prediction on one record
* \param feature_values Feature value of this record
* \param predict_params Control parameters for prediction
* \return Prediction result
*/
inline double Predict(const double* feature_values) const;
inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;
inline double Predict(const double* feature_values, PredictionControlParameter* predict_params) const;
inline double PredictByMap(const std::unordered_map<int, double>& feature_values, PredictionControlParameter* predict_params) const;

inline int PredictLeafIndex(const double* feature_values) const;
inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;
Expand Down Expand Up @@ -422,6 +425,16 @@ class Tree {
inline int GetLeaf(const double* feature_values) const;
inline int GetLeafByMap(const std::unordered_map<int, double>& feature_values) const;

/*!
* \brief Find the index of the leaf that a record falls into, under the random assignment mechanism
* \param feature_values Feature value of this record
* \param predict_params Control parameters for prediction
* \return Leaf index
*/
inline int GetLeafWithRandomAssign(const double* feature_values, PredictionControlParameter* predict_params) const;
inline int GetLeafByMapWithRandomAssign(const std::unordered_map<int, double>& feature_values,
PredictionControlParameter* predict_params) const;

/*! \brief Serialize one node to json*/
std::string NodeToJSON(int index) const;

Expand Down Expand Up @@ -582,9 +595,14 @@ inline void Tree::Split(int leaf, int feature, int real_feature,
}
}

inline double Tree::Predict(const double* feature_values) const {
inline double Tree::Predict(const double* feature_values, PredictionControlParameter* predict_params) const {
int leaf = 0;
if (num_leaves_ > 1) {
leaf = (predict_params->random_assign_features.empty())
? GetLeaf(feature_values)
: GetLeafWithRandomAssign(feature_values, predict_params);
}
if (is_linear_) {
int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
double output = leaf_const_[leaf];
bool nan_found = false;
for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
Expand All @@ -604,17 +622,22 @@ inline double Tree::Predict(const double* feature_values) const {
}
} else {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
return LeafOutput(leaf);
} else {
return leaf_value_[0];
}
}
}

inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values) const {
inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_values,
PredictionControlParameter* predict_params) const {
int leaf = 0;
if (num_leaves_ > 1) {
leaf = (predict_params->random_assign_features.empty())
? GetLeafByMap(feature_values)
: GetLeafByMapWithRandomAssign(feature_values, predict_params);
}
if (is_linear_) {
int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
double output = leaf_const_[leaf];
bool nan_found = false;
for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
Expand All @@ -637,7 +660,6 @@ inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_
}
} else {
if (num_leaves_ > 1) {
int leaf = GetLeafByMap(feature_values);
return LeafOutput(leaf);
} else {
return leaf_value_[0];
Expand Down Expand Up @@ -724,6 +746,88 @@ inline int Tree::GetLeafByMap(const std::unordered_map<int, double>& feature_val
return ~node;
}

/*!
* \brief Find the leaf for a dense record, with random left/right assignment below the first
*        node whose split feature is enabled in predict_params
* \param feature_values Feature values of this record, indexed by split feature
* \param predict_params Control parameters for prediction; if nullptr, falls back to GetLeaf
* \return Leaf index
*/
inline int Tree::GetLeafWithRandomAssign(const double* feature_values,
                                         PredictionControlParameter* predict_params) const {
  // Nothing to randomize without control parameters: use the deterministic traversal.
  if (predict_params == nullptr) {
    return GetLeaf(feature_values);
  }
  bool random_assign_enabled = false;
  int node = 0;
  // Deterministic descent until a leaf (negative node) or a random-assign split feature is reached.
  if (num_cat_ > 0) {
    while (node >= 0) {
      if (predict_params->EnableRandomAssign(split_feature_[node])) {
        random_assign_enabled = true;
        break;
      }
      node = Decision(feature_values[split_feature_[node]], node);
    }
  } else {
    while (node >= 0) {
      if (predict_params->EnableRandomAssign(split_feature_[node])) {
        random_assign_enabled = true;
        break;
      }
      node = NumericalDecision(feature_values[split_feature_[node]], node);
    }
  }
  if (random_assign_enabled) {
    // One engine per thread, seeded once on first use. Building a std::random_device and
    // re-seeding a std::mt19937 on every call would repeat that work for each tree of each row.
    static thread_local std::mt19937 generator(std::random_device{}());
    std::bernoulli_distribution random_assign_distribution(0.5);

    // From the triggering node downwards, ignore feature values and pick a child by a fair coin flip.
    while (node >= 0) {
      node = random_assign_distribution(generator) ? left_child_[node] : right_child_[node];
    }
  }
  // Leaves are stored as the bitwise complement of their index.
  return ~node;
}

/*!
* \brief Find the leaf for a sparse (map) record, with random left/right assignment below the
*        first node whose split feature is enabled in predict_params
* \param feature_values Feature values of this record keyed by feature index; absent keys read as 0
* \param predict_params Control parameters for prediction; if nullptr, falls back to GetLeafByMap
* \return Leaf index
*/
inline int Tree::GetLeafByMapWithRandomAssign(const std::unordered_map<int, double>& feature_values,
                                              PredictionControlParameter* predict_params) const {
  // Nothing to randomize without control parameters: use the deterministic traversal.
  if (predict_params == nullptr) {
    return GetLeafByMap(feature_values);
  }
  // Single hash lookup per node; features missing from the map are treated as 0.
  const auto value_or_zero = [&feature_values](int feature) -> double {
    const auto it = feature_values.find(feature);
    return it != feature_values.end() ? it->second : 0.0;
  };
  bool random_assign_enabled = false;
  int node = 0;
  // Deterministic descent until a leaf (negative node) or a random-assign split feature is reached.
  if (num_cat_ > 0) {
    while (node >= 0) {
      if (predict_params->EnableRandomAssign(split_feature_[node])) {
        random_assign_enabled = true;
        break;
      }
      node = Decision(value_or_zero(split_feature_[node]), node);
    }
  } else {
    while (node >= 0) {
      if (predict_params->EnableRandomAssign(split_feature_[node])) {
        random_assign_enabled = true;
        break;
      }
      node = NumericalDecision(value_or_zero(split_feature_[node]), node);
    }
  }
  if (random_assign_enabled) {
    // One engine per thread, seeded once on first use. Building a std::random_device and
    // re-seeding a std::mt19937 on every call would repeat that work for each tree of each row.
    static thread_local std::mt19937 generator(std::random_device{}());
    std::bernoulli_distribution random_assign_distribution(0.5);

    // From the triggering node downwards, ignore feature values and pick a child by a fair coin flip.
    while (node >= 0) {
      node = random_assign_distribution(generator) ? left_child_[node] : right_child_[node];
    }
  }
  // Leaves are stored as the bitwise complement of their index.
  return ~node;
}

} // namespace LightGBM

#endif // LightGBM_TREE_H_
3 changes: 2 additions & 1 deletion src/application/application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ void Application::LoadData() {
PredictFunction predict_fun = nullptr;
// need to continue training
if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) {
predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1));
PredictionControlParameter predict_params;
predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1, predict_params));
predict_fun = predictor->GetPredictFunction();
}

Expand Down
12 changes: 7 additions & 5 deletions src/application/predictor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Predictor {
*/
Predictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score,
bool predict_leaf_index, bool predict_contrib, bool early_stop,
int early_stop_freq, double early_stop_margin) {
int early_stop_freq, double early_stop_margin, PredictionControlParameter& predict_params) {
early_stop_ = CreatePredictionEarlyStopInstance(
"none", LightGBM::PredictionEarlyStopConfig());
if (early_stop && !boosting->NeedAccuratePrediction()) {
Expand All @@ -58,6 +58,7 @@ class Predictor {
}
}

predict_params_ = std::move(predict_params);
boosting->InitPredict(start_iteration, num_iteration, predict_contrib);
boosting_ = boosting;
num_pred_one_row_ = boosting_->NumPredictOneRow(start_iteration,
Expand Down Expand Up @@ -113,11 +114,11 @@ class Predictor {
if (num_feature_ > kFeatureThreshold &&
features.size() < KSparseThreshold) {
auto buf = CopyToPredictMap(features);
boosting_->PredictRawByMap(buf, output, &early_stop_);
boosting_->PredictRawByMap(buf, output, &early_stop_, &predict_params_);
} else {
CopyToPredictBuffer(predict_buf_[tid].data(), features);
boosting_->PredictRaw(predict_buf_[tid].data(), output,
&early_stop_);
&early_stop_, &predict_params_);
ClearPredictBuffer(predict_buf_[tid].data(),
predict_buf_[tid].size(), features);
}
Expand All @@ -129,10 +130,10 @@ class Predictor {
if (num_feature_ > kFeatureThreshold &&
features.size() < KSparseThreshold) {
auto buf = CopyToPredictMap(features);
boosting_->PredictByMap(buf, output, &early_stop_);
boosting_->PredictByMap(buf, output, &early_stop_, &predict_params_);
} else {
CopyToPredictBuffer(predict_buf_[tid].data(), features);
boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_);
boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_, &predict_params_);
ClearPredictBuffer(predict_buf_[tid].data(),
predict_buf_[tid].size(), features);
}
Expand Down Expand Up @@ -292,6 +293,7 @@ class Predictor {
PredictFunction predict_fun_;
PredictSparseFunction predict_sparse_fun_;
PredictionEarlyStopInstance early_stop_;
PredictionControlParameter predict_params_;
int num_feature_;
int num_pred_one_row_;
std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
Expand Down
16 changes: 9 additions & 7 deletions src/boosting/gbdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,17 +290,19 @@ class GBDT : public GBDTBase {
return num_pred_in_one_row;
}

void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
void PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* predict_params) const override;

void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;
const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* predict_params) const override;

void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
void Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* predict_params) const override;

void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;
void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* predict_params) const override;

void PredictLeafIndex(const double* features, double* output) const override;

Expand Down
Loading

0 comments on commit 3821952

Please sign in to comment.