Skip to content

Commit

Permalink
introduce random assignment prediction(microsoft#5177)
Browse files Browse the repository at this point in the history
  • Loading branch information
suncanghuai committed Aug 27, 2023
1 parent 858eeb5 commit 772c655
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 27 deletions.
13 changes: 7 additions & 6 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <LightGBM/config.h>
#include <LightGBM/meta.h>
#include <LightGBM/prediction_control_parameter.h>

#include <string>
#include <map>
Expand Down Expand Up @@ -131,11 +132,11 @@ class LIGHTGBM_EXPORT Boosting {
* \param output Prediction result for this record
* \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
*/
// Raw-score prediction for a single record.
// The overload taking `param` additionally receives the prediction control parameters
// (see PredictionControlParameter), which select variants of the prediction algorithm.
virtual void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
virtual void PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* param) const = 0;

// Same as PredictRaw, but the record is passed as an unordered_map<int, double> instead of a dense array.
virtual void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
const PredictionEarlyStopInstance* early_stop, const PredictionControlParameter* param) const = 0;


/*!
Expand All @@ -144,11 +145,11 @@ class LIGHTGBM_EXPORT Boosting {
* \param output Prediction result for this record
* \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
*/
// Prediction for a single record.
// The overload taking `param` additionally receives the prediction control parameters
// (see PredictionControlParameter), which select variants of the prediction algorithm.
virtual void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
virtual void Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* param) const = 0;

// Same as Predict, but the record is passed as an unordered_map<int, double> instead of a dense array.
virtual void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
const PredictionEarlyStopInstance* early_stop, const PredictionControlParameter* param) const = 0;


/*!
Expand Down
6 changes: 6 additions & 0 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,12 @@ struct Config {
// desc = **Note**: can be used only in CLI version
std::string output_result = "LightGBM_predict_result.txt";

// [no-save]
// desc = used only in ``prediction`` task
// desc = Split features of internal nodes that enable the left-right random assignment mechanism.
// desc = See section "A surrogate VIMP" of the paper https://doi.org/10.1214/07-EJS039 for more details.
std::vector<int> random_assign_features;

#ifndef __NVCC__
#pragma endregion

Expand Down
30 changes: 30 additions & 0 deletions include/LightGBM/prediction_control_parameter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
#define LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_


#include <utility>
#include <vector>

namespace LightGBM {

/*!
* \brief Control parameters for prediction, used to implement variants of the prediction algorithm
*/
struct PredictionControlParameter {
  /*! \brief Default state: random assignment disabled and no features selected */
  PredictionControlParameter() = default;

  /*!
  * \brief Build the parameters from a list of random-assign features
  * \param ra_features Features for which random assignment is enabled; its storage is taken over
  */
  PredictionControlParameter(std::vector<int> &&ra_features)
      // A named rvalue reference is an lvalue, so std::move is required to avoid a copy.
      : random_assign_features(std::move(ra_features)) {
    enable_random_assign = !random_assign_features.empty();
  }

  /*! \brief True when at least one random-assign feature was supplied */
  bool enable_random_assign = false;
  /*! \brief Features for which random assignment is enabled */
  std::vector<int> random_assign_features;
};

} // namespace LightGBM

#endif  // LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
78 changes: 78 additions & 0 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>
#include <LightGBM/prediction_control_parameter.h>

#include <string>
#include <map>
Expand Down Expand Up @@ -133,6 +134,16 @@ class Tree {
inline double Predict(const double* feature_values) const;
inline double PredictByMap(const std::unordered_map<int, double>& feature_values) const;

/*!
* \brief A variant of prediction, which applies the random assign mechanism
* \param feature_values Feature value of this record
* \param param Control parameters for prediction
* \return Prediction result
*/
inline double RandomAssignPredict(const double* feature_values, PredictionControlParameter* param) const;
inline double RandomAssignPredictByMap(const std::unordered_map<int, double>& feature_values,
PredictionControlParameter* param) const;

inline int PredictLeafIndex(const double* feature_values) const;
inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

Expand Down Expand Up @@ -645,6 +656,73 @@ inline double Tree::PredictByMap(const std::unordered_map<int, double>& feature_
}
}

/*!
* \brief Predict one record, choosing the leaf of a linear tree via GetLeafUnderRandomAssign
* \param feature_values Dense feature values of this record
* \param param Control parameters for prediction (not read by this body)
* \return Prediction result
*/
inline double Tree::RandomAssignPredict(const double* feature_values, PredictionControlParameter* param) const {
  double leaf_output;
  if (is_linear_) {
    int leaf = (num_leaves_ > 1) ? GetLeafUnderRandomAssign(feature_values) : 0;
    // Linear leaf model: constant term plus one coefficient per leaf feature.
    double output = leaf_const_[leaf];
    bool nan_found = false;
    for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
      int feat_raw = leaf_features_[leaf][i];
      double feat_val = feature_values[feat_raw];
      if (std::isnan(feat_val)) {
        nan_found = true;
        break;
      }
      output += leaf_coeff_[leaf][i] * feat_val;
    }
    // A NaN input invalidates the linear model, so fall back to the plain leaf output.
    leaf_output = nan_found ? LeafOutput(leaf) : output;
  } else {
    if (num_leaves_ > 1) {
      // NOTE(review): this branch uses the regular GetLeaf lookup, not GetLeafUnderRandomAssign;
      // confirm whether random assignment was meant to apply to non-linear trees as well.
      int leaf = GetLeaf(feature_values);
      leaf_output = LeafOutput(leaf);
    } else {
      leaf_output = leaf_value_[0];
    }
  }
  // The computed value was previously never returned (undefined behaviour in a non-void function).
  return leaf_output;
}

/*!
* \brief Map-based counterpart of RandomAssignPredict
* \param feature_values Feature values of this record, keyed by feature
* \param param Control parameters for prediction (not read by this body)
* \return Prediction result
*/
inline double Tree::RandomAssignPredictByMap(const std::unordered_map<int, double>& feature_values,
                                             PredictionControlParameter* param) const {
  // Non-linear trees: the result is simply the output of the reached leaf.
  if (!is_linear_) {
    if (num_leaves_ <= 1) {
      return leaf_value_[0];
    }
    const int reached_leaf = GetLeafByMap(feature_values);
    return LeafOutput(reached_leaf);
  }

  // Linear trees: locate the leaf, then evaluate its linear model.
  int leaf_idx = 0;
  if (num_leaves_ > 1) {
    leaf_idx = GetLeafByMapUnderRandomAssign(feature_values);
  }
  double linear_sum = leaf_const_[leaf_idx];
  for (size_t j = 0; j < leaf_features_[leaf_idx].size(); ++j) {
    const auto entry = feature_values.find(leaf_features_[leaf_idx][j]);
    if (entry == feature_values.end()) {
      // Features missing from the map contribute nothing.
      continue;
    }
    const double value = entry->second;
    if (std::isnan(value)) {
      // A NaN input invalidates the linear model; use the plain leaf output instead.
      return LeafOutput(leaf_idx);
    }
    linear_sum += leaf_coeff_[leaf_idx][j] * value;
  }
  return linear_sum;
}

inline int Tree::PredictLeafIndex(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
Expand Down
3 changes: 2 additions & 1 deletion src/application/application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ void Application::LoadData() {
PredictFunction predict_fun = nullptr;
// need to continue training
if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) {
predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1));
PredictionControlParameter param;
predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1, param));
predict_fun = predictor->GetPredictFunction();
}

Expand Down
12 changes: 7 additions & 5 deletions src/application/predictor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Predictor {
*/
Predictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score,
bool predict_leaf_index, bool predict_contrib, bool early_stop,
int early_stop_freq, double early_stop_margin) {
int early_stop_freq, double early_stop_margin, PredictionControlParameter& predict_param) {
early_stop_ = CreatePredictionEarlyStopInstance(
"none", LightGBM::PredictionEarlyStopConfig());
if (early_stop && !boosting->NeedAccuratePrediction()) {
Expand All @@ -58,6 +58,7 @@ class Predictor {
}
}

// Store the caller's prediction control parameters. The previous self-move
// (predict_param_ = std::move(predict_param_)) never captured the constructor argument.
// A copy is used so the caller's object, received by non-const reference, is left intact.
predict_param_ = predict_param;
boosting->InitPredict(start_iteration, num_iteration, predict_contrib);
boosting_ = boosting;
num_pred_one_row_ = boosting_->NumPredictOneRow(start_iteration,
Expand Down Expand Up @@ -113,11 +114,11 @@ class Predictor {
if (num_feature_ > kFeatureThreshold &&
features.size() < KSparseThreshold) {
auto buf = CopyToPredictMap(features);
boosting_->PredictRawByMap(buf, output, &early_stop_);
boosting_->PredictRawByMap(buf, output, &early_stop_, &predict_param_);
} else {
CopyToPredictBuffer(predict_buf_[tid].data(), features);
boosting_->PredictRaw(predict_buf_[tid].data(), output,
&early_stop_);
&early_stop_, &predict_param_);
ClearPredictBuffer(predict_buf_[tid].data(),
predict_buf_[tid].size(), features);
}
Expand All @@ -129,10 +130,10 @@ class Predictor {
if (num_feature_ > kFeatureThreshold &&
features.size() < KSparseThreshold) {
auto buf = CopyToPredictMap(features);
boosting_->PredictByMap(buf, output, &early_stop_);
boosting_->PredictByMap(buf, output, &early_stop_, &predict_param_);
} else {
CopyToPredictBuffer(predict_buf_[tid].data(), features);
boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_);
// Pass the member predict_param_; the constructor argument predict_param is not in scope here.
boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_, &predict_param_);
ClearPredictBuffer(predict_buf_[tid].data(),
predict_buf_[tid].size(), features);
}
Expand Down Expand Up @@ -292,6 +293,7 @@ class Predictor {
PredictFunction predict_fun_;
PredictSparseFunction predict_sparse_fun_;
PredictionEarlyStopInstance early_stop_;
PredictionControlParameter predict_param_;
int num_feature_;
int num_pred_one_row_;
std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
Expand Down
14 changes: 7 additions & 7 deletions src/boosting/gbdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -290,17 +290,17 @@ class GBDT : public GBDTBase {
return num_pred_in_one_row;
}

// Prediction entry points that override Boosting's pure virtuals.
// They must be declared `override`, not `= 0`: re-declaring them pure would leave GBDT abstract.
void PredictRaw(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
void PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* param) const override;

void PredictRawByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;
const PredictionEarlyStopInstance* early_stop, const PredictionControlParameter* param) const override;

void Predict(const double* features, double* output,
const PredictionEarlyStopInstance* earlyStop) const override;
void Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
const PredictionControlParameter* param) const override;

void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop) const override;
void PredictByMap(const std::unordered_map<int, double>& features, double* output,
const PredictionEarlyStopInstance* early_stop, const PredictionControlParameter* param) const override;

void PredictLeafIndex(const double* features, double* output) const override;

Expand Down
34 changes: 26 additions & 8 deletions src/boosting/gbdt_prediction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,21 @@

namespace LightGBM {

void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
void GBDT::PredictRaw(const double *features, double *output,
const PredictionEarlyStopInstance *early_stop,
const PredictionControlParameter *param) const {
int early_stop_round_counter = 0;
// set zero
std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
// predict all the trees for one iteration
for (int k = 0; k < num_tree_per_iteration_; ++k) {
output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features);
// Use the random-assign variant only when requested; a null param falls back to regular prediction.
// Tree::RandomAssignPredict requires (features, param); the one-argument call did not match it.
// NOTE(review): the const_cast exists only because Tree::RandomAssignPredict takes a non-const
// PredictionControlParameter*; making that parameter const would remove the need for it.
if (param != nullptr && param->enable_random_assign) {
  output[k] += models_[i * num_tree_per_iteration_ + k]->RandomAssignPredict(
      features, const_cast<PredictionControlParameter*>(param));
} else {
  output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features);
}
}
// check early stopping
++early_stop_round_counter;
Expand All @@ -31,15 +37,22 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
}
}

void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {
void GBDT::PredictRawByMap(const std::unordered_map<int, double> &features,
double *output,
const PredictionEarlyStopInstance *early_stop,
const PredictionControlParameter *param) const {
int early_stop_round_counter = 0;
// set zero
std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
const int end_iteration_for_pred = start_iteration_for_pred_ + num_iteration_for_pred_;
for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
// predict all the trees for one iteration
for (int k = 0; k < num_tree_per_iteration_; ++k) {
output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features);
// Use the random-assign variant only when requested; a null param falls back to regular prediction.
// Tree::RandomAssignPredictByMap requires (features, param); the one-argument call did not match it.
// NOTE(review): the const_cast exists only because Tree::RandomAssignPredictByMap takes a non-const
// PredictionControlParameter*; making that parameter const would remove the need for it.
if (param != nullptr && param->enable_random_assign) {
  output[k] += models_[i * num_tree_per_iteration_ + k]->RandomAssignPredictByMap(
      features, const_cast<PredictionControlParameter*>(param));
} else {
  output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features);
}
}
// check early stopping
++early_stop_round_counter;
Expand All @@ -52,8 +65,10 @@ void GBDT::PredictRawByMap(const std::unordered_map<int, double>& features, doub
}
}

void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
PredictRaw(features, output, early_stop);
void GBDT::Predict(const double *features, double *output,
const PredictionEarlyStopInstance *early_stop,
const PredictionControlParameter *param) const {
PredictRaw(features, output, early_stop, param);
if (average_output_) {
for (int k = 0; k < num_tree_per_iteration_; ++k) {
output[k] /= num_iteration_for_pred_;
Expand All @@ -64,8 +79,11 @@ void GBDT::Predict(const double* features, double* output, const PredictionEarly
}
}

void GBDT::PredictByMap(const std::unordered_map<int, double>& features, double* output, const PredictionEarlyStopInstance* early_stop) const {
PredictRawByMap(features, output, early_stop);
void GBDT::PredictByMap(const std::unordered_map<int, double> &features,
double *output,
const PredictionEarlyStopInstance *early_stop,
const PredictionControlParameter *param) const {
PredictRawByMap(features, output, early_stop, param);
if (average_output_) {
for (int k = 0; k < num_tree_per_iteration_; ++k) {
output[k] /= num_iteration_for_pred_;
Expand Down
4 changes: 4 additions & 0 deletions src/io/config_auto.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str

GetString(params, "output_result", &output_result);

if (GetString(params, "random_assign_features", &tmp_str)) {
random_assign_features = Common::StringToArray<int>(tmp_str, ',');
}

GetString(params, "convert_model_language", &convert_model_language);

GetString(params, "convert_model", &convert_model);
Expand Down

0 comments on commit 772c655

Please sign in to comment.