diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h
index 1bfc18b4470b..f97b511b42f9 100644
--- a/include/LightGBM/boosting.h
+++ b/include/LightGBM/boosting.h
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -131,11 +132,14 @@ class LIGHTGBM_EXPORT Boosting {
   * \param output Prediction result for this record
   * \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
+  * \param predict_params Control parameters for prediction
   */
   virtual void PredictRaw(const double* features, double* output,
-                          const PredictionEarlyStopInstance* early_stop) const = 0;
+                          const PredictionEarlyStopInstance* early_stop,
+                          const PredictionControlParameter* predict_params) const = 0;
   virtual void PredictRawByMap(const std::unordered_map& features, double* output,
-                               const PredictionEarlyStopInstance* early_stop) const = 0;
+                               const PredictionEarlyStopInstance* early_stop,
+                               const PredictionControlParameter* predict_params) const = 0;
   /*!
@@ -144,11 +148,14 @@ class LIGHTGBM_EXPORT Boosting {
   * \param output Prediction result for this record
   * \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
+  * \param predict_params Control parameters for prediction
   */
   virtual void Predict(const double* features, double* output,
-                       const PredictionEarlyStopInstance* early_stop) const = 0;
+                       const PredictionEarlyStopInstance* early_stop,
+                       const PredictionControlParameter* predict_params) const = 0;
   virtual void PredictByMap(const std::unordered_map& features, double* output,
-                            const PredictionEarlyStopInstance* early_stop) const = 0;
+                            const PredictionEarlyStopInstance* early_stop,
+                            const PredictionControlParameter* predict_params) const = 0;
   /*!
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index e01578396259..fbb2b8268d6b 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -859,6 +859,12 @@ struct Config {
   // desc = **Note**: can be used only in CLI version
   std::string output_result = "LightGBM_predict_result.txt";
+  // [no-save]
+  // desc = used only in ``prediction`` task
+  // desc = Split features of internal nodes that enable left-right random assignment mechanism.
+  // desc = See section "A surrogate VIMP" of the paper https://doi.org/10.1214/07-EJS039 for more details.
+  std::vector<int> random_assign_features;
+
   #ifndef __NVCC__
   #pragma endregion
diff --git a/include/LightGBM/prediction_control_parameter.h b/include/LightGBM/prediction_control_parameter.h
new file mode 100644
index 000000000000..6de370f32067
--- /dev/null
+++ b/include/LightGBM/prediction_control_parameter.h
@@ -0,0 +1,45 @@
+/*!
+ * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+#ifndef LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
+#define LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+namespace LightGBM {
+
+/*!
+* \brief Control parameters for prediction, used to implement variants of prediction algorithm
+*/
+struct PredictionControlParameter {
+  PredictionControlParameter() = default;
+
+  /*!
+  * \brief Build the control parameters from the features with random assignment enabled
+  * \param ra_features Real indices of those split features; taken by value so that
+  *        lvalues are copied and rvalues are moved into the member
+  */
+  explicit PredictionControlParameter(std::vector<int> ra_features)
+    : random_assign_features(std::move(ra_features)) {}
+
+  /*!
+  * \brief try to enable random assignment mechanism with the split feature of current node
+  * \param split_feat_idx real index of the split feature
+  * \return true if split_feat_idx is listed in random_assign_features
+  */
+  inline bool EnableRandomAssign(int split_feat_idx) const {
+    // plain linear scan: the list is kept in the order given by the caller (unsorted)
+    return (std::find(random_assign_features.begin(), random_assign_features.end(), split_feat_idx)
+            != random_assign_features.end());
+  }
+
+  /*! \brief Real indices of the split features for which random assignment is enabled */
+  std::vector<int> random_assign_features;
+};
+
+}  // namespace LightGBM
+
+#endif  // LIGHTGBM_PREDICTION_CONTROL_PARAMETER_H_
diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h
index 13b3c41a2309..336aeac5d23e 100644
--- a/include/LightGBM/tree.h
+++ b/include/LightGBM/tree.h
@@ -7,12 +7,14 @@
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 namespace LightGBM {
@@ -128,10 +130,12 @@ class Tree {
   /*!
   * \brief Prediction on one record
   * \param feature_values Feature value of this record
+  * \param predict_params Control parameters for prediction
   * \return Prediction result
   */
-  inline double Predict(const double* feature_values) const;
-  inline double PredictByMap(const std::unordered_map& feature_values) const;
+  inline double Predict(const double* feature_values, const PredictionControlParameter* predict_params) const;
+  inline double PredictByMap(const std::unordered_map& feature_values,
+                             const PredictionControlParameter* predict_params) const;
   inline int PredictLeafIndex(const double* feature_values) const;
   inline int PredictLeafIndexByMap(const std::unordered_map& feature_values) const;
@@ -422,6 +426,16 @@ class Tree {
   inline int GetLeaf(const double* feature_values) const;
   inline int GetLeafByMap(const std::unordered_map& feature_values) const;
+  /*!
+  * \brief Find the leaf index a record falls into under the random assignment mechanism
+  * \param feature_values Feature value of this record
+  * \param predict_params Control parameters for prediction
+  * \return Leaf index
+  */
+  inline int GetLeafWithRandomAssign(const double* feature_values, const PredictionControlParameter* predict_params) const;
+  inline int GetLeafByMapWithRandomAssign(const std::unordered_map& feature_values,
+                                          const PredictionControlParameter* predict_params) const;
+
   /*! \brief Serialize one node to json*/
   std::string NodeToJSON(int index) const;
@@ -582,9 +596,14 @@ inline void Tree::Split(int leaf, int feature, int real_feature,
   }
 }
-inline double Tree::Predict(const double* feature_values) const {
+inline double Tree::Predict(const double* feature_values, const PredictionControlParameter* predict_params) const {
+  int leaf = 0;
+  if (num_leaves_ > 1) {
+    leaf = (predict_params == nullptr || predict_params->random_assign_features.empty())
+      ? GetLeaf(feature_values)
+      : GetLeafWithRandomAssign(feature_values, predict_params);
+  }
   if (is_linear_) {
-    int leaf = (num_leaves_ > 1) ? GetLeaf(feature_values) : 0;
     double output = leaf_const_[leaf];
     bool nan_found = false;
     for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
@@ -604,7 +623,6 @@ inline double Tree::Predict(const double* feature_values) const {
     }
   } else {
     if (num_leaves_ > 1) {
-      int leaf = GetLeaf(feature_values);
       return LeafOutput(leaf);
     } else {
       return leaf_value_[0];
@@ -612,9 +630,15 @@ inline double Tree::Predict(const double* feature_values) const {
   }
 }
-inline double Tree::PredictByMap(const std::unordered_map& feature_values) const {
+inline double Tree::PredictByMap(const std::unordered_map& feature_values,
+                                 const PredictionControlParameter* predict_params) const {
+  int leaf = 0;
+  if (num_leaves_ > 1) {
+    leaf = (predict_params == nullptr || predict_params->random_assign_features.empty())
+      ? GetLeafByMap(feature_values)
+      : GetLeafByMapWithRandomAssign(feature_values, predict_params);
+  }
   if (is_linear_) {
-    int leaf = (num_leaves_ > 1) ? GetLeafByMap(feature_values) : 0;
     double output = leaf_const_[leaf];
     bool nan_found = false;
     for (size_t i = 0; i < leaf_features_[leaf].size(); ++i) {
@@ -637,7 +661,6 @@ inline double Tree::PredictByMap(const std::unordered_map& feature_
     }
   } else {
     if (num_leaves_ > 1) {
-      int leaf = GetLeafByMap(feature_values);
       return LeafOutput(leaf);
     } else {
       return leaf_value_[0];
@@ -724,6 +747,90 @@ inline int Tree::GetLeafByMap(const std::unordered_map& feature_val
   return ~node;
 }
+// Walks the tree with the regular decisions until a node splits on a feature enabled in predict_params;
+// from that node down to the leaf every branch is chosen by a fair coin flip.
+inline int Tree::GetLeafWithRandomAssign(const double* feature_values,
+                                         const PredictionControlParameter* predict_params) const {
+  if (predict_params == nullptr) {
+    return GetLeaf(feature_values);
+  }
+  bool random_assign_enabled = false;
+  int node = 0;
+  if (num_cat_ > 0) {
+    while (node >= 0) {
+      if (predict_params->EnableRandomAssign(split_feature_[node])) {
+        random_assign_enabled = true;
+        break;
+      }
+      node = Decision(feature_values[split_feature_[node]], node);
+    }
+  } else {
+    while (node >= 0) {
+      if (predict_params->EnableRandomAssign(split_feature_[node])) {
+        random_assign_enabled = true;
+        break;
+      }
+      node = NumericalDecision(feature_values[split_feature_[node]], node);
+    }
+  }
+  if (random_assign_enabled) {
+    // one engine per thread, seeded once: building a std::random_device on every call is wasteful
+    thread_local std::mt19937 generator(std::random_device{}());
+    std::bernoulli_distribution random_assign_distribution(0.5);
+
+    while (node >= 0) {
+      if (random_assign_distribution(generator)) {
+        node = left_child_[node];
+      } else {
+        node = right_child_[node];
+      }
+    }
+  }
+  return ~node;
+}
+
+// Map-based variant of GetLeafWithRandomAssign: a feature missing from feature_values is read as 0.0f
+// while the regular decisions are followed, then the same coin-flip descent is applied.
+inline int Tree::GetLeafByMapWithRandomAssign(const std::unordered_map& feature_values,
+                                              const PredictionControlParameter* predict_params) const {
+  if (predict_params == nullptr) {
+    return GetLeafByMap(feature_values);
+  }
+  bool random_assign_enabled = false;
+  int node = 0;
+  if (num_cat_ > 0) {
+    while (node >= 0) {
+      if (predict_params->EnableRandomAssign(split_feature_[node])) {
+        random_assign_enabled = true;
+        break;
+      }
+      node = Decision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
+    }
+  } else {
+    while (node >= 0) {
+      if (predict_params->EnableRandomAssign(split_feature_[node])) {
+        random_assign_enabled = true;
+        break;
+      }
+      node = NumericalDecision(feature_values.count(split_feature_[node]) > 0 ? feature_values.at(split_feature_[node]) : 0.0f, node);
+    }
+  }
+  if (random_assign_enabled) {
+    // one engine per thread, seeded once: building a std::random_device on every call is wasteful
+    thread_local std::mt19937 generator(std::random_device{}());
+    std::bernoulli_distribution random_assign_distribution(0.5);
+
+    while (node >= 0) {
+      if (random_assign_distribution(generator)) {
+        node = left_child_[node];
+      } else {
+        node = right_child_[node];
+      }
+    }
+  }
+  return ~node;
+}
+
 }  // namespace LightGBM
 #endif  // LightGBM_TREE_H_
diff --git a/src/application/application.cpp b/src/application/application.cpp
index 3e51136afc96..857b479b2755 100644
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -92,7 +92,8 @@ void Application::LoadData() {
   PredictFunction predict_fun = nullptr;
   // need to continue training
   if (boosting_->NumberOfTotalModel() > 0 && config_.task != TaskType::KRefitTree) {
-    predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1));
+    PredictionControlParameter predict_params;
+    predictor.reset(new Predictor(boosting_.get(), 0, -1, true, false, false, false, -1, -1, predict_params));
     predict_fun = predictor->GetPredictFunction();
   }
@@ -221,7 +222,8 @@ void Application::Train() {
 void Application::Predict() {
   if (config_.task == TaskType::KRefitTree) {
     // create predictor
-    Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1);
+    PredictionControlParameter predict_params;
+    Predictor predictor(boosting_.get(), 0, -1, false, true, false, false, 1, 1, predict_params);
     predictor.Predict(config_.data.c_str(),
config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check,
                       config_.precise_float_parser);
     TextReader result_reader(config_.output_result.c_str(), false);
@@ -248,10 +250,12 @@ void Application::Predict() {
     Log::Info("Finished RefitTree");
   } else {
     // create predictor
+    // forward the configured random assignment features; the explicit copy leaves config_ untouched
+    PredictionControlParameter predict_params{std::vector<int>(config_.random_assign_features)};
     Predictor predictor(boosting_.get(), config_.start_iteration_predict, config_.num_iteration_predict,
                         config_.predict_raw_score, config_.predict_leaf_index, config_.predict_contrib,
                         config_.pred_early_stop, config_.pred_early_stop_freq,
-                        config_.pred_early_stop_margin);
+                        config_.pred_early_stop_margin, predict_params);
     predictor.Predict(config_.data.c_str(),
                       config_.output_result.c_str(), config_.header, config_.predict_disable_shape_check,
                       config_.precise_float_parser);
diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp
index d1a8aca4d041..6902c8e352bc 100644
--- a/src/application/predictor.hpp
+++ b/src/application/predictor.hpp
@@ -40,7 +40,8 @@ class Predictor {
+  * \param predict_params Control parameters for prediction; taken by value and stored in the predictor
   */
   Predictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score,
             bool predict_leaf_index, bool predict_contrib, bool early_stop,
-            int early_stop_freq, double early_stop_margin) {
+            int early_stop_freq, double early_stop_margin, PredictionControlParameter predict_params) {
     early_stop_ = CreatePredictionEarlyStopInstance(
       "none", LightGBM::PredictionEarlyStopConfig());
     if (early_stop && !boosting->NeedAccuratePrediction()) {
@@ -58,6 +59,7 @@ class Predictor {
       }
     }
+    predict_params_ = std::move(predict_params);
     boosting->InitPredict(start_iteration, num_iteration, predict_contrib);
     boosting_ = boosting;
     num_pred_one_row_ = boosting_->NumPredictOneRow(start_iteration,
@@ -113,11 +115,11 @@ class Predictor {
         if (num_feature_ > kFeatureThreshold &&
             features.size() < KSparseThreshold) {
           auto buf = CopyToPredictMap(features);
-          boosting_->PredictRawByMap(buf, output, &early_stop_);
+          boosting_->PredictRawByMap(buf, output, &early_stop_, &predict_params_);
         } else {
           CopyToPredictBuffer(predict_buf_[tid].data(), features);
           boosting_->PredictRaw(predict_buf_[tid].data(), output,
-                                &early_stop_);
+                                &early_stop_, &predict_params_);
           ClearPredictBuffer(predict_buf_[tid].data(),
                              predict_buf_[tid].size(), features);
         }
@@ -129,10 +131,10 @@ class Predictor {
         if (num_feature_ > kFeatureThreshold &&
             features.size() < KSparseThreshold) {
           auto buf = CopyToPredictMap(features);
-          boosting_->PredictByMap(buf, output, &early_stop_);
+          boosting_->PredictByMap(buf, output, &early_stop_, &predict_params_);
         } else {
           CopyToPredictBuffer(predict_buf_[tid].data(), features);
-          boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_);
+          boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_, &predict_params_);
           ClearPredictBuffer(predict_buf_[tid].data(),
                              predict_buf_[tid].size(), features);
         }
@@ -292,6 +294,7 @@ class Predictor {
   PredictFunction predict_fun_;
   PredictSparseFunction predict_sparse_fun_;
   PredictionEarlyStopInstance early_stop_;
+  PredictionControlParameter predict_params_;
   int num_feature_;
   int num_pred_one_row_;
   std::vector>> predict_buf_;
diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h
index e38b26be3e14..02cf3c6243bf 100644
--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -290,17 +290,19 @@ class GBDT : public GBDTBase {
     return num_pred_in_one_row;
   }
-  void PredictRaw(const double* features, double* output,
-                  const PredictionEarlyStopInstance* earlyStop) const override;
+  void PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
+                  const PredictionControlParameter* predict_params) const override;
   void PredictRawByMap(const std::unordered_map& features, double* output,
-                       const PredictionEarlyStopInstance* early_stop) const override;
+                       const PredictionEarlyStopInstance* early_stop,
+                       const PredictionControlParameter* predict_params) const override;
-  void Predict(const double* features, double* output,
-               const
PredictionEarlyStopInstance* earlyStop) const override;
+  void Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop,
+               const PredictionControlParameter* predict_params) const override;
-  void PredictByMap(const std::unordered_map& features, double* output,
-                    const PredictionEarlyStopInstance* early_stop) const override;
+  void PredictByMap(const std::unordered_map& features, double* output,
+                    const PredictionEarlyStopInstance* early_stop,
+                    const PredictionControlParameter* predict_params) const override;
   void PredictLeafIndex(const double* features, double* output) const override;
diff --git a/src/boosting/gbdt_prediction.cpp b/src/boosting/gbdt_prediction.cpp
index f1f7478cfcde..f2424e726d24 100644
--- a/src/boosting/gbdt_prediction.cpp
+++ b/src/boosting/gbdt_prediction.cpp
@@ -10,7 +10,10 @@
 namespace LightGBM {
-void GBDT::PredictRaw(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
+// Accumulates the raw tree outputs of the selected iterations; predict_params is handed to every tree
+void GBDT::PredictRaw(const double* features, double* output,
+                      const PredictionEarlyStopInstance* early_stop,
+                      const PredictionControlParameter* predict_params) const {
   int early_stop_round_counter = 0;
   // set zero
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
@@ -18,7 +21,7 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
   for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
-      output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features);
+      output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(features, predict_params);
     }
     // check early stopping
     ++early_stop_round_counter;
@@ -31,7 +34,10 @@ void GBDT::PredictRaw(const double* features, double* output, const PredictionEa
   }
 }
-void GBDT::PredictRawByMap(const std::unordered_map& features, double* output, const PredictionEarlyStopInstance* early_stop) const {
+// Map-based variant of PredictRaw: the record is given as a feature map instead of a dense array
+void GBDT::PredictRawByMap(const std::unordered_map& features, double* output,
+                           const PredictionEarlyStopInstance* early_stop,
+                           const PredictionControlParameter* predict_params) const {
   int early_stop_round_counter = 0;
   // set zero
   std::memset(output, 0, sizeof(double) * num_tree_per_iteration_);
@@ -39,7 +45,7 @@ void GBDT::PredictRawByMap(const std::unordered_map& features, doub
   for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) {
     // predict all the trees for one iteration
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
-      output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features);
+      output[k] += models_[i * num_tree_per_iteration_ + k]->PredictByMap(features, predict_params);
     }
     // check early stopping
     ++early_stop_round_counter;
@@ -52,8 +58,11 @@ void GBDT::PredictRawByMap(const std::unordered_map& features, doub
   }
 }
-void GBDT::Predict(const double* features, double* output, const PredictionEarlyStopInstance* early_stop) const {
-  PredictRaw(features, output, early_stop);
+// Raw prediction first; the result is divided by num_iteration_for_pred_ when average_output_ is set
+void GBDT::Predict(const double* features, double* output,
+                   const PredictionEarlyStopInstance* early_stop,
+                   const PredictionControlParameter* predict_params) const {
+  PredictRaw(features, output, early_stop, predict_params);
   if (average_output_) {
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
       output[k] /= num_iteration_for_pred_;
@@ -64,8 +73,11 @@ void GBDT::Predict(const double* features, double* output, const PredictionEarly
   }
 }
-void GBDT::PredictByMap(const std::unordered_map& features, double* output, const PredictionEarlyStopInstance* early_stop) const {
-  PredictRawByMap(features, output, early_stop);
+// Map-based variant of Predict, built on PredictRawByMap
+void GBDT::PredictByMap(const std::unordered_map& features, double* output,
+                        const PredictionEarlyStopInstance* early_stop,
+                        const PredictionControlParameter* predict_params) const {
+  PredictRawByMap(features, output, early_stop, predict_params);
   if (average_output_) {
     for (int k = 0; k < num_tree_per_iteration_; ++k) {
output[k] /= num_iteration_for_pred_; diff --git a/src/c_api.cpp b/src/c_api.cpp index 442247d7a9dd..fed7229c35b3 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -78,8 +78,9 @@ class SingleRowPredictor { early_stop_freq_ = config.pred_early_stop_freq; early_stop_margin_ = config.pred_early_stop_margin; iter_ = num_iter; + PredictionControlParameter predict_params; predictor_.reset(new Predictor(boosting, start_iter, iter_, is_raw_score, is_predict_leaf, predict_contrib, - early_stop_, early_stop_freq_, early_stop_margin_)); + early_stop_, early_stop_freq_, early_stop_margin_, predict_params)); num_pred_in_one_row = boosting->NumPredictOneRow(start_iter, iter_, is_predict_leaf, predict_contrib); predict_function = predictor_->GetPredictFunction(); num_total_model_ = boosting->NumberOfTotalModel(); @@ -417,8 +418,9 @@ class Booster { is_raw_score = false; } + PredictionControlParameter predict_params(config.random_assign_features); return Predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin, predict_params); } void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol, @@ -707,8 +709,10 @@ class Booster { } else { is_raw_score = false; } + PredictionControlParameter predict_params(config.random_assign_features); Predictor predictor(boosting_.get(), start_iteration, num_iteration, is_raw_score, is_predict_leaf, predict_contrib, - config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); + config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin, + predict_params); bool bool_data_has_header = data_has_header > 0 ? 
true : false; predictor.Predict(data_filename, result_filename, bool_data_has_header, config.predict_disable_shape_check, config.precise_float_parser); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 0906ba4b6439..69298555e005 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -576,6 +576,10 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); + } + GetString(params, "convert_model_language", &convert_model_language); GetString(params, "convert_model", &convert_model);