diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index c04263f62c1c..ec2c067b64b8 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -46,6 +46,7 @@ OBJECTS = \ network/linkers_socket.o \ network/network.o \ treelearner/data_parallel_tree_learner.o \ + treelearner/feature_histogram.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ treelearner/gradient_discretizer.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 86d56fecdf34..ebcb40d1372a 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -47,6 +47,7 @@ OBJECTS = \ network/linkers_socket.o \ network/network.o \ treelearner/data_parallel_tree_learner.o \ + treelearner/feature_histogram.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ treelearner/gradient_discretizer.o \ diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 4f2ec456c937..ec7581e504c4 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -62,15 +62,17 @@ void MultiValBinWrapper::HistMove(const std::vector(origin_hist_data_) + hist_move_dest_[i] / 2); } } else if (HIST_BITS == 16) { - const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - - static_cast(num_bin_aligned_); if (is_use_subcol_) { + const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - + static_cast(num_bin_aligned_); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); } } else { + CHECK_EQ(INNER_HIST_BITS, 8); + const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2; int32_t* orig_ptr = reinterpret_cast(origin_hist_data_); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < num_bin_; ++i) { @@ -148,7 +150,7 @@ void MultiValBinWrapper::HistMerge(std::vector(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + int32_t* dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2; std::memset(reinterpret_cast(dst), 0, num_bin_ * kInt16HistBufferEntrySize); #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) for (int t = 0; t < n_bin_block; ++t) { diff --git a/src/treelearner/feature_histogram.cpp b/src/treelearner/feature_histogram.cpp new file mode 100644 index 000000000000..29d99b92010b --- /dev/null +++ b/src/treelearner/feature_histogram.cpp @@ -0,0 +1,739 @@ +/*! + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "feature_histogram.hpp" + +namespace LightGBM { + +void FeatureHistogram::FuncForCategorical() { + if (meta_->config->extra_trees) { + if (meta_->config->monotone_constraints.empty()) { + FuncForCategoricalL1(); + } else { + FuncForCategoricalL1(); + } + } else { + if (meta_->config->monotone_constraints.empty()) { + FuncForCategoricalL1(); + } else { + FuncForCategoricalL1(); + } + } +} + +template +void FeatureHistogram::FuncForCategoricalL1() { + if (meta_->config->path_smooth > kEpsilon) { + FuncForCategoricalL2(); + } else { + FuncForCategoricalL2(); + } +} + +template +void FeatureHistogram::FuncForCategoricalL2() { + if (meta_->config->use_quantized_grad) { +#define LAMBDA_PARAMS_INT \ + int64_t int_sum_gradient_and_hessian, \ + const double grad_scale, const double hess_scale, \ + const uint8_t hist_bits_bin, const uint8_t hist_bits_acc, \ + data_size_t num_data, \ + const FeatureConstraint* constraints, \ + double parent_output, \ + SplitInfo* output + +#define ARGUMENTS_INT \ + int_sum_gradient_and_hessian, grad_scale, hess_scale, num_data, constraints, parent_output, output + + if (meta_->config->lambda_l1 > 0) { + if (meta_->config->max_delta_step > 0) { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } + } else { + if (meta_->config->max_delta_step > 0) { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } + } +#undef LAMBDA_ARGUMENTS_INT +#undef ARGUMENTS_INT + } else { +#define ARGUMENTS \ + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ + std::placeholders::_4, std::placeholders::_5, std::placeholders::_6 + if (meta_->config->lambda_l1 > 0) { + if (meta_->config->max_delta_step > 0) { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, true, true, USE_SMOOTHING>, + this, ARGUMENTS); + } else { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, true, false, USE_SMOOTHING>, + this, ARGUMENTS); + } + } else { + if (meta_->config->max_delta_step > 0) { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, false, true, USE_SMOOTHING>, + this, ARGUMENTS); + } 
else { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, false, false, USE_SMOOTHING>, + this, ARGUMENTS); + } + } +#undef ARGUMENTS + } +} + +template +void FeatureHistogram::FindBestThresholdCategoricalInner(double sum_gradient, + double sum_hessian, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output) { + is_splittable_ = false; + output->default_left = false; + double best_gain = kMinScore; + data_size_t best_left_count = 0; + double best_sum_left_gradient = 0; + double best_sum_left_hessian = 0; + double gain_shift; + if (USE_MC) { + constraints->InitCumulativeConstraints(true); + } + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated + // with the larger categorical l2, whereas min_split_gain uses the original l2. + gain_shift = GetLeafGain(sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, + num_data, 0); + } + + double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + const int8_t offset = meta_->offset; + const int bin_start = 1 - offset; + const int bin_end = meta_->num_bin - offset; + int used_bin = -1; + + std::vector sorted_idx; + double l2 = meta_->config->lambda_l2; + bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; + int best_threshold = -1; + int best_dir = 1; + const double cnt_factor = num_data / sum_hessian; + int rand_threshold = 0; + if (use_onehot) { + if (USE_RAND) { + if (bin_end - bin_start > 0) { + rand_threshold = meta_->rand.NextInt(bin_start, bin_end); + } + } + for (int t = bin_start; t < bin_end; ++t) { + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = + static_cast(Common::RoundInt(hess * cnt_factor)); + // if data not enough, or sum hessian too small + if (cnt < meta_->config->min_data_in_leaf || + hess < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t other_count = num_data - cnt; + // if data not enough + if (other_count < meta_->config->min_data_in_leaf) { + continue; + } + + double sum_other_hessian = sum_hessian - hess - kEpsilon; + // if sum hessian too small + if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + + double sum_other_gradient = sum_gradient - grad; + if (USE_RAND) { + if (t != rand_threshold) { + continue; + } + } + // current split gain + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + best_threshold = t; + best_sum_left_gradient = grad; + best_sum_left_hessian = hess + kEpsilon; + best_left_count = cnt; + best_gain = current_gain; + } + } + } else { + for (int i = bin_start; i < bin_end; ++i) { + if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= + meta_->config->cat_smooth) { + sorted_idx.push_back(i); + } + } + used_bin = static_cast(sorted_idx.size()); + + l2 
+= meta_->config->cat_l2; + + auto ctr_fun = [this](double sum_grad, double sum_hess) { + return (sum_grad) / (sum_hess + meta_->config->cat_smooth); + }; + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), [this, &ctr_fun](int i, int j) { + return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < + ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); + }); + + std::vector find_direction(1, 1); + std::vector start_position(1, 0); + find_direction.push_back(-1); + start_position.push_back(used_bin - 1); + const int max_num_cat = + std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2); + int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0); + if (USE_RAND) { + if (max_threshold > 0) { + rand_threshold = meta_->rand.NextInt(0, max_threshold); + } + } + + is_splittable_ = false; + for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) { + auto dir = find_direction[out_i]; + auto start_pos = start_position[out_i]; + data_size_t min_data_per_group = meta_->config->min_data_per_group; + data_size_t cnt_cur_group = 0; + double sum_left_gradient = 0.0f; + double sum_left_hessian = kEpsilon; + data_size_t left_count = 0; + for (int i = 0; i < used_bin && i < max_num_cat; ++i) { + auto t = sorted_idx[start_pos]; + start_pos += dir; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = + static_cast(Common::RoundInt(hess * cnt_factor)); + + sum_left_gradient += grad; + sum_left_hessian += hess; + left_count += cnt; + cnt_cur_group += cnt; + + if (left_count < meta_->config->min_data_in_leaf || + sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + if (right_count < meta_->config->min_data_in_leaf || + right_count < min_data_per_group) { + break; + } + + double sum_right_hessian = sum_hessian - sum_left_hessian; + if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + if (cnt_cur_group < min_data_per_group) { + continue; + } + + cnt_cur_group = 0; + + double sum_right_gradient = sum_gradient - sum_left_gradient; + if (USE_RAND) { + if (i != rand_threshold) { + continue; + } + } + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, + left_count, right_count, parent_output); + if (current_gain <= min_gain_shift) { + continue; + } + is_splittable_ = true; + if (current_gain > best_gain) { + best_left_count = left_count; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + best_threshold = i; + best_gain = current_gain; + best_dir = dir; + } + } + } + } + + if (is_splittable_) { + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output); + output->left_count = best_left_count; + output->left_sum_gradient = best_sum_left_gradient; + output->left_sum_hessian = best_sum_left_hessian - kEpsilon; + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth, + num_data - best_left_count, parent_output); + 
output->right_count = num_data - best_left_count; + output->right_sum_gradient = sum_gradient - best_sum_left_gradient; + output->right_sum_hessian = + sum_hessian - best_sum_left_hessian - kEpsilon; + output->gain = best_gain - min_gain_shift; + if (use_onehot) { + output->num_cat_threshold = 1; + output->cat_threshold = + std::vector(1, static_cast(best_threshold + offset)); + } else { + output->num_cat_threshold = best_threshold + 1; + output->cat_threshold = + std::vector(output->num_cat_threshold); + if (best_dir == 1) { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[i] + offset; + output->cat_threshold[i] = t; + } + } else { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[used_bin - 1 - i] + offset; + output->cat_threshold[i] = t; + } + } + } + output->monotone_type = 0; + } +} + +template +void FeatureHistogram::FindBestThresholdCategoricalIntInner(int64_t int_sum_gradient_and_hessian, + const double grad_scale, const double hess_scale, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output) { + is_splittable_ = false; + output->default_left = false; + double best_gain = kMinScore; + PACKED_HIST_ACC_T best_sum_left_gradient_and_hessian = 0; + double gain_shift; + if (USE_MC) { + constraints->InitCumulativeConstraints(true); + } + + PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian = + HIST_BITS_ACC == 16 ? + ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : + static_cast(int_sum_gradient_and_hessian); + + // recover sum of gradient and hessian from the sum of quantized gradient and hessian + double sum_gradient = static_cast(static_cast(int_sum_gradient_and_hessian >> 32)) * grad_scale; + double sum_hessian = static_cast(static_cast(int_sum_gradient_and_hessian & 0x00000000ffffffff)) * hess_scale; + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated + // with the larger categorical l2, whereas min_split_gain uses the original l2. + gain_shift = GetLeafGain(sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, + num_data, 0); + } + + double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + const int8_t offset = meta_->offset; + const int bin_start = 1 - offset; + const int bin_end = meta_->num_bin - offset; + int used_bin = -1; + + std::vector sorted_idx; + double l2 = meta_->config->lambda_l2; + bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; + int best_threshold = -1; + int best_dir = 1; + const double cnt_factor = static_cast(num_data) / + static_cast(static_cast(int_sum_gradient_and_hessian & 0x00000000ffffffff)); + int rand_threshold = 0; + + const PACKED_HIST_BIN_T* data_ptr = nullptr; + if (HIST_BITS_BIN == 16) { + data_ptr = reinterpret_cast(data_int16_); + } else { + data_ptr = reinterpret_cast(data_); + } + + if (use_onehot) { + if (USE_RAND) { + if (bin_end - bin_start > 0) { + rand_threshold = meta_->rand.NextInt(bin_start, bin_end); + } + } + for (int t = bin_start; t < bin_end; ++t) { + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t]; + const uint32_t int_hess = HIST_BITS_BIN == 16 ? 
+ static_cast(grad_and_hess & 0x0000ffff) : + static_cast(grad_and_hess & 0x00000000ffffffff); + data_size_t cnt = + static_cast(Common::RoundInt(int_hess * cnt_factor)); + const double hess = int_hess * hess_scale; + // if data not enough, or sum hessian too small + if (cnt < meta_->config->min_data_in_leaf || + hess < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t other_count = num_data - cnt; + // if data not enough + if (other_count < meta_->config->min_data_in_leaf) { + continue; + } + + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_ACC != HIST_BITS_BIN ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + grad_and_hess; + const PACKED_HIST_ACC_T sum_other_grad_and_hess = local_int_sum_gradient_and_hessian - grad_and_hess_acc; + const uint32_t sum_other_hess_int = HIST_BITS_ACC == 16 ? + static_cast(sum_other_grad_and_hess & 0x0000ffff) : + static_cast(sum_other_grad_and_hess & 0x00000000ffffffff); + double sum_other_hessian = sum_other_hess_int * hess_scale; + // if sum hessian too small + if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + + const int32_t int_grad = HIST_BITS_ACC == 16 ? + static_cast(static_cast(grad_and_hess_acc >> 16)) : + static_cast(static_cast(grad_and_hess_acc) >> 32); + const double grad = int_grad * grad_scale; + + const int32_t sum_other_grad_int = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_other_grad_and_hess >> 16)) : + static_cast(static_cast(sum_other_grad_and_hess) >> 32); + const double sum_other_gradient = sum_other_grad_int * grad_scale; + + if (USE_RAND) { + if (t != rand_threshold) { + continue; + } + } + // current split gain + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, hess, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + best_threshold = t; + best_sum_left_gradient_and_hessian = grad_and_hess_acc; + best_gain = current_gain; + } + } + } else { + for (int i = bin_start; i < bin_end; ++i) { + const PACKED_HIST_BIN_T int_grad_and_hess = data_ptr[i]; + const uint32_t int_hess = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess & 0x0000ffff) : + static_cast(int_grad_and_hess & 0x00000000ffffffff); + const int cnt = Common::RoundInt(int_hess * cnt_factor); + if (cnt >= meta_->config->cat_smooth) { + sorted_idx.push_back(i); + } + } + used_bin = static_cast(sorted_idx.size()); + + l2 += meta_->config->cat_l2; + + auto ctr_fun = [this](double sum_grad, double sum_hess) { + return (sum_grad) / (sum_hess + meta_->config->cat_smooth); + }; + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), [data_ptr, &ctr_fun, grad_scale, hess_scale](int i, int j) { + const PACKED_HIST_BIN_T int_grad_and_hess_i = data_ptr[i]; + const PACKED_HIST_BIN_T int_grad_and_hess_j = data_ptr[j]; + const int32_t int_grad_i = HIST_BITS_BIN == 16 ? + static_cast(static_cast(int_grad_and_hess_i >> 16)) : + static_cast(static_cast(int_grad_and_hess_i) >> 32); + const uint32_t int_hess_i = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess_i & 0x0000ffff) : + static_cast(int_grad_and_hess_i & 0x00000000ffffffff); + const int32_t int_grad_j = HIST_BITS_BIN == 16 ? 
+ static_cast(static_cast(int_grad_and_hess_j >> 16)) : + static_cast(static_cast(int_grad_and_hess_j) >> 32); + const uint32_t int_hess_j = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess_j & 0x0000ffff) : + static_cast(int_grad_and_hess_j & 0x00000000ffffffff); + + const double grad_i = int_grad_i * grad_scale; + const double hess_i = int_hess_i * hess_scale; + const double grad_j = int_grad_j * grad_scale; + const double hess_j = int_hess_j * hess_scale; + + return ctr_fun(grad_i, hess_i) < ctr_fun(grad_j, hess_j); + }); + + std::vector find_direction(1, 1); + std::vector start_position(1, 0); + find_direction.push_back(-1); + start_position.push_back(used_bin - 1); + const int max_num_cat = + std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2); + int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0); + if (USE_RAND) { + if (max_threshold > 0) { + rand_threshold = meta_->rand.NextInt(0, max_threshold); + } + } + + is_splittable_ = false; + for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) { + auto dir = find_direction[out_i]; + auto start_pos = start_position[out_i]; + data_size_t min_data_per_group = meta_->config->min_data_per_group; + data_size_t cnt_cur_group = 0; + PACKED_HIST_ACC_T int_sum_left_gradient_and_hessian = 0; + data_size_t left_count = 0; + for (int i = 0; i < used_bin && i < max_num_cat; ++i) { + auto t = sorted_idx[start_pos]; + start_pos += dir; + PACKED_HIST_BIN_T int_grad_and_hess = data_ptr[t]; + + uint32_t int_hess = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess & 0x0000ffff) : + static_cast(int_grad_and_hess & 0x00000000ffffffff); + data_size_t cnt = + static_cast(Common::RoundInt(int_hess * cnt_factor)); + + if (HIST_BITS_ACC != HIST_BITS_BIN) { + PACKED_HIST_ACC_T int_grad_and_hess_acc = + (static_cast(static_cast(int_grad_and_hess & 0xffff0000)) << 32) | + (static_cast(int_grad_and_hess & 0x0000ffff)); + int_sum_left_gradient_and_hessian += int_grad_and_hess_acc; + } else { + int_sum_left_gradient_and_hessian += int_grad_and_hess; + } + + left_count += cnt; + cnt_cur_group += cnt; + + const uint32_t int_left_sum_hessian = HIST_BITS_ACC == 16 ? + static_cast(int_sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(int_sum_left_gradient_and_hessian & 0x00000000ffffffff); + const double sum_left_hessian = int_left_sum_hessian * hess_scale; + + if (left_count < meta_->config->min_data_in_leaf || + sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + if (right_count < meta_->config->min_data_in_leaf || + right_count < min_data_per_group) { + break; + } + + const PACKED_HIST_ACC_T int_sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - int_sum_left_gradient_and_hessian; + const uint32_t int_right_sum_hessian = HIST_BITS_ACC == 16 ? + static_cast(int_sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(int_sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double sum_right_hessian = int_right_sum_hessian * hess_scale; + + if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + if (cnt_cur_group < min_data_per_group) { + continue; + } + + cnt_cur_group = 0; + + const int32_t int_sum_left_gradient = HIST_BITS_ACC == 16 ? 
+ static_cast(static_cast(int_sum_left_gradient_and_hessian >> 16)) : + static_cast(static_cast(int_sum_left_gradient_and_hessian) >> 32); + const double sum_left_gradient = int_sum_left_gradient * grad_scale; + + const int32_t int_sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(int_sum_right_gradient_and_hessian >> 16)) : + static_cast(static_cast(int_sum_right_gradient_and_hessian) >> 32); + const double sum_right_gradient = int_sum_right_gradient * grad_scale; + + if (USE_RAND) { + if (i != rand_threshold) { + continue; + } + } + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, + left_count, right_count, parent_output); + if (current_gain <= min_gain_shift) { + continue; + } + is_splittable_ = true; + if (current_gain > best_gain) { + best_sum_left_gradient_and_hessian = int_sum_left_gradient_and_hessian; + best_threshold = i; + best_gain = current_gain; + best_dir = dir; + } + } + } + } + + if (is_splittable_) { + const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) : + static_cast(static_cast(best_sum_left_gradient_and_hessian) >> 32); + const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(best_sum_left_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_left_gradient = int_best_sum_left_gradient * grad_scale; + const double best_sum_left_hessian = int_best_sum_left_hessian * hess_scale; + + const PACKED_HIST_ACC_T best_sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian; + const int32_t int_best_sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(best_sum_right_gradient_and_hessian >> 16)) : + static_cast(static_cast(best_sum_right_gradient_and_hessian) >> 32); + const uint32_t int_best_sum_right_hessian = HIST_BITS_ACC == 16 ? + static_cast(best_sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(best_sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_right_gradient = int_best_sum_right_gradient * grad_scale; + const double best_sum_right_hessian = int_best_sum_right_hessian * hess_scale; + + const data_size_t best_left_count = Common::RoundInt(static_cast(int_best_sum_left_hessian) * cnt_factor); + const data_size_t best_right_count = Common::RoundInt(static_cast(int_best_sum_right_hessian) * cnt_factor); + + const int64_t best_sum_left_gradient_and_hessian_int64 = HIST_BITS_ACC == 16 ? 
+ ((static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) << 32) | + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff)) : + best_sum_left_gradient_and_hessian; + const int64_t best_sum_right_gradient_and_hessian_int64 = int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian_int64; + + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output); + output->left_count = best_left_count; + output->left_sum_gradient = best_sum_left_gradient; + output->left_sum_hessian = best_sum_left_hessian; + output->right_output = CalculateSplittedLeafOutput( + best_sum_right_gradient, + best_sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth, + best_right_count, parent_output); + output->right_count = best_right_count; + output->right_sum_gradient = best_sum_right_gradient; + output->right_sum_hessian = best_sum_right_hessian; + output->gain = best_gain - min_gain_shift; + + output->left_sum_gradient_and_hessian = best_sum_left_gradient_and_hessian_int64; + output->right_sum_gradient_and_hessian = best_sum_right_gradient_and_hessian_int64; + if (use_onehot) { + output->num_cat_threshold = 1; + output->cat_threshold = + std::vector(1, static_cast(best_threshold + offset)); + } else { + output->num_cat_threshold = best_threshold + 1; + output->cat_threshold = + std::vector(output->num_cat_threshold); + if (best_dir == 1) { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[i] + offset; + output->cat_threshold[i] = t; + } + } else { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[used_bin - 1 - i] + offset; + output->cat_threshold[i] = t; + } + } + } + output->monotone_type = 0; + } +} + +} // namespace LightGBM diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index bd5bda1e8879..70dd0fb5436f 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -117,7 +117,7 @@ class FeatureHistogram { (static_cast(static_cast(other_grad_hess >> 16)) << 32) | (static_cast(other_grad_hess & 0x0000ffff)); const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64; - result_int_data[i] = result_grad_hess; + result_int_data[i] = static_cast(result_grad_hess); } } else if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 16) { for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { @@ -446,63 +446,13 @@ class FeatureHistogram { } } - void FuncForCategorical() { - if (meta_->config->extra_trees) { - if (meta_->config->monotone_constraints.empty()) { - FuncForCategoricalL1(); - } else { - FuncForCategoricalL1(); - } - } else { - if (meta_->config->monotone_constraints.empty()) { - FuncForCategoricalL1(); - } else { - FuncForCategoricalL1(); - } - } - } + void FuncForCategorical(); template - void FuncForCategoricalL1() { - if (meta_->config->path_smooth > kEpsilon) { - FuncForCategoricalL2(); - } else { - FuncForCategoricalL2(); - } - } + void FuncForCategoricalL1(); template - void FuncForCategoricalL2() { -#define ARGUMENTS \ - std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ - std::placeholders::_4, std::placeholders::_5, std::placeholders::_6 - if (meta_->config->lambda_l1 > 0) { - if 
(meta_->config->max_delta_step > 0) { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, true, USE_SMOOTHING>, - this, ARGUMENTS); - } else { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, false, USE_SMOOTHING>, - this, ARGUMENTS); - } - } else { - if (meta_->config->max_delta_step > 0) { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, true, USE_SMOOTHING>, - this, ARGUMENTS); - } else { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, false, USE_SMOOTHING>, - this, ARGUMENTS); - } - } -#undef ARGUMENTS - } + void FuncForCategoricalL2(); template void FindBestThresholdCategoricalInner(double sum_gradient, @@ -510,240 +460,16 @@ class FeatureHistogram { data_size_t num_data, const FeatureConstraint* constraints, double parent_output, - SplitInfo* output) { - is_splittable_ = false; - output->default_left = false; - double best_gain = kMinScore; - data_size_t best_left_count = 0; - double best_sum_left_gradient = 0; - double best_sum_left_hessian = 0; - double gain_shift; - if (USE_MC) { - constraints->InitCumulativeConstraints(true); - } - if (USE_SMOOTHING) { - gain_shift = GetLeafGainGivenOutput( - sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); - } else { - // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated - // with the larger categorical l2, whereas min_split_gain uses the original l2. - gain_shift = GetLeafGain(sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, - num_data, 0); - } - - double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - const int8_t offset = meta_->offset; - const int bin_start = 1 - offset; - const int bin_end = meta_->num_bin - offset; - int used_bin = -1; - - std::vector sorted_idx; - double l2 = meta_->config->lambda_l2; - bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; - int best_threshold = -1; - int best_dir = 1; - const double cnt_factor = num_data / sum_hessian; - int rand_threshold = 0; - if (use_onehot) { - if (USE_RAND) { - if (bin_end - bin_start > 0) { - rand_threshold = meta_->rand.NextInt(bin_start, bin_end); - } - } - for (int t = bin_start; t < bin_end; ++t) { - const auto grad = GET_GRAD(data_, t); - const auto hess = GET_HESS(data_, t); - data_size_t cnt = - static_cast(Common::RoundInt(hess * cnt_factor)); - // if data not enough, or sum hessian too small - if (cnt < meta_->config->min_data_in_leaf || - hess < meta_->config->min_sum_hessian_in_leaf) { - continue; - } - data_size_t other_count = num_data - cnt; - // if data not enough - if (other_count < meta_->config->min_data_in_leaf) { - continue; - } - - double sum_other_hessian = sum_hessian - hess - kEpsilon; - // if sum hessian too small - if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) { - continue; - } - - double sum_other_gradient = sum_gradient - grad; - if (USE_RAND) { - if (t != rand_threshold) { - continue; - } - } - // current split gain - double current_gain = GetSplitGains( - sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints, 0, meta_->config->path_smooth, other_count, cnt, 
parent_output); - // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - continue; - } - - // mark as able to be split - is_splittable_ = true; - // better split point - if (current_gain > best_gain) { - best_threshold = t; - best_sum_left_gradient = grad; - best_sum_left_hessian = hess + kEpsilon; - best_left_count = cnt; - best_gain = current_gain; - } - } - } else { - for (int i = bin_start; i < bin_end; ++i) { - if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= - meta_->config->cat_smooth) { - sorted_idx.push_back(i); - } - } - used_bin = static_cast(sorted_idx.size()); - - l2 += meta_->config->cat_l2; - - auto ctr_fun = [this](double sum_grad, double sum_hess) { - return (sum_grad) / (sum_hess + meta_->config->cat_smooth); - }; - std::stable_sort( - sorted_idx.begin(), sorted_idx.end(), [this, &ctr_fun](int i, int j) { - return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < - ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); - }); - - std::vector find_direction(1, 1); - std::vector start_position(1, 0); - find_direction.push_back(-1); - start_position.push_back(used_bin - 1); - const int max_num_cat = - std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2); - int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0); - if (USE_RAND) { - if (max_threshold > 0) { - rand_threshold = meta_->rand.NextInt(0, max_threshold); - } - } - - is_splittable_ = false; - for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) { - auto dir = find_direction[out_i]; - auto start_pos = start_position[out_i]; - data_size_t min_data_per_group = meta_->config->min_data_per_group; - data_size_t cnt_cur_group = 0; - double sum_left_gradient = 0.0f; - double sum_left_hessian = kEpsilon; - data_size_t left_count = 0; - for (int i = 0; i < used_bin && i < max_num_cat; ++i) { - auto t = sorted_idx[start_pos]; - start_pos += dir; - const auto grad = GET_GRAD(data_, t); - const auto hess = GET_HESS(data_, t); - data_size_t cnt = - static_cast(Common::RoundInt(hess * cnt_factor)); - - sum_left_gradient += grad; - sum_left_hessian += hess; - left_count += cnt; - cnt_cur_group += cnt; - - if (left_count < meta_->config->min_data_in_leaf || - sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { - continue; - } - data_size_t right_count = num_data - left_count; - if (right_count < meta_->config->min_data_in_leaf || - right_count < min_data_per_group) { - break; - } - - double sum_right_hessian = sum_hessian - sum_left_hessian; - if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { - break; - } - - if (cnt_cur_group < min_data_per_group) { - continue; - } - - cnt_cur_group = 0; + SplitInfo* output); - double sum_right_gradient = sum_gradient - sum_left_gradient; - if (USE_RAND) { - if (i != rand_threshold) { - continue; - } - } - double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, - left_count, right_count, parent_output); - if (current_gain <= min_gain_shift) { - continue; - } - is_splittable_ = true; - if (current_gain > best_gain) { - best_left_count = left_count; - best_sum_left_gradient = sum_left_gradient; - best_sum_left_hessian = sum_left_hessian; - best_threshold = i; - best_gain = current_gain; - best_dir = dir; - } - } - } - } - - if (is_splittable_) { - output->left_output = CalculateSplittedLeafOutput( - best_sum_left_gradient, best_sum_left_hessian, 
- meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output); - output->left_count = best_left_count; - output->left_sum_gradient = best_sum_left_gradient; - output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth, - num_data - best_left_count, parent_output); - output->right_count = num_data - best_left_count; - output->right_sum_gradient = sum_gradient - best_sum_left_gradient; - output->right_sum_hessian = - sum_hessian - best_sum_left_hessian - kEpsilon; - output->gain = best_gain - min_gain_shift; - if (use_onehot) { - output->num_cat_threshold = 1; - output->cat_threshold = - std::vector(1, static_cast(best_threshold + offset)); - } else { - output->num_cat_threshold = best_threshold + 1; - output->cat_threshold = - std::vector(output->num_cat_threshold); - if (best_dir == 1) { - for (int i = 0; i < output->num_cat_threshold; ++i) { - auto t = sorted_idx[i] + offset; - output->cat_threshold[i] = t; - } - } else { - for (int i = 0; i < output->num_cat_threshold; ++i) { - auto t = sorted_idx[used_bin - 1 - i] + offset; - output->cat_threshold[i] = t; - } - } - } - output->monotone_type = 0; - } - } + template + void FindBestThresholdCategoricalIntInner(int64_t int_sum_gradient_and_hessian, + const double grad_scale, const double hess_scale, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output); void GatherInfoForThreshold(double sum_gradient, double sum_hessian, uint32_t threshold, data_size_t num_data, @@ -1344,7 +1070,7 @@ class FeatureHistogram { PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian = HIST_BITS_ACC == 16 ? ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : - int_sum_gradient_and_hessian; + static_cast(int_sum_gradient_and_hessian); double best_gain = kMinScore; uint32_t best_threshold = static_cast(meta_->num_bin); const double cnt_factor = static_cast(num_data) / @@ -1418,10 +1144,10 @@ class FeatureHistogram { double sum_right_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_right_gradient_and_hessian) >> 32)) * grad_scale; double sum_left_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_left_gradient_and_hessian) >> 32)) * grad_scale; if (USE_RAND) { if (t - 1 + offset != rand_threshold) { continue; @@ -1535,10 +1261,10 @@ class FeatureHistogram { double sum_right_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_right_gradient_and_hessian) >> 32)) * grad_scale; double sum_left_gradient = HIST_BITS_ACC == 16 ? 
static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_left_gradient_and_hessian) >> 32)) * grad_scale; if (USE_RAND) { if (t + offset != rand_threshold) { continue; @@ -1578,7 +1304,7 @@ class FeatureHistogram { if (is_splittable_ && best_gain > output->gain + min_gain_shift) { const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) : - static_cast(best_sum_left_gradient_and_hessian >> 32); + static_cast(static_cast(best_sum_left_gradient_and_hessian) >> 32); const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ? static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff) : static_cast(best_sum_left_gradient_and_hessian & 0x00000000ffffffff); diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 666e09217678..bc41c75a7a8c 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -6,6 +6,8 @@ #ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ #define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ +#include + #include #include #include diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index d5c5cc59ef3a..f3a88bd18679 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -315,8 +315,8 @@ void SerialTreeLearner::BeforeTrain() { smaller_leaf_splits_->Init( 0, data_partition_.get(), gradient_discretizer_->discretized_gradients_and_hessians(), - gradient_discretizer_->grad_scale(), - gradient_discretizer_->hess_scale()); + static_cast(gradient_discretizer_->grad_scale()), + static_cast(gradient_discretizer_->hess_scale())); } } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index ccde38977d2d..3fad36b34407 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -275,7 +275,20 @@ def test_missing_value_handle_none(): assert evals_result["valid_0"]["auc"][-1] == pytest.approx(ret) -def test_categorical_handle(): +@pytest.mark.parametrize( + "use_quantized_grad", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + getenv("TASK", "") == "cuda", + reason="Skip because quantized training with categorical features is not supported for cuda version", + ), + ), + False, + ], +) +def test_categorical_handle(use_quantized_grad): x = [0, 1, 2, 3, 4, 5, 6, 7] y = [0, 1, 0, 1, 0, 1, 0, 1] @@ -299,6 +312,7 @@ def test_categorical_handle(): "max_cat_to_onehot": 1, "zero_as_missing": True, "categorical_column": 0, + "use_quantized_grad": use_quantized_grad, } evals_result = {} gbm = lgb.train( @@ -311,7 +325,20 @@ def test_categorical_handle(): assert evals_result["valid_0"]["auc"][-1] == pytest.approx(ret) -def test_categorical_handle_na(): +@pytest.mark.parametrize( + "use_quantized_grad", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + getenv("TASK", "") == "cuda", + reason="Skip because quantized training with categorical features is not supported for cuda version", + ), + ), + False, + ], +) +def test_categorical_handle_na(use_quantized_grad): x = [0, np.nan, 0, np.nan, 0, np.nan] y = [0, 1, 0, 1, 0, 1] @@ -335,6 +362,7 @@ def test_categorical_handle_na(): "max_cat_to_onehot": 1, "zero_as_missing": False, "categorical_column": 0, + "use_quantized_grad": use_quantized_grad, } 
evals_result = {} gbm = lgb.train( @@ -347,7 +375,20 @@ def test_categorical_handle_na(): assert evals_result["valid_0"]["auc"][-1] == pytest.approx(ret) -def test_categorical_non_zero_inputs(): +@pytest.mark.parametrize( + "use_quantized_grad", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + getenv("TASK", "") == "cuda", + reason="Skip because quantized training with categorical features is not supported for cuda version", + ), + ), + False, + ], +) +def test_categorical_non_zero_inputs(use_quantized_grad): x = [1, 1, 1, 1, 1, 1, 2, 2] y = [1, 1, 1, 1, 1, 1, 0, 0] @@ -371,6 +412,7 @@ def test_categorical_non_zero_inputs(): "max_cat_to_onehot": 1, "zero_as_missing": False, "categorical_column": 0, + "use_quantized_grad": use_quantized_grad, } evals_result = {} gbm = lgb.train( diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index b7848b4cab5a..009c744964d1 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -337,6 +337,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 7010926799b7..00cb875b1218 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -284,6 +284,9 @@ src\treelearner + + src\treelearner + src\treelearner
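
Notes on the quantized histogram layout used above: in the quantized-training branches, each histogram entry packs a signed gradient sum into the high half of an integer and an unsigned hessian sum into the low half, which is why `FindBestThresholdCategoricalIntInner` keeps shifting and masking (`>> 32` with `& 0x00000000ffffffff` for 32-bit accumulation, `>> 16` with `& 0x0000ffff` for 16-bit). The sketch below is illustrative only; `PackGradHess`, the scale values, and the totals are assumptions, not anything from the library. It shows the 64-bit case: real-valued sums come back by multiplying with `grad_scale` / `hess_scale`, and the per-bin data count is recovered from the quantized hessian through a `cnt_factor` ratio, matching the pattern in the patch.

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Hypothetical helper: pack a signed quantized gradient (high 32 bits) and an
// unsigned quantized hessian (low 32 bits) into one 64-bit histogram entry.
int64_t PackGradHess(int32_t int_grad, uint32_t int_hess) {
  return (static_cast<int64_t>(int_grad) << 32) | static_cast<int64_t>(int_hess);
}

int main() {
  // Assumed quantization scales and totals, for illustration only.
  const double grad_scale = 0.01, hess_scale = 0.005;
  const int num_data = 1000;
  const uint32_t int_sum_hessian = 4000;  // quantized hessian total over all bins

  const int64_t entry = PackGradHess(-1200, 800);  // one categorical bin

  // Gradient: arithmetic shift keeps the sign; hessian: mask the low half.
  const int32_t int_grad = static_cast<int32_t>(entry >> 32);
  const uint32_t int_hess = static_cast<uint32_t>(entry & 0x00000000ffffffff);

  const double sum_gradient = int_grad * grad_scale;  // -12.0
  const double sum_hessian = int_hess * hess_scale;   //   4.0

  // Count recovery: scale the quantized hessian by (num_data / total hessian).
  const double cnt_factor = static_cast<double>(num_data) / int_sum_hessian;
  const int cnt = static_cast<int>(std::lround(int_hess * cnt_factor));  // 200

  std::cout << sum_gradient << " " << sum_hessian << " " << cnt << "\n";
}
```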
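
The non-one-hot categorical search in both `FindBestThresholdCategoricalInner` and its quantized counterpart follows the same outline: drop categories with too little data, order the rest by the smoothed ratio sum_gradient / (sum_hessian + cat_smooth), then evaluate prefix splits of that ordering (the real code scans from both ends and applies min-data, min-hessian, and regularization checks). Below is a minimal sketch of the ordering-and-prefix-scan idea only, with made-up per-category statistics and a simplified unregularized gain of G_L^2/H_L + G_R^2/H_R, scanning just the forward direction; it is not the library's implementation.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Per-category sufficient statistics (illustrative, not the library's types).
struct CatStats {
  int cat;
  double sum_grad;
  double sum_hess;
};

int main() {
  // Assumed toy statistics for four categories.
  std::vector<CatStats> bins = {
      {0, -3.0, 2.0}, {1, 1.5, 1.0}, {2, 4.0, 3.0}, {3, -0.5, 1.5}};
  const double cat_smooth = 1.0;

  double total_grad = 0.0, total_hess = 0.0;
  for (const auto& b : bins) {
    total_grad += b.sum_grad;
    total_hess += b.sum_hess;
  }

  // Order categories by the smoothed gradient/hessian ratio, as the ctr_fun
  // lambda in the patch does.
  std::stable_sort(bins.begin(), bins.end(),
                   [&](const CatStats& a, const CatStats& b) {
                     return a.sum_grad / (a.sum_hess + cat_smooth) <
                            b.sum_grad / (b.sum_hess + cat_smooth);
                   });

  // Scan prefix splits of the sorted order; simplified gain with no
  // regularization: G_L^2 / H_L + G_R^2 / H_R (larger is better).
  double best_gain = -1.0;
  std::size_t best_prefix = 0;
  double left_grad = 0.0, left_hess = 0.0;
  for (std::size_t i = 0; i + 1 < bins.size(); ++i) {
    left_grad += bins[i].sum_grad;
    left_hess += bins[i].sum_hess;
    const double right_grad = total_grad - left_grad;
    const double right_hess = total_hess - left_hess;
    const double gain = left_grad * left_grad / left_hess +
                        right_grad * right_grad / right_hess;
    if (gain > best_gain) {
      best_gain = gain;
      best_prefix = i + 1;  // categories bins[0..i] go to the left child
    }
  }

  std::cout << "best prefix size: " << best_prefix
            << ", gain: " << best_gain << "\n";
}
```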