diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index c04263f62c1c..ec2c067b64b8 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -46,6 +46,7 @@ OBJECTS = \ network/linkers_socket.o \ network/network.o \ treelearner/data_parallel_tree_learner.o \ + treelearner/feature_histogram.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ treelearner/gradient_discretizer.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 86d56fecdf34..ebcb40d1372a 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -47,6 +47,7 @@ OBJECTS = \ network/linkers_socket.o \ network/network.o \ treelearner/data_parallel_tree_learner.o \ + treelearner/feature_histogram.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ treelearner/gradient_discretizer.o \ diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 4f2ec456c937..ec7581e504c4 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -62,15 +62,17 @@ void MultiValBinWrapper::HistMove(const std::vector(origin_hist_data_) + hist_move_dest_[i] / 2); } } else if (HIST_BITS == 16) { - const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - - static_cast(num_bin_aligned_); if (is_use_subcol_) { + const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - + static_cast(num_bin_aligned_); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); } } else { + CHECK_EQ(INNER_HIST_BITS, 8); + const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2; int32_t* orig_ptr = reinterpret_cast(origin_hist_data_); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < num_bin_; ++i) { @@ -148,7 +150,7 @@ void MultiValBinWrapper::HistMerge(std::vector(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + int32_t* dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2; std::memset(reinterpret_cast(dst), 0, num_bin_ * kInt16HistBufferEntrySize); #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) for (int t = 0; t < n_bin_block; ++t) { diff --git a/src/treelearner/feature_histogram.cpp b/src/treelearner/feature_histogram.cpp new file mode 100644 index 000000000000..29d99b92010b --- /dev/null +++ b/src/treelearner/feature_histogram.cpp @@ -0,0 +1,739 @@ +/*! + * Copyright (c) 2024 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
+ */ + +#include "feature_histogram.hpp" + +namespace LightGBM { + +void FeatureHistogram::FuncForCategorical() { + if (meta_->config->extra_trees) { + if (meta_->config->monotone_constraints.empty()) { + FuncForCategoricalL1(); + } else { + FuncForCategoricalL1(); + } + } else { + if (meta_->config->monotone_constraints.empty()) { + FuncForCategoricalL1(); + } else { + FuncForCategoricalL1(); + } + } +} + +template +void FeatureHistogram::FuncForCategoricalL1() { + if (meta_->config->path_smooth > kEpsilon) { + FuncForCategoricalL2(); + } else { + FuncForCategoricalL2(); + } +} + +template +void FeatureHistogram::FuncForCategoricalL2() { + if (meta_->config->use_quantized_grad) { +#define LAMBDA_PARAMS_INT \ + int64_t int_sum_gradient_and_hessian, \ + const double grad_scale, const double hess_scale, \ + const uint8_t hist_bits_bin, const uint8_t hist_bits_acc, \ + data_size_t num_data, \ + const FeatureConstraint* constraints, \ + double parent_output, \ + SplitInfo* output + +#define ARGUMENTS_INT \ + int_sum_gradient_and_hessian, grad_scale, hess_scale, num_data, constraints, parent_output, output + + if (meta_->config->lambda_l1 > 0) { + if (meta_->config->max_delta_step > 0) { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } + } else { + if (meta_->config->max_delta_step > 0) { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=] (LAMBDA_PARAMS_INT) { + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + if (hist_bits_bin <= 16) { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } else { + FindBestThresholdCategoricalIntInner(ARGUMENTS_INT); + } + } + }; + } + } +#undef LAMBDA_ARGUMENTS_INT +#undef ARGUMENTS_INT + } else { +#define ARGUMENTS \ + std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ + std::placeholders::_4, std::placeholders::_5, std::placeholders::_6 + if (meta_->config->lambda_l1 > 0) { + if (meta_->config->max_delta_step > 0) { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, true, true, USE_SMOOTHING>, + this, ARGUMENTS); + } else { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, true, false, USE_SMOOTHING>, + this, ARGUMENTS); + } + } else { + if (meta_->config->max_delta_step > 0) { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, false, true, USE_SMOOTHING>, + this, ARGUMENTS); + } 
else { + find_best_threshold_fun_ = + std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< + USE_RAND, USE_MC, false, false, USE_SMOOTHING>, + this, ARGUMENTS); + } + } +#undef ARGUMENTS + } +} + +template +void FeatureHistogram::FindBestThresholdCategoricalInner(double sum_gradient, + double sum_hessian, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output) { + is_splittable_ = false; + output->default_left = false; + double best_gain = kMinScore; + data_size_t best_left_count = 0; + double best_sum_left_gradient = 0; + double best_sum_left_hessian = 0; + double gain_shift; + if (USE_MC) { + constraints->InitCumulativeConstraints(true); + } + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated + // with the larger categorical l2, whereas min_split_gain uses the original l2. + gain_shift = GetLeafGain(sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, + num_data, 0); + } + + double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + const int8_t offset = meta_->offset; + const int bin_start = 1 - offset; + const int bin_end = meta_->num_bin - offset; + int used_bin = -1; + + std::vector sorted_idx; + double l2 = meta_->config->lambda_l2; + bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; + int best_threshold = -1; + int best_dir = 1; + const double cnt_factor = num_data / sum_hessian; + int rand_threshold = 0; + if (use_onehot) { + if (USE_RAND) { + if (bin_end - bin_start > 0) { + rand_threshold = meta_->rand.NextInt(bin_start, bin_end); + } + } + for (int t = bin_start; t < bin_end; ++t) { + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = + static_cast(Common::RoundInt(hess * cnt_factor)); + // if data not enough, or sum hessian too small + if (cnt < meta_->config->min_data_in_leaf || + hess < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t other_count = num_data - cnt; + // if data not enough + if (other_count < meta_->config->min_data_in_leaf) { + continue; + } + + double sum_other_hessian = sum_hessian - hess - kEpsilon; + // if sum hessian too small + if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + + double sum_other_gradient = sum_gradient - grad; + if (USE_RAND) { + if (t != rand_threshold) { + continue; + } + } + // current split gain + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + best_threshold = t; + best_sum_left_gradient = grad; + best_sum_left_hessian = hess + kEpsilon; + best_left_count = cnt; + best_gain = current_gain; + } + } + } else { + for (int i = bin_start; i < bin_end; ++i) { + if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= + meta_->config->cat_smooth) { + sorted_idx.push_back(i); + } + } + used_bin = static_cast(sorted_idx.size()); + + l2 
+= meta_->config->cat_l2; + + auto ctr_fun = [this](double sum_grad, double sum_hess) { + return (sum_grad) / (sum_hess + meta_->config->cat_smooth); + }; + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), [this, &ctr_fun](int i, int j) { + return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < + ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); + }); + + std::vector find_direction(1, 1); + std::vector start_position(1, 0); + find_direction.push_back(-1); + start_position.push_back(used_bin - 1); + const int max_num_cat = + std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2); + int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0); + if (USE_RAND) { + if (max_threshold > 0) { + rand_threshold = meta_->rand.NextInt(0, max_threshold); + } + } + + is_splittable_ = false; + for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) { + auto dir = find_direction[out_i]; + auto start_pos = start_position[out_i]; + data_size_t min_data_per_group = meta_->config->min_data_per_group; + data_size_t cnt_cur_group = 0; + double sum_left_gradient = 0.0f; + double sum_left_hessian = kEpsilon; + data_size_t left_count = 0; + for (int i = 0; i < used_bin && i < max_num_cat; ++i) { + auto t = sorted_idx[start_pos]; + start_pos += dir; + const auto grad = GET_GRAD(data_, t); + const auto hess = GET_HESS(data_, t); + data_size_t cnt = + static_cast(Common::RoundInt(hess * cnt_factor)); + + sum_left_gradient += grad; + sum_left_hessian += hess; + left_count += cnt; + cnt_cur_group += cnt; + + if (left_count < meta_->config->min_data_in_leaf || + sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + if (right_count < meta_->config->min_data_in_leaf || + right_count < min_data_per_group) { + break; + } + + double sum_right_hessian = sum_hessian - sum_left_hessian; + if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + if (cnt_cur_group < min_data_per_group) { + continue; + } + + cnt_cur_group = 0; + + double sum_right_gradient = sum_gradient - sum_left_gradient; + if (USE_RAND) { + if (i != rand_threshold) { + continue; + } + } + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, + left_count, right_count, parent_output); + if (current_gain <= min_gain_shift) { + continue; + } + is_splittable_ = true; + if (current_gain > best_gain) { + best_left_count = left_count; + best_sum_left_gradient = sum_left_gradient; + best_sum_left_hessian = sum_left_hessian; + best_threshold = i; + best_gain = current_gain; + best_dir = dir; + } + } + } + } + + if (is_splittable_) { + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output); + output->left_count = best_left_count; + output->left_sum_gradient = best_sum_left_gradient; + output->left_sum_hessian = best_sum_left_hessian - kEpsilon; + output->right_output = CalculateSplittedLeafOutput( + sum_gradient - best_sum_left_gradient, + sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth, + num_data - best_left_count, parent_output); + 
output->right_count = num_data - best_left_count; + output->right_sum_gradient = sum_gradient - best_sum_left_gradient; + output->right_sum_hessian = + sum_hessian - best_sum_left_hessian - kEpsilon; + output->gain = best_gain - min_gain_shift; + if (use_onehot) { + output->num_cat_threshold = 1; + output->cat_threshold = + std::vector(1, static_cast(best_threshold + offset)); + } else { + output->num_cat_threshold = best_threshold + 1; + output->cat_threshold = + std::vector(output->num_cat_threshold); + if (best_dir == 1) { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[i] + offset; + output->cat_threshold[i] = t; + } + } else { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[used_bin - 1 - i] + offset; + output->cat_threshold[i] = t; + } + } + } + output->monotone_type = 0; + } +} + +template +void FeatureHistogram::FindBestThresholdCategoricalIntInner(int64_t int_sum_gradient_and_hessian, + const double grad_scale, const double hess_scale, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output) { + is_splittable_ = false; + output->default_left = false; + double best_gain = kMinScore; + PACKED_HIST_ACC_T best_sum_left_gradient_and_hessian = 0; + double gain_shift; + if (USE_MC) { + constraints->InitCumulativeConstraints(true); + } + + PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian = + HIST_BITS_ACC == 16 ? + ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : + static_cast(int_sum_gradient_and_hessian); + + // recover sum of gradient and hessian from the sum of quantized gradient and hessian + double sum_gradient = static_cast(static_cast(int_sum_gradient_and_hessian >> 32)) * grad_scale; + double sum_hessian = static_cast(static_cast(int_sum_gradient_and_hessian & 0x00000000ffffffff)) * hess_scale; + if (USE_SMOOTHING) { + gain_shift = GetLeafGainGivenOutput( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); + } else { + // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated + // with the larger categorical l2, whereas min_split_gain uses the original l2. + gain_shift = GetLeafGain(sum_gradient, sum_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, + num_data, 0); + } + + double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; + const int8_t offset = meta_->offset; + const int bin_start = 1 - offset; + const int bin_end = meta_->num_bin - offset; + int used_bin = -1; + + std::vector sorted_idx; + double l2 = meta_->config->lambda_l2; + bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; + int best_threshold = -1; + int best_dir = 1; + const double cnt_factor = static_cast(num_data) / + static_cast(static_cast(int_sum_gradient_and_hessian & 0x00000000ffffffff)); + int rand_threshold = 0; + + const PACKED_HIST_BIN_T* data_ptr = nullptr; + if (HIST_BITS_BIN == 16) { + data_ptr = reinterpret_cast(data_int16_); + } else { + data_ptr = reinterpret_cast(data_); + } + + if (use_onehot) { + if (USE_RAND) { + if (bin_end - bin_start > 0) { + rand_threshold = meta_->rand.NextInt(bin_start, bin_end); + } + } + for (int t = bin_start; t < bin_end; ++t) { + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t]; + const uint32_t int_hess = HIST_BITS_BIN == 16 ? 
+ static_cast(grad_and_hess & 0x0000ffff) : + static_cast(grad_and_hess & 0x00000000ffffffff); + data_size_t cnt = + static_cast(Common::RoundInt(int_hess * cnt_factor)); + const double hess = int_hess * hess_scale; + // if data not enough, or sum hessian too small + if (cnt < meta_->config->min_data_in_leaf || + hess < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t other_count = num_data - cnt; + // if data not enough + if (other_count < meta_->config->min_data_in_leaf) { + continue; + } + + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_ACC != HIST_BITS_BIN ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + grad_and_hess; + const PACKED_HIST_ACC_T sum_other_grad_and_hess = local_int_sum_gradient_and_hessian - grad_and_hess_acc; + const uint32_t sum_other_hess_int = HIST_BITS_ACC == 16 ? + static_cast(sum_other_grad_and_hess & 0x0000ffff) : + static_cast(sum_other_grad_and_hess & 0x00000000ffffffff); + double sum_other_hessian = sum_other_hess_int * hess_scale; + // if sum hessian too small + if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + + const int32_t int_grad = HIST_BITS_ACC == 16 ? + static_cast(static_cast(grad_and_hess_acc >> 16)) : + static_cast(static_cast(grad_and_hess_acc) >> 32); + const double grad = int_grad * grad_scale; + + const int32_t sum_other_grad_int = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_other_grad_and_hess >> 16)) : + static_cast(static_cast(sum_other_grad_and_hess) >> 32); + const double sum_other_gradient = sum_other_grad_int * grad_scale; + + if (USE_RAND) { + if (t != rand_threshold) { + continue; + } + } + // current split gain + double current_gain = GetSplitGains( + sum_other_gradient, sum_other_hessian, grad, hess, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + best_threshold = t; + best_sum_left_gradient_and_hessian = grad_and_hess_acc; + best_gain = current_gain; + } + } + } else { + for (int i = bin_start; i < bin_end; ++i) { + const PACKED_HIST_BIN_T int_grad_and_hess = data_ptr[i]; + const uint32_t int_hess = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess & 0x0000ffff) : + static_cast(int_grad_and_hess & 0x00000000ffffffff); + const int cnt = Common::RoundInt(int_hess * cnt_factor); + if (cnt >= meta_->config->cat_smooth) { + sorted_idx.push_back(i); + } + } + used_bin = static_cast(sorted_idx.size()); + + l2 += meta_->config->cat_l2; + + auto ctr_fun = [this](double sum_grad, double sum_hess) { + return (sum_grad) / (sum_hess + meta_->config->cat_smooth); + }; + std::stable_sort( + sorted_idx.begin(), sorted_idx.end(), [data_ptr, &ctr_fun, grad_scale, hess_scale](int i, int j) { + const PACKED_HIST_BIN_T int_grad_and_hess_i = data_ptr[i]; + const PACKED_HIST_BIN_T int_grad_and_hess_j = data_ptr[j]; + const int32_t int_grad_i = HIST_BITS_BIN == 16 ? + static_cast(static_cast(int_grad_and_hess_i >> 16)) : + static_cast(static_cast(int_grad_and_hess_i) >> 32); + const uint32_t int_hess_i = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess_i & 0x0000ffff) : + static_cast(int_grad_and_hess_i & 0x00000000ffffffff); + const int32_t int_grad_j = HIST_BITS_BIN == 16 ? 
+ static_cast(static_cast(int_grad_and_hess_j >> 16)) : + static_cast(static_cast(int_grad_and_hess_j) >> 32); + const uint32_t int_hess_j = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess_j & 0x0000ffff) : + static_cast(int_grad_and_hess_j & 0x00000000ffffffff); + + const double grad_i = int_grad_i * grad_scale; + const double hess_i = int_hess_i * hess_scale; + const double grad_j = int_grad_j * grad_scale; + const double hess_j = int_hess_j * hess_scale; + + return ctr_fun(grad_i, hess_i) < ctr_fun(grad_j, hess_j); + }); + + std::vector find_direction(1, 1); + std::vector start_position(1, 0); + find_direction.push_back(-1); + start_position.push_back(used_bin - 1); + const int max_num_cat = + std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2); + int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0); + if (USE_RAND) { + if (max_threshold > 0) { + rand_threshold = meta_->rand.NextInt(0, max_threshold); + } + } + + is_splittable_ = false; + for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) { + auto dir = find_direction[out_i]; + auto start_pos = start_position[out_i]; + data_size_t min_data_per_group = meta_->config->min_data_per_group; + data_size_t cnt_cur_group = 0; + PACKED_HIST_ACC_T int_sum_left_gradient_and_hessian = 0; + data_size_t left_count = 0; + for (int i = 0; i < used_bin && i < max_num_cat; ++i) { + auto t = sorted_idx[start_pos]; + start_pos += dir; + PACKED_HIST_BIN_T int_grad_and_hess = data_ptr[t]; + + uint32_t int_hess = HIST_BITS_BIN == 16 ? + static_cast(int_grad_and_hess & 0x0000ffff) : + static_cast(int_grad_and_hess & 0x00000000ffffffff); + data_size_t cnt = + static_cast(Common::RoundInt(int_hess * cnt_factor)); + + if (HIST_BITS_ACC != HIST_BITS_BIN) { + PACKED_HIST_ACC_T int_grad_and_hess_acc = + (static_cast(static_cast(int_grad_and_hess & 0xffff0000)) << 32) | + (static_cast(int_grad_and_hess & 0x0000ffff)); + int_sum_left_gradient_and_hessian += int_grad_and_hess_acc; + } else { + int_sum_left_gradient_and_hessian += int_grad_and_hess; + } + + left_count += cnt; + cnt_cur_group += cnt; + + const uint32_t int_left_sum_hessian = HIST_BITS_ACC == 16 ? + static_cast(int_sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(int_sum_left_gradient_and_hessian & 0x00000000ffffffff); + const double sum_left_hessian = int_left_sum_hessian * hess_scale; + + if (left_count < meta_->config->min_data_in_leaf || + sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + if (right_count < meta_->config->min_data_in_leaf || + right_count < min_data_per_group) { + break; + } + + const PACKED_HIST_ACC_T int_sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - int_sum_left_gradient_and_hessian; + const uint32_t int_right_sum_hessian = HIST_BITS_ACC == 16 ? + static_cast(int_sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(int_sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double sum_right_hessian = int_right_sum_hessian * hess_scale; + + if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + if (cnt_cur_group < min_data_per_group) { + continue; + } + + cnt_cur_group = 0; + + const int32_t int_sum_left_gradient = HIST_BITS_ACC == 16 ? 
+ static_cast(static_cast(int_sum_left_gradient_and_hessian >> 16)) : + static_cast(static_cast(int_sum_left_gradient_and_hessian) >> 32); + const double sum_left_gradient = int_sum_left_gradient * grad_scale; + + const int32_t int_sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(int_sum_right_gradient_and_hessian >> 16)) : + static_cast(static_cast(int_sum_right_gradient_and_hessian) >> 32); + const double sum_right_gradient = int_sum_right_gradient * grad_scale; + + if (USE_RAND) { + if (i != rand_threshold) { + continue; + } + } + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian, sum_right_gradient, + sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, + left_count, right_count, parent_output); + if (current_gain <= min_gain_shift) { + continue; + } + is_splittable_ = true; + if (current_gain > best_gain) { + best_sum_left_gradient_and_hessian = int_sum_left_gradient_and_hessian; + best_threshold = i; + best_gain = current_gain; + best_dir = dir; + } + } + } + } + + if (is_splittable_) { + const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) : + static_cast(static_cast(best_sum_left_gradient_and_hessian) >> 32); + const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(best_sum_left_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_left_gradient = int_best_sum_left_gradient * grad_scale; + const double best_sum_left_hessian = int_best_sum_left_hessian * hess_scale; + + const PACKED_HIST_ACC_T best_sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian; + const int32_t int_best_sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(best_sum_right_gradient_and_hessian >> 16)) : + static_cast(static_cast(best_sum_right_gradient_and_hessian) >> 32); + const uint32_t int_best_sum_right_hessian = HIST_BITS_ACC == 16 ? + static_cast(best_sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(best_sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_right_gradient = int_best_sum_right_gradient * grad_scale; + const double best_sum_right_hessian = int_best_sum_right_hessian * hess_scale; + + const data_size_t best_left_count = Common::RoundInt(static_cast(int_best_sum_left_hessian) * cnt_factor); + const data_size_t best_right_count = Common::RoundInt(static_cast(int_best_sum_right_hessian) * cnt_factor); + + const int64_t best_sum_left_gradient_and_hessian_int64 = HIST_BITS_ACC == 16 ? 
+ ((static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) << 32) | + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff)) : + best_sum_left_gradient_and_hessian; + const int64_t best_sum_right_gradient_and_hessian_int64 = int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian_int64; + + output->left_output = CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, l2, meta_->config->max_delta_step, + constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output); + output->left_count = best_left_count; + output->left_sum_gradient = best_sum_left_gradient; + output->left_sum_hessian = best_sum_left_hessian; + output->right_output = CalculateSplittedLeafOutput( + best_sum_right_gradient, + best_sum_right_hessian, meta_->config->lambda_l1, l2, + meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth, + best_right_count, parent_output); + output->right_count = best_right_count; + output->right_sum_gradient = best_sum_right_gradient; + output->right_sum_hessian = best_sum_right_hessian; + output->gain = best_gain - min_gain_shift; + + output->left_sum_gradient_and_hessian = best_sum_left_gradient_and_hessian_int64; + output->right_sum_gradient_and_hessian = best_sum_right_gradient_and_hessian_int64; + if (use_onehot) { + output->num_cat_threshold = 1; + output->cat_threshold = + std::vector(1, static_cast(best_threshold + offset)); + } else { + output->num_cat_threshold = best_threshold + 1; + output->cat_threshold = + std::vector(output->num_cat_threshold); + if (best_dir == 1) { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[i] + offset; + output->cat_threshold[i] = t; + } + } else { + for (int i = 0; i < output->num_cat_threshold; ++i) { + auto t = sorted_idx[used_bin - 1 - i] + offset; + output->cat_threshold[i] = t; + } + } + } + output->monotone_type = 0; + } +} + +} // namespace LightGBM diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index bd5bda1e8879..70dd0fb5436f 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -117,7 +117,7 @@ class FeatureHistogram { (static_cast(static_cast(other_grad_hess >> 16)) << 32) | (static_cast(other_grad_hess & 0x0000ffff)); const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64; - result_int_data[i] = result_grad_hess; + result_int_data[i] = static_cast(result_grad_hess); } } else if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 16) { for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { @@ -446,63 +446,13 @@ class FeatureHistogram { } } - void FuncForCategorical() { - if (meta_->config->extra_trees) { - if (meta_->config->monotone_constraints.empty()) { - FuncForCategoricalL1(); - } else { - FuncForCategoricalL1(); - } - } else { - if (meta_->config->monotone_constraints.empty()) { - FuncForCategoricalL1(); - } else { - FuncForCategoricalL1(); - } - } - } + void FuncForCategorical(); template - void FuncForCategoricalL1() { - if (meta_->config->path_smooth > kEpsilon) { - FuncForCategoricalL2(); - } else { - FuncForCategoricalL2(); - } - } + void FuncForCategoricalL1(); template - void FuncForCategoricalL2() { -#define ARGUMENTS \ - std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ - std::placeholders::_4, std::placeholders::_5, std::placeholders::_6 - if (meta_->config->lambda_l1 > 0) { - if 
(meta_->config->max_delta_step > 0) { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, true, USE_SMOOTHING>, - this, ARGUMENTS); - } else { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, true, false, USE_SMOOTHING>, - this, ARGUMENTS); - } - } else { - if (meta_->config->max_delta_step > 0) { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, true, USE_SMOOTHING>, - this, ARGUMENTS); - } else { - find_best_threshold_fun_ = - std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< - USE_RAND, USE_MC, false, false, USE_SMOOTHING>, - this, ARGUMENTS); - } - } -#undef ARGUMENTS - } + void FuncForCategoricalL2(); template void FindBestThresholdCategoricalInner(double sum_gradient, @@ -510,240 +460,16 @@ class FeatureHistogram { data_size_t num_data, const FeatureConstraint* constraints, double parent_output, - SplitInfo* output) { - is_splittable_ = false; - output->default_left = false; - double best_gain = kMinScore; - data_size_t best_left_count = 0; - double best_sum_left_gradient = 0; - double best_sum_left_hessian = 0; - double gain_shift; - if (USE_MC) { - constraints->InitCumulativeConstraints(true); - } - if (USE_SMOOTHING) { - gain_shift = GetLeafGainGivenOutput( - sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output); - } else { - // Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated - // with the larger categorical l2, whereas min_split_gain uses the original l2. - gain_shift = GetLeafGain(sum_gradient, sum_hessian, - meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0, - num_data, 0); - } - - double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; - const int8_t offset = meta_->offset; - const int bin_start = 1 - offset; - const int bin_end = meta_->num_bin - offset; - int used_bin = -1; - - std::vector sorted_idx; - double l2 = meta_->config->lambda_l2; - bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; - int best_threshold = -1; - int best_dir = 1; - const double cnt_factor = num_data / sum_hessian; - int rand_threshold = 0; - if (use_onehot) { - if (USE_RAND) { - if (bin_end - bin_start > 0) { - rand_threshold = meta_->rand.NextInt(bin_start, bin_end); - } - } - for (int t = bin_start; t < bin_end; ++t) { - const auto grad = GET_GRAD(data_, t); - const auto hess = GET_HESS(data_, t); - data_size_t cnt = - static_cast(Common::RoundInt(hess * cnt_factor)); - // if data not enough, or sum hessian too small - if (cnt < meta_->config->min_data_in_leaf || - hess < meta_->config->min_sum_hessian_in_leaf) { - continue; - } - data_size_t other_count = num_data - cnt; - // if data not enough - if (other_count < meta_->config->min_data_in_leaf) { - continue; - } - - double sum_other_hessian = sum_hessian - hess - kEpsilon; - // if sum hessian too small - if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) { - continue; - } - - double sum_other_gradient = sum_gradient - grad; - if (USE_RAND) { - if (t != rand_threshold) { - continue; - } - } - // current split gain - double current_gain = GetSplitGains( - sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, - meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints, 0, meta_->config->path_smooth, other_count, cnt, 
parent_output); - // gain with split is worse than without split - if (current_gain <= min_gain_shift) { - continue; - } - - // mark as able to be split - is_splittable_ = true; - // better split point - if (current_gain > best_gain) { - best_threshold = t; - best_sum_left_gradient = grad; - best_sum_left_hessian = hess + kEpsilon; - best_left_count = cnt; - best_gain = current_gain; - } - } - } else { - for (int i = bin_start; i < bin_end; ++i) { - if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= - meta_->config->cat_smooth) { - sorted_idx.push_back(i); - } - } - used_bin = static_cast(sorted_idx.size()); - - l2 += meta_->config->cat_l2; - - auto ctr_fun = [this](double sum_grad, double sum_hess) { - return (sum_grad) / (sum_hess + meta_->config->cat_smooth); - }; - std::stable_sort( - sorted_idx.begin(), sorted_idx.end(), [this, &ctr_fun](int i, int j) { - return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < - ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j)); - }); - - std::vector find_direction(1, 1); - std::vector start_position(1, 0); - find_direction.push_back(-1); - start_position.push_back(used_bin - 1); - const int max_num_cat = - std::min(meta_->config->max_cat_threshold, (used_bin + 1) / 2); - int max_threshold = std::max(std::min(max_num_cat, used_bin) - 1, 0); - if (USE_RAND) { - if (max_threshold > 0) { - rand_threshold = meta_->rand.NextInt(0, max_threshold); - } - } - - is_splittable_ = false; - for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) { - auto dir = find_direction[out_i]; - auto start_pos = start_position[out_i]; - data_size_t min_data_per_group = meta_->config->min_data_per_group; - data_size_t cnt_cur_group = 0; - double sum_left_gradient = 0.0f; - double sum_left_hessian = kEpsilon; - data_size_t left_count = 0; - for (int i = 0; i < used_bin && i < max_num_cat; ++i) { - auto t = sorted_idx[start_pos]; - start_pos += dir; - const auto grad = GET_GRAD(data_, t); - const auto hess = GET_HESS(data_, t); - data_size_t cnt = - static_cast(Common::RoundInt(hess * cnt_factor)); - - sum_left_gradient += grad; - sum_left_hessian += hess; - left_count += cnt; - cnt_cur_group += cnt; - - if (left_count < meta_->config->min_data_in_leaf || - sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { - continue; - } - data_size_t right_count = num_data - left_count; - if (right_count < meta_->config->min_data_in_leaf || - right_count < min_data_per_group) { - break; - } - - double sum_right_hessian = sum_hessian - sum_left_hessian; - if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { - break; - } - - if (cnt_cur_group < min_data_per_group) { - continue; - } - - cnt_cur_group = 0; + SplitInfo* output); - double sum_right_gradient = sum_gradient - sum_left_gradient; - if (USE_RAND) { - if (i != rand_threshold) { - continue; - } - } - double current_gain = GetSplitGains( - sum_left_gradient, sum_left_hessian, sum_right_gradient, - sum_right_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth, - left_count, right_count, parent_output); - if (current_gain <= min_gain_shift) { - continue; - } - is_splittable_ = true; - if (current_gain > best_gain) { - best_left_count = left_count; - best_sum_left_gradient = sum_left_gradient; - best_sum_left_hessian = sum_left_hessian; - best_threshold = i; - best_gain = current_gain; - best_dir = dir; - } - } - } - } - - if (is_splittable_) { - output->left_output = CalculateSplittedLeafOutput( - best_sum_left_gradient, best_sum_left_hessian, 
- meta_->config->lambda_l1, l2, meta_->config->max_delta_step, - constraints->LeftToBasicConstraint(), meta_->config->path_smooth, best_left_count, parent_output); - output->left_count = best_left_count; - output->left_sum_gradient = best_sum_left_gradient; - output->left_sum_hessian = best_sum_left_hessian - kEpsilon; - output->right_output = CalculateSplittedLeafOutput( - sum_gradient - best_sum_left_gradient, - sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, - meta_->config->max_delta_step, constraints->RightToBasicConstraint(), meta_->config->path_smooth, - num_data - best_left_count, parent_output); - output->right_count = num_data - best_left_count; - output->right_sum_gradient = sum_gradient - best_sum_left_gradient; - output->right_sum_hessian = - sum_hessian - best_sum_left_hessian - kEpsilon; - output->gain = best_gain - min_gain_shift; - if (use_onehot) { - output->num_cat_threshold = 1; - output->cat_threshold = - std::vector(1, static_cast(best_threshold + offset)); - } else { - output->num_cat_threshold = best_threshold + 1; - output->cat_threshold = - std::vector(output->num_cat_threshold); - if (best_dir == 1) { - for (int i = 0; i < output->num_cat_threshold; ++i) { - auto t = sorted_idx[i] + offset; - output->cat_threshold[i] = t; - } - } else { - for (int i = 0; i < output->num_cat_threshold; ++i) { - auto t = sorted_idx[used_bin - 1 - i] + offset; - output->cat_threshold[i] = t; - } - } - } - output->monotone_type = 0; - } - } + template + void FindBestThresholdCategoricalIntInner(int64_t int_sum_gradient_and_hessian, + const double grad_scale, const double hess_scale, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output); void GatherInfoForThreshold(double sum_gradient, double sum_hessian, uint32_t threshold, data_size_t num_data, @@ -1344,7 +1070,7 @@ class FeatureHistogram { PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian = HIST_BITS_ACC == 16 ? ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : - int_sum_gradient_and_hessian; + static_cast(int_sum_gradient_and_hessian); double best_gain = kMinScore; uint32_t best_threshold = static_cast(meta_->num_bin); const double cnt_factor = static_cast(num_data) / @@ -1418,10 +1144,10 @@ class FeatureHistogram { double sum_right_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_right_gradient_and_hessian) >> 32)) * grad_scale; double sum_left_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_left_gradient_and_hessian) >> 32)) * grad_scale; if (USE_RAND) { if (t - 1 + offset != rand_threshold) { continue; @@ -1535,10 +1261,10 @@ class FeatureHistogram { double sum_right_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_right_gradient_and_hessian) >> 32)) * grad_scale; double sum_left_gradient = HIST_BITS_ACC == 16 ? 
static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : - static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + static_cast(static_cast(static_cast(sum_left_gradient_and_hessian) >> 32)) * grad_scale; if (USE_RAND) { if (t + offset != rand_threshold) { continue; @@ -1578,7 +1304,7 @@ class FeatureHistogram { if (is_splittable_ && best_gain > output->gain + min_gain_shift) { const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ? static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) : - static_cast(best_sum_left_gradient_and_hessian >> 32); + static_cast(static_cast(best_sum_left_gradient_and_hessian) >> 32); const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ? static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff) : static_cast(best_sum_left_gradient_and_hessian & 0x00000000ffffffff); diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 666e09217678..bc41c75a7a8c 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -6,6 +6,8 @@ #ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ #define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ +#include + #include #include #include diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index d5c5cc59ef3a..f3a88bd18679 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -315,8 +315,8 @@ void SerialTreeLearner::BeforeTrain() { smaller_leaf_splits_->Init( 0, data_partition_.get(), gradient_discretizer_->discretized_gradients_and_hessians(), - gradient_discretizer_->grad_scale(), - gradient_discretizer_->hess_scale()); + static_cast(gradient_discretizer_->grad_scale()), + static_cast(gradient_discretizer_->hess_scale())); } } diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index ccde38977d2d..3fad36b34407 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -275,7 +275,20 @@ def test_missing_value_handle_none(): assert evals_result["valid_0"]["auc"][-1] == pytest.approx(ret) -def test_categorical_handle(): +@pytest.mark.parametrize( + "use_quantized_grad", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + getenv("TASK", "") == "cuda", + reason="Skip because quantized training with categorical features is not supported for cuda version", + ), + ), + False, + ], +) +def test_categorical_handle(use_quantized_grad): x = [0, 1, 2, 3, 4, 5, 6, 7] y = [0, 1, 0, 1, 0, 1, 0, 1] @@ -299,6 +312,7 @@ def test_categorical_handle(): "max_cat_to_onehot": 1, "zero_as_missing": True, "categorical_column": 0, + "use_quantized_grad": use_quantized_grad, } evals_result = {} gbm = lgb.train( @@ -311,7 +325,20 @@ def test_categorical_handle(): assert evals_result["valid_0"]["auc"][-1] == pytest.approx(ret) -def test_categorical_handle_na(): +@pytest.mark.parametrize( + "use_quantized_grad", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + getenv("TASK", "") == "cuda", + reason="Skip because quantized training with categorical features is not supported for cuda version", + ), + ), + False, + ], +) +def test_categorical_handle_na(use_quantized_grad): x = [0, np.nan, 0, np.nan, 0, np.nan] y = [0, 1, 0, 1, 0, 1] @@ -335,6 +362,7 @@ def test_categorical_handle_na(): "max_cat_to_onehot": 1, "zero_as_missing": False, "categorical_column": 0, + "use_quantized_grad": use_quantized_grad, } 
evals_result = {} gbm = lgb.train( @@ -347,7 +375,20 @@ def test_categorical_handle_na(): assert evals_result["valid_0"]["auc"][-1] == pytest.approx(ret) -def test_categorical_non_zero_inputs(): +@pytest.mark.parametrize( + "use_quantized_grad", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + getenv("TASK", "") == "cuda", + reason="Skip because quantized training with categorical features is not supported for cuda version", + ), + ), + False, + ], +) +def test_categorical_non_zero_inputs(use_quantized_grad): x = [1, 1, 1, 1, 1, 1, 2, 2] y = [1, 1, 1, 1, 1, 1, 0, 0] @@ -371,6 +412,7 @@ def test_categorical_non_zero_inputs(): "max_cat_to_onehot": 1, "zero_as_missing": False, "categorical_column": 0, + "use_quantized_grad": use_quantized_grad, } evals_result = {} gbm = lgb.train( diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index b7848b4cab5a..009c744964d1 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -337,6 +337,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 7010926799b7..00cb875b1218 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -284,6 +284,9 @@ src\treelearner + + src\treelearner + src\treelearner
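
Notes on the quantized histogram layout used above: in the quantized-training branches, each histogram entry packs a signed gradient sum into the high half of an integer and an unsigned hessian sum into the low half, which is why `FindBestThresholdCategoricalIntInner` keeps shifting and masking (`>> 32` with `& 0x00000000ffffffff` for 32-bit accumulation, `>> 16` with `& 0x0000ffff` for 16-bit). The sketch below is illustrative only; `PackGradHess`, the scale values, and the totals are assumptions, not anything from the library. It shows the 64-bit case: real-valued sums come back by multiplying with `grad_scale` / `hess_scale`, and the per-bin data count is recovered from the quantized hessian through a `cnt_factor` ratio, matching the pattern in the patch.

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Hypothetical helper: pack a signed quantized gradient (high 32 bits) and an
// unsigned quantized hessian (low 32 bits) into one 64-bit histogram entry.
int64_t PackGradHess(int32_t int_grad, uint32_t int_hess) {
  return (static_cast<int64_t>(int_grad) << 32) | static_cast<int64_t>(int_hess);
}

int main() {
  // Assumed quantization scales and totals, for illustration only.
  const double grad_scale = 0.01, hess_scale = 0.005;
  const int num_data = 1000;
  const uint32_t int_sum_hessian = 4000;  // quantized hessian total over all bins

  const int64_t entry = PackGradHess(-1200, 800);  // one categorical bin

  // Gradient: arithmetic shift keeps the sign; hessian: mask the low half.
  const int32_t int_grad = static_cast<int32_t>(entry >> 32);
  const uint32_t int_hess = static_cast<uint32_t>(entry & 0x00000000ffffffff);

  const double sum_gradient = int_grad * grad_scale;  // -12.0
  const double sum_hessian = int_hess * hess_scale;   //   4.0

  // Count recovery: scale the quantized hessian by (num_data / total hessian).
  const double cnt_factor = static_cast<double>(num_data) / int_sum_hessian;
  const int cnt = static_cast<int>(std::lround(int_hess * cnt_factor));  // 200

  std::cout << sum_gradient << " " << sum_hessian << " " << cnt << "\n";
}
```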
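
The non-one-hot categorical search in both `FindBestThresholdCategoricalInner` and its quantized counterpart follows the same outline: drop categories with too little data, order the rest by the smoothed ratio sum_gradient / (sum_hessian + cat_smooth), then evaluate prefix splits of that ordering (the real code scans from both ends and applies min-data, min-hessian, and regularization checks). Below is a minimal sketch of the ordering-and-prefix-scan idea only, with made-up per-category statistics and a simplified unregularized gain of G_L^2/H_L + G_R^2/H_R, scanning just the forward direction; it is not the library's implementation.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Per-category sufficient statistics (illustrative, not the library's types).
struct CatStats {
  int cat;
  double sum_grad;
  double sum_hess;
};

int main() {
  // Assumed toy statistics for four categories.
  std::vector<CatStats> bins = {
      {0, -3.0, 2.0}, {1, 1.5, 1.0}, {2, 4.0, 3.0}, {3, -0.5, 1.5}};
  const double cat_smooth = 1.0;

  double total_grad = 0.0, total_hess = 0.0;
  for (const auto& b : bins) {
    total_grad += b.sum_grad;
    total_hess += b.sum_hess;
  }

  // Order categories by the smoothed gradient/hessian ratio, as the ctr_fun
  // lambda in the patch does.
  std::stable_sort(bins.begin(), bins.end(),
                   [&](const CatStats& a, const CatStats& b) {
                     return a.sum_grad / (a.sum_hess + cat_smooth) <
                            b.sum_grad / (b.sum_hess + cat_smooth);
                   });

  // Scan prefix splits of the sorted order; simplified gain with no
  // regularization: G_L^2 / H_L + G_R^2 / H_R (larger is better).
  double best_gain = -1.0;
  std::size_t best_prefix = 0;
  double left_grad = 0.0, left_hess = 0.0;
  for (std::size_t i = 0; i + 1 < bins.size(); ++i) {
    left_grad += bins[i].sum_grad;
    left_hess += bins[i].sum_hess;
    const double right_grad = total_grad - left_grad;
    const double right_hess = total_hess - left_hess;
    const double gain = left_grad * left_grad / left_hess +
                        right_grad * right_grad / right_hess;
    if (gain > best_gain) {
      best_gain = gain;
      best_prefix = i + 1;  // categories bins[0..i] go to the left child
    }
  }

  std::cout << "best prefix size: " << best_prefix
            << ", gain: " << best_gain << "\n";
}
```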