From 8cc70f33eb8e1acf7ee3746a10ee772414827101 Mon Sep 17 00:00:00 2001
From: Jack Gerrits <jackgerrits95@gmail.com>
Date: Fri, 6 Jan 2023 15:23:28 -0500
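Subject: [PATCH 1/4] refactor: migrate GD namespace

Move the symbols of the top-level GD namespace under VW:
- foreach_feature, generate_interactions, inline_predict and trunc_weight
  become VW:: functions.
- gd becomes VW::reductions::gd (per_model_state moves to
  VW::reductions::details).
- Helpers such as vec_add, dummy_func, finalize_prediction, print_features
  and print_audit_features move to VW::details.
- save_load_regressor and save_load_online_state are renamed to
  VW::details::save_load_regressor_gd and save_load_online_state_gd.

Deprecated GD:: forwarding wrappers are added to gd.h and gd_predict.h but
left commented out, so callers have to use the new names.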
---
 python/pylibvw.cc                             |   2 +-
 .../core/include/vw/core/gd_predict.h         | 120 ++++++++-
 .../core/include/vw/core/reductions/gd.h      |  93 ++++++-
 vowpalwabbit/core/src/example.cc              |   2 +-
 vowpalwabbit/core/src/reductions/automl.cc    |   2 +-
 vowpalwabbit/core/src/reductions/bfgs.cc      |  14 +-
 .../cb/cb_explore_adf_large_action_space.cc   |   4 +-
 .../src/reductions/cb/cb_explore_adf_rnd.cc   |   2 +-
 .../large_action/compute_dot_prod_scalar.h    |   2 +-
 .../details/large_action/two_pass_svd_impl.cc |   8 +-
 vowpalwabbit/core/src/reductions/cbzo.cc      |   8 +-
 vowpalwabbit/core/src/reductions/csoaa_ldf.cc |   4 +-
 .../core/src/reductions/epsilon_decay.cc      |   2 +-
 vowpalwabbit/core/src/reductions/freegrad.cc  |  20 +-
 vowpalwabbit/core/src/reductions/ftrl.cc      |  42 ++--
 vowpalwabbit/core/src/reductions/gd.cc        | 238 +++++++++---------
 vowpalwabbit/core/src/reductions/gd_mf.cc     |   8 +-
 vowpalwabbit/core/src/reductions/lda_core.cc  |   2 +-
 vowpalwabbit/core/src/reductions/mf.cc        |   2 +-
 vowpalwabbit/core/src/reductions/mwt.cc       |   2 +-
 vowpalwabbit/core/src/reductions/nn.cc        |   4 +-
 .../core/src/reductions/oja_newton.cc         |  16 +-
 vowpalwabbit/core/src/reductions/print.cc     |   2 +-
 .../core/src/reductions/search/search.cc      |   6 +-
 .../src/reductions/search/search_graph.cc     |   4 +-
 .../core/src/reductions/stagewise_poly.cc     |   4 +-
 vowpalwabbit/core/src/reductions/svrg.cc      |  16 +-
 vowpalwabbit/core/src/vw.cc                   |   2 +-
 .../slim/include/vw/slim/vw_slim_predict.h    |   4 +-
 29 files changed, 404 insertions(+), 231 deletions(-)
diff --git a/python/pylibvw.cc b/python/pylibvw.cc
index c2dba2c4063..e2ab2b06077 100644
--- a/python/pylibvw.cc
+++ b/python/pylibvw.cc
@@ -358,7 +358,7 @@ py::object get_options(vw_ptr all, py::object py_class, bool enabled_only)
   return opt_manager.get_vw_option_pyobjects(enabled_only);
 }
 
-void my_audit_example(vw_ptr all, example_ptr ec) { GD::print_audit_features(*all, *ec); }
+void my_audit_example(vw_ptr all, example_ptr ec) { VW::details::print_audit_features(*all, *ec); }
 
 const char* get_model_id(vw_ptr all) { return all->id.c_str(); }
 
diff --git a/vowpalwabbit/core/include/vw/core/gd_predict.h b/vowpalwabbit/core/include/vw/core/gd_predict.h
index 2185ec2e61c..b2117a3bd71 100644
--- a/vowpalwabbit/core/include/vw/core/gd_predict.h
+++ b/vowpalwabbit/core/include/vw/core/gd_predict.h
@@ -11,8 +11,18 @@
 #undef VW_DEBUG_LOG
 #define VW_DEBUG_LOG vw_dbg::GD_PREDICT
 
-namespace GD
+namespace VW
 {
+  namespace details
+  {
+    template <class DataT>
+inline void dummy_func(DataT&, const VW::audit_strings*)
+{
+}  // should never be called due to call_audit overload
+
+inline void vec_add(float& p, float fx, float fw) { p += fw * fx; }  // accumulator callback used by inline_predict
+
+  }  // namespace details
 // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_index)
 template <class DataT, void (*FuncT)(DataT&, float feature_value, uint64_t feature_index), class WeightsT>
 void foreach_feature(WeightsT& /*weights*/, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
@@ -39,11 +49,6 @@ inline void foreach_feature(
   for (const auto& f : fs) { FuncT(dat, mult * f.value(), weights[static_cast<size_t>(f.index() + offset)]); }
 }
 
-template <class DataT>
-inline void dummy_func(DataT&, const VW::audit_strings*)
-{
-}  // should never be called due to call_audit overload
-
 template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT),
     class WeightsT>  // nullptr func can't be used as template param in old
                      // compilers
@@ -54,7 +59,7 @@ inline void generate_interactions(const std::vector<std::vector<VW::namespace_in
     VW::details::generate_interactions_object_cache& cache)  // default value removed to eliminate
                                                              // ambiguity in old complers
 {
-  VW::generate_interactions<DataT, WeightOrIndexT, FuncT, false, dummy_func<DataT>, WeightsT>(
+  VW::generate_interactions<DataT, WeightOrIndexT, FuncT, false, details::dummy_func<DataT>, WeightsT>(
       interactions, extent_interactions, permutations, ec, dat, weights, num_interacted_features, cache);
 }
 
@@ -100,7 +105,7 @@ inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
       extent_interactions, permutations, ec, dat, num_interacted_features_ignored, cache);
 }
 
-inline void vec_add(float& p, float fx, float fw) { p += fw * fx; }
+
 
 template <class WeightsT>
 inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
@@ -109,7 +114,7 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
 {
-  foreach_feature<float, float, vec_add, WeightsT>(
+  foreach_feature<float, float, details::vec_add, WeightsT>(
       weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec, initial, cache);
   return initial;
 }
@@ -121,8 +126,101 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
 {
-  foreach_feature<float, float, vec_add, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
+  foreach_feature<float, float, details::vec_add, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
       extent_interactions, permutations, ec, initial, num_interacted_features, cache);
   return initial;
 }
-}  // namespace GD
+}  // namespace VW
+// NOTE(review): deprecated GD:: forwarding wrappers, currently commented out; enable or remove.
+// namespace GD
+// {
+
+// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_index)
+// template <class DataT, void (*FuncT)(DataT&, float feature_value, uint64_t feature_index), class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
+// {
+//   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
+// }
+
+// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+// template <class DataT, void (*FuncT)(DataT&, const float feature_value, float& weight_reference), class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
+// {
+//   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
+// }
+
+// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+// template <class DataT, void (*FuncT)(DataT&, float, float), class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(
+//     const WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
+// {
+//   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
+// }
+
+// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT),
+//     class WeightsT>  // nullptr func can't be used as template param in old
+//                      // compilers
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void generate_interactions(const std::vector<std::vector<VW::namespace_index>>& interactions,
+//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+//     DataT& dat, WeightsT& weights, size_t& num_interacted_features,
+//     VW::details::generate_interactions_object_cache& cache)  // default value removed to eliminate
+//                                                              // ambiguity in old compilers
+// {
+//   VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(interactions, extent_interactions, permutations, ec,
+//       dat, weights, num_interacted_features, cache);
+// }
+
+// // iterate through all namespaces and quadratic&cubic features, callback function FuncT(some_data_R, feature_value_x,
+// // WeightOrIndexT) where WeightOrIndexT is EITHER float& feature_weight OR uint64_t feature_index
+// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
+//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+//     const std::vector<std::vector<VW::namespace_index>>& interactions,
+//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+//     DataT& dat, size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache)
+// {
+//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
+//       extent_interactions, permutations, ec, dat, num_interacted_features, cache);
+// }
+
+// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
+//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+//     const std::vector<std::vector<VW::namespace_index>>& interactions,
+//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+//     DataT& dat, VW::details::generate_interactions_object_cache& cache)
+// {
+//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
+//       extent_interactions, permutations, ec, dat, cache);
+// }
+
+// template <class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
+//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+//     const std::vector<std::vector<VW::namespace_index>>& interactions,
+//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+//     VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
+// {
+//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec,
+//       cache, initial);
+// }
+
+// template <class WeightsT>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
+//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+//     const std::vector<std::vector<VW::namespace_index>>& interactions,
+//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+//     size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
+// {
+//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec,
+//       num_interacted_features, cache, initial);
+// }
+// }  // namespace GD
\ No newline at end of file
diff --git a/vowpalwabbit/core/include/vw/core/reductions/gd.h b/vowpalwabbit/core/include/vw/core/reductions/gd.h
index 009cda60456..d2eae5e2f00 100644
--- a/vowpalwabbit/core/include/vw/core/reductions/gd.h
+++ b/vowpalwabbit/core/include/vw/core/reductions/gd.h
@@ -20,20 +20,22 @@ namespace VW
 namespace reductions
 {
 VW::LEARNER::base_learner* gd_setup(VW::setup_base_i& stack_builder);
-}
-}  // namespace VW
-namespace GD
+
+namespace details
 {
+
 class per_model_state
 {
 public:
   double normalized_sum_norm_x = 0.0;
   double total_weight = 0.0;
 };
+}  // namespace details
+
 class gd
 {
 public:
-  std::vector<per_model_state> per_model_states;
+  std::vector<details::per_model_state> per_model_states;
   size_t no_win_counter = 0;
   size_t early_stop_thres = 0;
   float initial_constant = 0.f;
@@ -52,13 +54,17 @@ class gd
   bool adax = false;
   VW::workspace* all = nullptr;  // parallel, features, parameters
 };
+}  // namespace reductions
+
+namespace details
+{
 
 float finalize_prediction(VW::shared_data* sd, VW::io::logger& logger, float ret);
 void print_features(VW::workspace& all, VW::example& ec);
 void print_audit_features(VW::workspace&, VW::example& ec);
-void save_load_regressor(VW::workspace& all, VW::io_buf& model_file, bool read, bool text);
-void save_load_online_state(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, double& total_weight,
-    double& normalized_sum_norm_x, GD::gd* g = nullptr, uint32_t ftrl_size = 0);
+void save_load_regressor_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text);
+void save_load_online_state_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, double& total_weight,
+    double& normalized_sum_norm_x, VW::reductions::gd* g = nullptr, uint32_t ftrl_size = 0);
 
 template <class T>
 class multipredict_info
@@ -99,6 +105,7 @@ inline void vec_add_multipredict(multipredict_info<T>& mp, const float fx, uint6
     }
   }
 }
+}  // namespace details
 
 // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
 template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
@@ -180,7 +187,7 @@ inline float trunc_weight(const float w, const float gravity)
   return (gravity < fabsf(w)) ? w - VW::math::sign(w) * gravity : 0.f;
 }
 
-}  // namespace GD
+}  // namespace VW
 
 namespace VW
 {
@@ -239,3 +246,73 @@ inline void generate_interactions(VW::workspace& all, VW::example_predict& ec, R
 }
 
 }  // namespace INTERACTIONS
+// NOTE(review): deprecated GD:: forwarding wrappers, currently commented out; enable or remove.
+// namespace GD
+// {
+
+// using gd = VW::reductions::gd;
+
+// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
+// {
+//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT>(all, ec, dat);
+// }
+
+// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
+// {
+//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT>(all, ec, dat, num_interacted_features);
+// }
+
+// // iterate through all namespaces and quadratic&cubic features, callback function T(some_data_R, feature_value_x,
+// // feature_weight)
+// template <class DataT, void (*FuncT)(DataT&, float, float&)>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
+// {
+//   VW::foreach_feature<DataT, float&, FuncT>(all, ec, dat);
+// }
+
+// template <class DataT, void (*FuncT)(DataT&, float, float)>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
+// {
+//   VW::foreach_feature<DataT, float, FuncT>(all, ec, dat);
+// }
+
+// template <class DataT, void (*FuncT)(DataT&, float, float&)>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
+// {
+//   VW::foreach_feature<DataT, float&, FuncT>(all, ec, dat, num_interacted_features);
+// }
+
+// template <class DataT, void (*FuncT)(DataT&, float, const float&)>
+// VW_DEPRECATED("Moved to VW namespace")
+// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
+// {
+//   VW::foreach_feature<DataT, const float&, FuncT>(all, ec, dat, num_interacted_features);
+// }
+
+// VW_DEPRECATED("Moved to VW namespace")
+// inline float inline_predict(VW::workspace& all, VW::example& ec)
+// {
+//   return VW::inline_predict(all, ec);
+// }
+
+// VW_DEPRECATED("Moved to VW namespace")
+// inline float inline_predict(VW::workspace& all, VW::example& ec, size_t& num_generated_features)
+// {
+//   return VW::inline_predict(all, ec, num_generated_features);
+// }
+
+// VW_DEPRECATED("Moved to VW namespace")
+// inline float trunc_weight(const float w, const float gravity)
+// {
+//   return VW::trunc_weight(w, gravity);
+// }
+// }
\ No newline at end of file
diff --git a/vowpalwabbit/core/src/example.cc b/vowpalwabbit/core/src/example.cc
index cebe9c1f6b8..dbd671c825a 100644
--- a/vowpalwabbit/core/src/example.cc
+++ b/vowpalwabbit/core/src/example.cc
@@ -122,7 +122,7 @@ flat_example* flatten_example(VW::workspace& all, example* ec)
     ffs.mask = all.weights.mask() >> all.weights.stride_shift();
   }
   else { ffs.mask = static_cast<uint64_t>(LONG_MAX) >> all.weights.stride_shift(); }
-  GD::foreach_feature<full_features_and_source, uint64_t, vec_ffs_store>(all, *ec, ffs);
+  VW::foreach_feature<full_features_and_source, uint64_t, vec_ffs_store>(all, *ec, ffs);
 
   std::swap(fec.fs, ffs.fs);
 
diff --git a/vowpalwabbit/core/src/reductions/automl.cc b/vowpalwabbit/core/src/reductions/automl.cc
index dbe368aaccf..1a14a17765e 100644
--- a/vowpalwabbit/core/src/reductions/automl.cc
+++ b/vowpalwabbit/core/src/reductions/automl.cc
@@ -184,7 +184,7 @@ VW::LEARNER::base_learner* make_automl_with_impl(VW::setup_base_i& stack_builder
   auto ppw = max_live_configs;
   auto* persist_ptr = verbose_metrics ? persist<config_manager_type, true> : persist<config_manager_type, false>;
   data->adf_learner = as_multiline(base_learner->get_learner_by_name_prefix("cb_adf"));
-  GD::gd& gd = *static_cast<GD::gd*>(
+  VW::reductions::gd& gd = *static_cast<VW::reductions::gd*>(
       base_learner->get_learner_by_name_prefix("gd")->get_internal_type_erased_data_pointer_test_use_only());
   auto& adf_data =
       *static_cast<VW::reductions::cb_adf*>(data->adf_learner->get_internal_type_erased_data_pointer_test_use_only());
diff --git a/vowpalwabbit/core/src/reductions/bfgs.cc b/vowpalwabbit/core/src/reductions/bfgs.cc
index 271f3c6a518..339ba579359 100644
--- a/vowpalwabbit/core/src/reductions/bfgs.cc
+++ b/vowpalwabbit/core/src/reductions/bfgs.cc
@@ -162,8 +162,8 @@ constexpr bool test_example(VW::example& ec) noexcept { return ec.l.simple.label
 
 float bfgs_predict(VW::workspace& all, VW::example& ec)
 {
-  ec.partial_prediction = GD::inline_predict(all, ec);
-  return GD::finalize_prediction(all.sd, all.logger, ec.partial_prediction);
+  ec.partial_prediction = VW::inline_predict(all, ec);
+  return VW::details::finalize_prediction(all.sd, all.logger, ec.partial_prediction);
 }
 
 inline void add_grad(float& d, float f, float& fw) { (&fw)[W_GT] += d * f; }
@@ -175,7 +175,7 @@ float predict_and_gradient(VW::workspace& all, VW::example& ec)
   all.set_minmax(all.sd, ld.label);
 
   float loss_grad = all.loss->first_derivative(all.sd, fp, ld.label) * ec.weight;
-  GD::foreach_feature<float, add_grad>(all, ec, loss_grad);
+  VW::foreach_feature<float, add_grad>(all, ec, loss_grad);
 
   return fp;
 }
@@ -185,7 +185,7 @@ inline void add_precond(float& d, float f, float& fw) { (&fw)[W_COND] += d * f *
 void update_preconditioner(VW::workspace& all, VW::example& ec)
 {
   float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
-  GD::foreach_feature<float, add_precond>(all, ec, curvature);
+  VW::foreach_feature<float, add_precond>(all, ec, curvature);
 }
 
 inline void add_dir(float& p, const float fx, float& fw) { p += (&fw)[W_DIR] * fx; }
@@ -194,7 +194,7 @@ float dot_with_direction(VW::workspace& all, VW::example& ec)
 {
   const auto& simple_red_features = ec.ex_reduction_features.template get<VW::simple_label_reduction_features>();
   float temp = simple_red_features.initial;
-  GD::foreach_feature<float, add_dir>(all, ec, temp);
+  VW::foreach_feature<float, add_dir>(all, ec, temp);
   return temp;
 }
 
@@ -982,7 +982,7 @@ void predict(bfgs& b, base_learner&, VW::example& ec)
 {
   VW::workspace* all = b.all;
   ec.pred.scalar = bfgs_predict(*all, ec);
-  if (audit) { GD::print_audit_features(*(b.all), ec); }
+  if (audit) { VW::details::print_audit_features(*(b.all), ec); }
 }
 
 template <bool audit>
@@ -1103,7 +1103,7 @@ void save_load(bfgs& b, VW::io_buf& model_file, bool read, bool text)
         model_file, reinterpret_cast<char*>(&reg_vector), sizeof(reg_vector), read, msg, text);
 
     if (reg_vector) { save_load_regularizer(*all, b, model_file, read, text); }
-    else { GD::save_load_regressor(*all, model_file, read, text); }
+    else { VW::details::save_load_regressor_gd(*all, model_file, read, text); }
   }
 }
 
diff --git a/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_large_action_space.cc b/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_large_action_space.cc
index 28293313628..73bf987976c 100644
--- a/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_large_action_space.cc
+++ b/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_large_action_space.cc
@@ -72,7 +72,7 @@ bool _test_only_generate_A(VW::workspace* _all, const multi_ex& examples, std::v
     if (_all->weights.sparse)
     {
       A_triplet_constructor w(_all->weights.sparse_weights.mask(), row_index, _triplets, max_non_zero_col);
-      GD::foreach_feature<A_triplet_constructor, uint64_t, triplet_construction, sparse_parameters>(
+      VW::foreach_feature<A_triplet_constructor, uint64_t, triplet_construction, sparse_parameters>(
           _all->weights.sparse_weights, _all->ignore_some_linear, _all->ignore_linear,
           (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
           (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
@@ -83,7 +83,7 @@ bool _test_only_generate_A(VW::workspace* _all, const multi_ex& examples, std::v
     {
       A_triplet_constructor w(_all->weights.dense_weights.mask(), row_index, _triplets, max_non_zero_col);
 
-      GD::foreach_feature<A_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
+      VW::foreach_feature<A_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
           _all->weights.dense_weights, _all->ignore_some_linear, _all->ignore_linear,
           (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
           (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
diff --git a/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_rnd.cc b/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_rnd.cc
index 4da6aa54071..b1b6b43ac0f 100644
--- a/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_rnd.cc
+++ b/vowpalwabbit/core/src/reductions/cb/cb_explore_adf_rnd.cc
@@ -161,7 +161,7 @@ float cb_explore_adf_rnd::get_initial_prediction(VW::example* ec)
   lazy_gaussian w;
 
   std::pair<float, float> dotwithnorm(0.f, 0.f);
-  GD::foreach_feature<std::pair<float, float>, float, vec_add_with_norm, lazy_gaussian>(w, _all->ignore_some_linear,
+  VW::foreach_feature<std::pair<float, float>, float, vec_add_with_norm, lazy_gaussian>(w, _all->ignore_some_linear,
       _all->ignore_linear, _all->interactions, _all->extent_interactions, _all->permutations, *ec, dotwithnorm,
       _all->generate_interactions_object_cache_state);
 
diff --git a/vowpalwabbit/core/src/reductions/cb/details/large_action/compute_dot_prod_scalar.h b/vowpalwabbit/core/src/reductions/cb/details/large_action/compute_dot_prod_scalar.h
index dd0e59f7452..b5752c2bb87 100644
--- a/vowpalwabbit/core/src/reductions/cb/details/large_action/compute_dot_prod_scalar.h
+++ b/vowpalwabbit/core/src/reductions/cb/details/large_action/compute_dot_prod_scalar.h
@@ -55,7 +55,7 @@ inline float compute_dot_prod_scalar(uint64_t col, VW::workspace* _all, uint64_t
 
   AO_triplet_constructor tc(_all->weights.mask(), col, _seed, final_dot_prod);
 
-  GD::foreach_feature<AO_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
+  VW::foreach_feature<AO_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
       _all->weights.dense_weights, _all->ignore_some_linear, _all->ignore_linear,
       (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
       (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
diff --git a/vowpalwabbit/core/src/reductions/cb/details/large_action/two_pass_svd_impl.cc b/vowpalwabbit/core/src/reductions/cb/details/large_action/two_pass_svd_impl.cc
index 6059bdd50dd..f0680426e8a 100644
--- a/vowpalwabbit/core/src/reductions/cb/details/large_action/two_pass_svd_impl.cc
+++ b/vowpalwabbit/core/src/reductions/cb/details/large_action/two_pass_svd_impl.cc
@@ -96,7 +96,7 @@ bool two_pass_svd_impl::generate_Y(const multi_ex& examples, const std::vector<f
       {
         Y_triplet_constructor tc(_all->weights.sparse_weights.mask(), row_index, col, _seed, _triplets,
             max_non_zero_col, non_zero_rows, shrink_factors);
-        GD::foreach_feature<Y_triplet_constructor, uint64_t, triplet_construction, sparse_parameters>(
+        VW::foreach_feature<Y_triplet_constructor, uint64_t, triplet_construction, sparse_parameters>(
             _all->weights.sparse_weights, _all->ignore_some_linear, _all->ignore_linear,
             (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
             (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
@@ -107,7 +107,7 @@ bool two_pass_svd_impl::generate_Y(const multi_ex& examples, const std::vector<f
       {
         Y_triplet_constructor tc(_all->weights.dense_weights.mask(), row_index, col, _seed, _triplets, max_non_zero_col,
             non_zero_rows, shrink_factors);
-        GD::foreach_feature<Y_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
+        VW::foreach_feature<Y_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
             _all->weights.dense_weights, _all->ignore_some_linear, _all->ignore_linear,
             (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
             (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
@@ -152,7 +152,7 @@ void two_pass_svd_impl::generate_B(const multi_ex& examples, const std::vector<f
       if (_all->weights.sparse)
       {
         B_triplet_constructor tc(_all->weights.sparse_weights.mask(), col, Y, final_dot_prod);
-        GD::foreach_feature<B_triplet_constructor, uint64_t, triplet_construction, sparse_parameters>(
+        VW::foreach_feature<B_triplet_constructor, uint64_t, triplet_construction, sparse_parameters>(
             _all->weights.sparse_weights, _all->ignore_some_linear, _all->ignore_linear,
             (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
             (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
@@ -162,7 +162,7 @@ void two_pass_svd_impl::generate_B(const multi_ex& examples, const std::vector<f
       else
       {
         B_triplet_constructor tc(_all->weights.dense_weights.mask(), col, Y, final_dot_prod);
-        GD::foreach_feature<B_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
+        VW::foreach_feature<B_triplet_constructor, uint64_t, triplet_construction, dense_parameters>(
             _all->weights.dense_weights, _all->ignore_some_linear, _all->ignore_linear,
             (red_features.generated_interactions ? *red_features.generated_interactions : *ex->interactions),
             (red_features.generated_extent_interactions ? *red_features.generated_extent_interactions
diff --git a/vowpalwabbit/core/src/reductions/cbzo.cc b/vowpalwabbit/core/src/reductions/cbzo.cc
index 04470787007..2d57dfc7f56 100644
--- a/vowpalwabbit/core/src/reductions/cbzo.cc
+++ b/vowpalwabbit/core/src/reductions/cbzo.cc
@@ -84,7 +84,7 @@ inline float constant_inference(VW::workspace& all)
 float linear_inference(VW::workspace& all, VW::example& ec)
 {
   float dotprod = 0;
-  GD::foreach_feature<float, accumulate_dotprod>(all, ec, dotprod);
+  VW::foreach_feature<float, accumulate_dotprod>(all, ec, dotprod);
   return dotprod;
 }
 
@@ -139,7 +139,7 @@ void linear_update(cbzo& data, VW::example& ec)
   upd_data.part_grad = part_grad;
   upd_data.all = data.all;
 
-  GD::foreach_feature<linear_update_data, uint64_t, linear_per_feature_update<feature_mask_off>>(
+  VW::foreach_feature<linear_update_data, uint64_t, linear_per_feature_update<feature_mask_off>>(
       *data.all, ec, upd_data);
 }
 
@@ -167,7 +167,7 @@ void print_audit_features(VW::workspace& all, VW::example& ec)
         VW::to_string(ec.pred.pdf, std::numeric_limits<float>::max_digits10), ec.tag, all.logger);
   }
 
-  GD::print_features(all, ec);
+  VW::details::print_features(all, ec);
 }
 
 // Returns a value close to x and greater than it
@@ -229,7 +229,7 @@ void NO_SANITIZE_UNDEFINED learn(cbzo& data, base_learner& base, VW::example& ec
 
 inline void save_load_regressor(VW::workspace& all, VW::io_buf& model_file, bool read, bool text)
 {
-  GD::save_load_regressor(all, model_file, read, text);
+  VW::details::save_load_regressor_gd(all, model_file, read, text);
 }
 
 void save_load(cbzo& data, VW::io_buf& model_file, bool read, bool text)
diff --git a/vowpalwabbit/core/src/reductions/csoaa_ldf.cc b/vowpalwabbit/core/src/reductions/csoaa_ldf.cc
index 321de353db6..cb3aec614ce 100644
--- a/vowpalwabbit/core/src/reductions/csoaa_ldf.cc
+++ b/vowpalwabbit/core/src/reductions/csoaa_ldf.cc
@@ -13,7 +13,7 @@
 #include "vw/core/loss_functions.h"
 #include "vw/core/prediction_type.h"
 #include "vw/core/print_utils.h"
-#include "vw/core/reductions/gd.h"  // GD::foreach_feature() needed in subtract_example()
+#include "vw/core/reductions/gd.h"  // VW::foreach_feature() needed in subtract_example()
 #include "vw/core/scope_exit.h"
 #include "vw/core/setup_base.h"
 #include "vw/core/shared_data.h"
@@ -78,7 +78,7 @@ void subtract_example(VW::workspace& all, VW::example* ec, VW::example* ecsub)
 {
   auto& wap_fs = ec->feature_space[VW::details::WAP_LDF_NAMESPACE];
   wap_fs.sum_feat_sq = 0;
-  GD::foreach_feature<VW::example&, uint64_t, subtract_feature>(all, *ecsub, *ec);
+  VW::foreach_feature<VW::example&, uint64_t, subtract_feature>(all, *ecsub, *ec);
   ec->indices.push_back(VW::details::WAP_LDF_NAMESPACE);
   ec->num_features += wap_fs.size();
   ec->reset_total_sum_feat_sq();
diff --git a/vowpalwabbit/core/src/reductions/epsilon_decay.cc b/vowpalwabbit/core/src/reductions/epsilon_decay.cc
index 919309941cd..fc7449cd469 100644
--- a/vowpalwabbit/core/src/reductions/epsilon_decay.cc
+++ b/vowpalwabbit/core/src/reductions/epsilon_decay.cc
@@ -385,7 +385,7 @@ VW::LEARNER::base_learner* VW::reductions::epsilon_decay_setup(VW::setup_base_i&
   // to make sure there are not subtle bugs
   auto* base_learner = stack_builder.setup_base_learner();
 
-  GD::gd& gd = *static_cast<GD::gd*>(
+  VW::reductions::gd& gd = *static_cast<VW::reductions::gd*>(
       base_learner->get_learner_by_name_prefix("gd")->get_internal_type_erased_data_pointer_test_use_only());
   auto& adf_data =
       *static_cast<VW::reductions::cb_adf*>(as_multiline(base_learner->get_learner_by_name_prefix("cb_adf"))
diff --git a/vowpalwabbit/core/src/reductions/freegrad.cc b/vowpalwabbit/core/src/reductions/freegrad.cc
index 8fcd8664ff9..8335ba384e7 100644
--- a/vowpalwabbit/core/src/reductions/freegrad.cc
+++ b/vowpalwabbit/core/src/reductions/freegrad.cc
@@ -69,10 +69,10 @@ template <bool audit>
 void predict(freegrad& b, base_learner& /* base */, VW::example& ec)
 {
   size_t num_features_from_interactions = 0;
-  ec.partial_prediction = GD::inline_predict(*b.all, ec, num_features_from_interactions);
+  ec.partial_prediction = VW::inline_predict(*b.all, ec, num_features_from_interactions);
   ec.num_features_from_interactions = num_features_from_interactions;
-  ec.pred.scalar = GD::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
-  if (audit) { GD::print_audit_features(*(b.all), ec); }
+  ec.pred.scalar = VW::details::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
+  if (audit) { VW::details::print_audit_features(*(b.all), ec); }
 }
 
 void inner_freegrad_predict(freegrad_update_data& d, float x, float& wref)
@@ -109,7 +109,7 @@ void freegrad_predict(freegrad& fg, VW::example& ec)
   float projection_radius;
 
   // Compute the unprojected predict
-  GD::foreach_feature<freegrad_update_data, inner_freegrad_predict>(
+  VW::foreach_feature<freegrad_update_data, inner_freegrad_predict>(
       *fg.all, ec, fg.update_data, num_features_from_interactions);
   norm_w_pred = sqrtf(fg.update_data.squared_norm_prediction);
 
@@ -124,7 +124,7 @@ void freegrad_predict(freegrad& fg, VW::example& ec)
   ec.partial_prediction = fg.update_data.predict;
 
   ec.num_features_from_interactions = num_features_from_interactions;
-  ec.pred.scalar = GD::finalize_prediction(fg.all->sd, fg.all->logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(fg.all->sd, fg.all->logger, ec.partial_prediction);
 }
 
 void gradient_dot_w(freegrad_update_data& d, float x, float& wref)
@@ -253,10 +253,10 @@ void freegrad_update_after_prediction(freegrad& fg, VW::example& ec)
   fg.update_data.update = fg.all->loss->first_derivative(fg.all->sd, ec.pred.scalar, ec.l.simple.label);
 
   // Compute gradient norm
-  GD::foreach_feature<freegrad_update_data, gradient_dot_w>(*fg.all, ec, fg.update_data);
+  VW::foreach_feature<freegrad_update_data, gradient_dot_w>(*fg.all, ec, fg.update_data);
 
   // Performing the update
-  GD::foreach_feature<freegrad_update_data, inner_freegrad_update_after_prediction>(*fg.all, ec, fg.update_data);
+  VW::foreach_feature<freegrad_update_data, inner_freegrad_update_after_prediction>(*fg.all, ec, fg.update_data);
 
   // Update the maximum gradient norm value
   clipped_grad_norm = sqrtf(fg.update_data.squared_norm_clipped_grad);
@@ -277,7 +277,7 @@ void learn_freegrad(freegrad& a, base_learner& /* base */, VW::example& ec)
 {
   // update state based on the example and predict
   freegrad_predict(a, ec);
-  if (audit) { GD::print_audit_features(*(a.all), ec); }
+  if (audit) { VW::details::print_audit_features(*(a.all), ec); }
 
   // update state based on the prediction
   freegrad_update_after_prediction(a, ec);
@@ -298,10 +298,10 @@ void save_load(freegrad& fg, VW::io_buf& model_file, bool read, bool text)
 
     if (resume)
     {
-      GD::save_load_online_state(
+      VW::details::save_load_online_state_gd(
           *all, model_file, read, text, fg.total_weight, fg.normalized_sum_norm_x, nullptr, fg.freegrad_size);
     }
-    else { GD::save_load_regressor(*all, model_file, read, text); }
+    else { VW::details::save_load_regressor_gd(*all, model_file, read, text); }
   }
 }
 
diff --git a/vowpalwabbit/core/src/reductions/ftrl.cc b/vowpalwabbit/core/src/reductions/ftrl.cc
index ad6fada5305..7b44a810058 100644
--- a/vowpalwabbit/core/src/reductions/ftrl.cc
+++ b/vowpalwabbit/core/src/reductions/ftrl.cc
@@ -85,7 +85,7 @@ inline void predict_with_confidence(uncertainty& d, const float fx, float& fw)
 float sensitivity(ftrl& b, base_learner& /* base */, VW::example& ec)
 {
   uncertainty uncetain(b);
-  GD::foreach_feature<uncertainty, predict_with_confidence>(*(b.all), ec, uncetain);
+  VW::foreach_feature<uncertainty, predict_with_confidence>(*(b.all), ec, uncetain);
   return uncetain.score;
 }
 
@@ -93,10 +93,10 @@ template <bool audit>
 void predict(ftrl& b, base_learner&, VW::example& ec)
 {
   size_t num_features_from_interactions = 0;
-  ec.partial_prediction = GD::inline_predict(*b.all, ec, num_features_from_interactions);
+  ec.partial_prediction = VW::inline_predict(*b.all, ec, num_features_from_interactions);
   ec.num_features_from_interactions = num_features_from_interactions;
-  ec.pred.scalar = GD::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
-  if (audit) { GD::print_audit_features(*(b.all), ec); }
+  ec.pred.scalar = VW::details::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
+  if (audit) { VW::details::print_audit_features(*(b.all), ec); }
 }
 
 template <bool audit>
@@ -112,16 +112,16 @@ void multipredict(ftrl& b, base_learner&, VW::example& ec, size_t count, size_t
   size_t num_features_from_interactions = 0;
   if (b.all->weights.sparse)
   {
-    GD::multipredict_info<VW::sparse_parameters> mp = {
+    VW::details::multipredict_info<VW::sparse_parameters> mp = {
         count, step, pred, all.weights.sparse_weights, static_cast<float>(all.sd->gravity)};
-    GD::foreach_feature<GD::multipredict_info<VW::sparse_parameters>, uint64_t, GD::vec_add_multipredict>(
+    VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t, VW::details::vec_add_multipredict>(
         all, ec, mp, num_features_from_interactions);
   }
   else
   {
-    GD::multipredict_info<VW::dense_parameters> mp = {
+    VW::details::multipredict_info<VW::dense_parameters> mp = {
         count, step, pred, all.weights.dense_weights, static_cast<float>(all.sd->gravity)};
-    GD::foreach_feature<GD::multipredict_info<VW::dense_parameters>, uint64_t, GD::vec_add_multipredict>(
+    VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t, VW::details::vec_add_multipredict>(
         all, ec, mp, num_features_from_interactions);
   }
   ec.num_features_from_interactions = num_features_from_interactions;
@@ -131,14 +131,14 @@ void multipredict(ftrl& b, base_learner&, VW::example& ec, size_t count, size_t
   }
   if (finalize_predictions)
   {
-    for (size_t c = 0; c < count; c++) { pred[c].scalar = GD::finalize_prediction(all.sd, all.logger, pred[c].scalar); }
+    for (size_t c = 0; c < count; c++) { pred[c].scalar = VW::details::finalize_prediction(all.sd, all.logger, pred[c].scalar); }
   }
   if (audit)
   {
     for (size_t c = 0; c < count; c++)
     {
       ec.pred.scalar = pred[c].scalar;
-      GD::print_audit_features(all, ec);
+      VW::details::print_audit_features(all, ec);
       ec.ft_offset += static_cast<uint64_t>(step);
     }
     ec.ft_offset -= static_cast<uint64_t>(step * count);
@@ -252,7 +252,7 @@ void coin_betting_predict(ftrl& b, base_learner&, VW::example& ec)
   b.data.normalized_squared_norm_x = 0;
 
   size_t num_features_from_interactions = 0;
-  GD::foreach_feature<ftrl_update_data, inner_coin_betting_predict>(*b.all, ec, b.data, num_features_from_interactions);
+  VW::foreach_feature<ftrl_update_data, inner_coin_betting_predict>(*b.all, ec, b.data, num_features_from_interactions);
   ec.num_features_from_interactions = num_features_from_interactions;
 
   b.normalized_sum_norm_x += (static_cast<double>(ec.weight)) * b.data.normalized_squared_norm_x;
@@ -261,7 +261,7 @@ void coin_betting_predict(ftrl& b, base_learner&, VW::example& ec)
 
   ec.partial_prediction = b.data.predict / b.data.average_squared_norm_x;
 
-  ec.pred.scalar = GD::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
 }
 
 void update_state_and_predict_pistol(ftrl& b, base_learner&, VW::example& ec)
@@ -269,30 +269,30 @@ void update_state_and_predict_pistol(ftrl& b, base_learner&, VW::example& ec)
   b.data.predict = 0;
 
   size_t num_features_from_interactions = 0;
-  GD::foreach_feature<ftrl_update_data, inner_update_pistol_state_and_predict>(
+  VW::foreach_feature<ftrl_update_data, inner_update_pistol_state_and_predict>(
       *b.all, ec, b.data, num_features_from_interactions);
   ec.num_features_from_interactions = num_features_from_interactions;
 
   ec.partial_prediction = b.data.predict;
-  ec.pred.scalar = GD::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(b.all->sd, b.all->logger, ec.partial_prediction);
 }
 
 void update_after_prediction_proximal(ftrl& b, VW::example& ec)
 {
   b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
-  GD::foreach_feature<ftrl_update_data, inner_update_proximal>(*b.all, ec, b.data);
+  VW::foreach_feature<ftrl_update_data, inner_update_proximal>(*b.all, ec, b.data);
 }
 
 void update_after_prediction_pistol(ftrl& b, VW::example& ec)
 {
   b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
-  GD::foreach_feature<ftrl_update_data, inner_update_pistol_post>(*b.all, ec, b.data);
+  VW::foreach_feature<ftrl_update_data, inner_update_pistol_post>(*b.all, ec, b.data);
 }
 
 void coin_betting_update_after_prediction(ftrl& b, VW::example& ec)
 {
   b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
-  GD::foreach_feature<ftrl_update_data, inner_coin_betting_update_after_prediction>(*b.all, ec, b.data);
+  VW::foreach_feature<ftrl_update_data, inner_coin_betting_update_after_prediction>(*b.all, ec, b.data);
 }
 
 // NO_SANITIZE_UNDEFINED needed in learn functions because
@@ -312,7 +312,7 @@ void NO_SANITIZE_UNDEFINED learn_pistol(ftrl& a, base_learner& base, VW::example
 {
   // update state based on the example and predict
   update_state_and_predict_pistol(a, base, ec);
-  if (audit) { GD::print_audit_features(*(a.all), ec); }
+  if (audit) { VW::details::print_audit_features(*(a.all), ec); }
   // update state based on the prediction
   update_after_prediction_pistol(a, ec);
 }
@@ -322,7 +322,7 @@ void NO_SANITIZE_UNDEFINED learn_coin_betting(ftrl& a, base_learner& base, VW::e
 {
   // update state based on the example and predict
   coin_betting_predict(a, base, ec);
-  if (audit) { GD::print_audit_features(*(a.all), ec); }
+  if (audit) { VW::details::print_audit_features(*(a.all), ec); }
   // update state based on the prediction
   coin_betting_update_after_prediction(a, ec);
 }
@@ -342,10 +342,10 @@ void save_load(ftrl& b, VW::io_buf& model_file, bool read, bool text)
 
     if (resume)
     {
-      GD::save_load_online_state(
+      VW::details::save_load_online_state_gd(
           *all, model_file, read, text, b.total_weight, b.normalized_sum_norm_x, nullptr, b.ftrl_size);
     }
-    else { GD::save_load_regressor(*all, model_file, read, text); }
+    else { VW::details::save_load_regressor_gd(*all, model_file, read, text); }
   }
 }
 
diff --git a/vowpalwabbit/core/src/reductions/gd.cc b/vowpalwabbit/core/src/reductions/gd.cc
index 4dbd8e9e7f8..ed7be22f68a 100644
--- a/vowpalwabbit/core/src/reductions/gd.cc
+++ b/vowpalwabbit/core/src/reductions/gd.cc
@@ -13,6 +13,8 @@
 #include "vw/core/setup_base.h"
 
 #include <cfloat>
+#include <algorithm>
+
 
 #if !defined(VW_NO_INLINE_SIMD)
 #  if !defined(__SSE2__) && (defined(_M_AMD64) || defined(_M_X64))
@@ -42,11 +44,11 @@
 using namespace VW::LEARNER;
 using namespace VW::config;
 
+namespace
+{
 constexpr double L1_STATE_DEFAULT = 0.;
 constexpr double L2_STATE_DEFAULT = 1.;
 
-namespace
-{
 template <typename WeightsT>
 void merge_weights_simple(size_t length, const std::vector<std::reference_wrapper<const WeightsT>>& source,
     const std::vector<float>& per_model_weighting, WeightsT& weights)
@@ -94,14 +96,37 @@ void copy_weights(WeightsT& dest, const WeightsT& source, size_t length)
   const size_t full_weights_size = length << dest.stride_shift();
   for (size_t i = 0; i < full_weights_size; i++) { dest[i] = source[i]; }
 }
-}  // namespace
 
-// todo:
-// 4. Factor various state out of VW::workspace&
-namespace GD
+// Applies the pending sd->gravity and sd->contraction to the stored weights, then resets both to their no-op values.
+void sync_weights(VW::workspace& all)
 {
-void sync_weights(VW::workspace& all);
+  // todo, fix length dependence
+  if (all.sd->gravity == 0. && all.sd->contraction == 1.)
+  {  // to avoid unnecessary weight synchronization
+    return;
+  }
 
+  if (all.weights.sparse)  // sparse and dense stores get the identical per-weight transform
+  {
+    for (VW::weight& w : all.weights.sparse_weights)
+    {
+      w = VW::trunc_weight(w, static_cast<float>(all.sd->gravity)) * static_cast<float>(all.sd->contraction);
+    }
+  }
+  else
+  {
+    for (VW::weight& w : all.weights.dense_weights)
+    {
+      w = VW::trunc_weight(w, static_cast<float>(all.sd->gravity)) * static_cast<float>(all.sd->contraction);
+    }
+  }
+
+  all.sd->gravity = 0.;      // both are now applied to the weights, so reset them;
+  all.sd->contraction = 1.;  // the early return above then fires until they change again
+}
+
+VW_WARNING_STATE_PUSH
+VW_WARNING_DISABLE_UNUSED_FUNCTION
 inline float quake_inv_sqrt(float x)
 {
   // Carmack/Quake/SGI fast method:
@@ -113,6 +138,7 @@ inline float quake_inv_sqrt(float x)
   x = x * (1.5f - xhalf * x * x);     // One round of Newton's method
   return x;
 }
+VW_WARNING_STATE_POP
 
 static inline float inv_sqrt(float x)
 {
@@ -141,6 +167,7 @@ static inline float inv_sqrt(float x)
 
   return x;
 }
+
 VW_WARNING_STATE_PUSH
 VW_WARNING_DISABLE_COND_CONST_EXPR
 template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
@@ -173,14 +200,14 @@ float average_update(float total_weight, float normalized_sum_norm_x, float neg_
 }
 
 template <bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
-void train(gd& g, VW::example& ec, float update)
+void train(VW::reductions::gd& g, VW::example& ec, float update)
 {
   if VW_STD17_CONSTEXPR (normalized != 0) { update *= g.update_multiplier; }
   VW_DBG(ec) << "gd: train() spare=" << spare << std::endl;
-  foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare>>(*g.all, ec, update);
+  VW::foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare>>(*g.all, ec, update);
 }
 
-void end_pass(gd& g)
+void end_pass(VW::reductions::gd& g)
 {
   VW::workspace& all = *g.all;
 
@@ -209,7 +236,7 @@ void end_pass(gd& g)
 }
 
 void merge(const std::vector<float>& per_model_weighting, const std::vector<const VW::workspace*>& all_workspaces,
-    const std::vector<GD::gd*>& all_data, VW::workspace& output_workspace, GD::gd& output_data)
+    const std::vector<VW::reductions::gd*>& all_data, VW::workspace& output_workspace, VW::reductions::gd& output_data)
 {
   const size_t length = static_cast<size_t>(1) << output_workspace.num_bits;
 
@@ -248,8 +275,8 @@ void merge(const std::vector<float>& per_model_weighting, const std::vector<cons
   }
 }
 
-void add(const VW::workspace& /* ws1 */, const GD::gd& data1, const VW::workspace& ws2, GD::gd& data2,
-    VW::workspace& ws_out, GD::gd& data_out)
+void add(const VW::workspace& /* ws1 */, const VW::reductions::gd& data1, const VW::workspace& ws2, VW::reductions::gd& data2,
+    VW::workspace& ws_out, VW::reductions::gd& data_out)
 {
   const size_t length = static_cast<size_t>(1) << ws_out.num_bits;
   // When adding, output the weights from the model delta (2nd arugment to addition)
@@ -267,8 +294,8 @@ void add(const VW::workspace& /* ws1 */, const GD::gd& data1, const VW::workspac
   }
 }
 
-void subtract(const VW::workspace& ws1, const GD::gd& data1, const VW::workspace& /* ws2 */, GD::gd& data2,
-    VW::workspace& ws_out, GD::gd& data_out)
+void subtract(const VW::workspace& ws1, const VW::reductions::gd& data1, const VW::workspace& /* ws2 */, VW::reductions::gd& data2,
+    VW::workspace& ws_out, VW::reductions::gd& data_out)
 {
   const size_t length = static_cast<size_t>(1) << ws_out.num_bits;
   // When subtracting, output the weights from the newer model (1st arugment to subtraction)
@@ -286,8 +313,6 @@ void subtract(const VW::workspace& ws1, const GD::gd& data1, const VW::workspace
   }
 }
 
-#include <algorithm>
-
 class string_value
 {
 public:
@@ -335,7 +360,7 @@ inline void audit_feature(audit_results& dat, const float ft_weight, const uint6
   if (dat.all.audit)
   {
     tempstream << ':' << (index >> stride_shift) << ':' << ft_weight << ':'
-               << trunc_weight(weights[index], static_cast<float>(dat.all.sd->gravity)) *
+               << VW::trunc_weight(weights[index], static_cast<float>(dat.all.sd->gravity)) *
             static_cast<float>(dat.all.sd->contraction);
 
     if (weights.adaptive)
@@ -360,7 +385,6 @@ inline void audit_feature(audit_results& dat, const float ft_weight, const uint6
     }
   }
 }
-
 void print_lda_features(VW::workspace& all, VW::example& ec)
 {
   VW::parameters& weights = all.weights;
@@ -379,8 +403,9 @@ void print_lda_features(VW::workspace& all, VW::example& ec)
   }
   std::cout << " total of " << count << " features." << std::endl;
 }
+}  // namespace
 
-void print_features(VW::workspace& all, VW::example& ec)
+void VW::details::print_features(VW::workspace& all, VW::example& ec)
 {
   if (all.lda > 0) { print_lda_features(all, ec); }
   else
@@ -420,14 +445,14 @@ void print_features(VW::workspace& all, VW::example& ec)
   }
 }
 
-void print_audit_features(VW::workspace& all, VW::example& ec)
+void VW::details::print_audit_features(VW::workspace& all, VW::example& ec)
 {
   if (all.audit) { VW::details::print_result_by_ref(all.audit_writer.get(), ec.pred.scalar, -1, ec.tag, all.logger); }
   fflush(stdout);
   print_features(all, ec);
 }
 
-float finalize_prediction(VW::shared_data* sd, VW::io::logger& logger, float ret)
+float VW::details::finalize_prediction(VW::shared_data* sd, VW::io::logger& logger, float ret)
 {
   if (std::isnan(ret))
   {
@@ -440,6 +465,9 @@ float finalize_prediction(VW::shared_data* sd, VW::io::logger& logger, float ret
   return ret;
 }
 
+
+namespace
+{
 class trunc_data
 {
 public:
@@ -449,26 +477,19 @@ class trunc_data
 
 inline void vec_add_trunc(trunc_data& p, const float fx, float& fw)
 {
-  p.prediction += trunc_weight(fw, p.gravity) * fx;
+  p.prediction += VW::trunc_weight(fw, p.gravity) * fx;
 }
 
 inline float trunc_predict(VW::workspace& all, VW::example& ec, double gravity, size_t& num_interacted_features)
 {
   const auto& simple_red_features = ec.ex_reduction_features.template get<VW::simple_label_reduction_features>();
   trunc_data temp = {simple_red_features.initial, static_cast<float>(gravity)};
-  foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp, num_interacted_features);
+  VW::foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp, num_interacted_features);
   return temp.prediction;
 }
 
-inline void vec_add_print(float& p, const float fx, float& fw)
-{
-  // TODO: partial line logging. This function isn't actually called from anywhere though?
-  p += fw * fx;
-  std::cerr << " + " << fw << "*" << fx;
-}
-
 template <bool l1, bool audit>
-void predict(gd& g, base_learner&, VW::example& ec)
+void predict(VW::reductions::gd& g, base_learner&, VW::example& ec)
 {
   VW_DBG(ec) << "gd.predict(): ex#=" << ec.example_counter << ", offset=" << ec.ft_offset << std::endl;
 
@@ -479,26 +500,26 @@ void predict(gd& g, base_learner&, VW::example& ec)
 
   ec.num_features_from_interactions = num_interacted_features;
   ec.partial_prediction *= static_cast<float>(all.sd->contraction);
-  ec.pred.scalar = finalize_prediction(all.sd, all.logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(all.sd, all.logger, ec.partial_prediction);
 
   VW_DBG(ec) << "gd: predict() " << VW::debug::scalar_pred_to_string(ec) << VW::debug::features_to_string(ec)
              << std::endl;
 
-  if (audit) { print_audit_features(all, ec); }
+  if (audit) { VW::details::print_audit_features(all, ec); }
 }
 
 template <class T>
-inline void vec_add_trunc_multipredict(multipredict_info<T>& mp, const float fx, uint64_t fi)
+inline void vec_add_trunc_multipredict(VW::details::multipredict_info<T>& mp, const float fx, uint64_t fi)
 {
   size_t index = fi;
   for (size_t c = 0; c < mp.count; c++, index += mp.step)
   {
-    mp.pred[c].scalar += fx * trunc_weight(mp.weights[index], mp.gravity);
+    mp.pred[c].scalar += fx * VW::trunc_weight(mp.weights[index], mp.gravity);
   }
 }
 
 template <bool l1, bool audit>
-void multipredict(gd& g, base_learner&, VW::example& ec, size_t count, size_t step, VW::polyprediction* pred,
+void multipredict(VW::reductions::gd& g, base_learner&, VW::example& ec, size_t count, size_t step, VW::polyprediction* pred,
     bool finalize_predictions)
 {
   VW::workspace& all = *g.all;
@@ -511,31 +532,31 @@ void multipredict(gd& g, base_learner&, VW::example& ec, size_t count, size_t st
   size_t num_features_from_interactions = 0;
   if (g.all->weights.sparse)
   {
-    multipredict_info<VW::sparse_parameters> mp = {
+    VW::details::multipredict_info<VW::sparse_parameters> mp = {
         count, step, pred, g.all->weights.sparse_weights, static_cast<float>(all.sd->gravity)};
     if (l1)
     {
-      foreach_feature<multipredict_info<VW::sparse_parameters>, uint64_t, vec_add_trunc_multipredict>(
+      VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t, vec_add_trunc_multipredict>(
           all, ec, mp, num_features_from_interactions);
     }
     else
     {
-      foreach_feature<multipredict_info<VW::sparse_parameters>, uint64_t, vec_add_multipredict>(
+      VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t, VW::details::vec_add_multipredict>(
           all, ec, mp, num_features_from_interactions);
     }
   }
   else
   {
-    multipredict_info<VW::dense_parameters> mp = {
+    VW::details::multipredict_info<VW::dense_parameters> mp = {
         count, step, pred, g.all->weights.dense_weights, static_cast<float>(all.sd->gravity)};
     if (l1)
     {
-      foreach_feature<multipredict_info<VW::dense_parameters>, uint64_t, vec_add_trunc_multipredict>(
+      VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t, vec_add_trunc_multipredict>(
           all, ec, mp, num_features_from_interactions);
     }
     else
     {
-      foreach_feature<multipredict_info<VW::dense_parameters>, uint64_t, vec_add_multipredict>(
+      VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t, VW::details::vec_add_multipredict>(
           all, ec, mp, num_features_from_interactions);
     }
   }
@@ -547,14 +568,14 @@ void multipredict(gd& g, base_learner&, VW::example& ec, size_t count, size_t st
   }
   if (finalize_predictions)
   {
-    for (size_t c = 0; c < count; c++) { pred[c].scalar = finalize_prediction(all.sd, all.logger, pred[c].scalar); }
+    for (size_t c = 0; c < count; c++) { pred[c].scalar = VW::details::finalize_prediction(all.sd, all.logger, pred[c].scalar); }
   }
   if (audit)
   {
     for (size_t c = 0; c < count; c++)
     {
       ec.pred.scalar = pred[c].scalar;
-      print_audit_features(all, ec);
+      VW::details::print_audit_features(all, ec);
       ec.ft_offset += static_cast<uint64_t>(step);
     }
     ec.ft_offset -= static_cast<uint64_t>(step * count);
@@ -662,10 +683,9 @@ inline void pred_per_update_feature(norm_data& nd, float x, float& fw)
   }
 }
 
-bool global_print_features = false;
 template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare,
     bool stateless>
-float get_pred_per_update(gd& g, VW::example& ec)
+float get_pred_per_update(VW::reductions::gd& g, VW::example& ec)
 {
   // We must traverse the features in _precisely_ the same order as during training.
   auto& ld = ec.l.simple;
@@ -677,7 +697,7 @@ float get_pred_per_update(gd& g, VW::example& ec)
   if (grad_squared == 0 && !stateless) { return 1.; }
 
   norm_data nd = {grad_squared, 0., 0., {g.neg_power_t, g.neg_norm_power}, {0}, &g.all->logger};
-  foreach_feature<norm_data,
+  VW::foreach_feature<norm_data,
       pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare, stateless>>(all, ec, nd);
   if VW_STD17_CONSTEXPR (normalized != 0)
   {
@@ -702,7 +722,7 @@ float get_pred_per_update(gd& g, VW::example& ec)
 
 template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare,
     bool stateless>
-float sensitivity(gd& g, VW::example& ec)
+float sensitivity(VW::reductions::gd& g, VW::example& ec)
 {
   if VW_STD17_CONSTEXPR (adaptive || normalized)
   {
@@ -717,7 +737,7 @@ float sensitivity(gd& g, VW::example& ec)
 VW_WARNING_STATE_POP
 
 template <size_t adaptive>
-float get_scale(gd& g, VW::example& /* ec */, float weight)
+float get_scale(VW::reductions::gd& g, VW::example& /* ec */, float weight)
 {
   float update_scale = g.all->eta * weight;
   if (!adaptive)
@@ -730,7 +750,7 @@ float get_scale(gd& g, VW::example& /* ec */, float weight)
 }
 
 template <bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive, size_t normalized, size_t spare>
-float sensitivity(gd& g, base_learner& /* base */, VW::example& ec)
+float sensitivity(VW::reductions::gd& g, base_learner& /* base */, VW::example& ec)
 {
   return get_scale<adaptive>(g, ec, 1.) *
       sensitivity<sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare, true>(g, ec);
@@ -738,7 +758,7 @@ float sensitivity(gd& g, base_learner& /* base */, VW::example& ec)
 
 template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
     size_t normalized, size_t spare>
-float compute_update(gd& g, VW::example& ec)
+float compute_update(VW::reductions::gd& g, VW::example& ec)
 {
   // invariant: not a test label, importance weight > 0
   const auto& ld = ec.l.simple;
@@ -778,7 +798,7 @@ float compute_update(gd& g, VW::example& ec)
 
 template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
     size_t normalized, size_t spare>
-void update(gd& g, base_learner&, VW::example& ec)
+void update(VW::reductions::gd& g, base_learner&, VW::example& ec)
 {
   // invariant: not a test label, importance weight > 0
   float update;
@@ -792,13 +812,13 @@ void update(gd& g, base_learner&, VW::example& ec)
   {  // updating weights now to avoid numerical instability
     sync_weights(*g.all);
   }
-}  // namespace GD
+}
 
 // NO_SANITIZE_UNDEFINED needed in learn functions because
 // base_learner& base might be a reference created from nullptr
 template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, bool adax, size_t adaptive,
     size_t normalized, size_t spare>
-void NO_SANITIZE_UNDEFINED learn(gd& g, base_learner& base, VW::example& ec)
+void NO_SANITIZE_UNDEFINED learn(VW::reductions::gd& g, base_learner& base, VW::example& ec)
 {
   // invariant: not a test label, importance weight > 0
   assert(ec.l.simple.label != FLT_MAX);
@@ -807,33 +827,6 @@ void NO_SANITIZE_UNDEFINED learn(gd& g, base_learner& base, VW::example& ec)
   update<sparse_l2, invariant, sqrt_rate, feature_mask_off, adax, adaptive, normalized, spare>(g, base, ec);
 }
 
-void sync_weights(VW::workspace& all)
-{
-  // todo, fix length dependence
-  if (all.sd->gravity == 0. && all.sd->contraction == 1.)
-  {  // to avoid unnecessary weight synchronization
-    return;
-  }
-
-  if (all.weights.sparse)
-  {
-    for (VW::weight& w : all.weights.sparse_weights)
-    {
-      w = trunc_weight(w, static_cast<float>(all.sd->gravity)) * static_cast<float>(all.sd->contraction);
-    }
-  }
-  else
-  {
-    for (VW::weight& w : all.weights.dense_weights)
-    {
-      w = trunc_weight(w, static_cast<float>(all.sd->gravity)) * static_cast<float>(all.sd->contraction);
-    }
-  }
-
-  all.sd->gravity = 0.;
-  all.sd->contraction = 1.;
-}
-
 size_t write_index(VW::io_buf& model_file, std::stringstream& msg, bool text, uint32_t num_bits, uint64_t i)
 {
   size_t brw;
@@ -935,15 +928,18 @@ void save_load_regressor(VW::workspace& all, VW::io_buf& model_file, bool read,
     }
   }
 }
+}  // namespace
 
-void save_load_regressor(VW::workspace& all, VW::io_buf& model_file, bool read, bool text)
+void VW::details::save_load_regressor_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text)
 {
-  if (all.weights.sparse) { save_load_regressor(all, model_file, read, text, all.weights.sparse_weights); }
-  else { save_load_regressor(all, model_file, read, text, all.weights.dense_weights); }
+  if (all.weights.sparse) { ::save_load_regressor(all, model_file, read, text, all.weights.sparse_weights); }
+  else { ::save_load_regressor(all, model_file, read, text, all.weights.dense_weights); }
 }
 
+namespace
+{
 template <class T>
-void save_load_online_state_weights(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, gd* g,
+void save_load_online_state_weights(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, VW::reductions::gd* g,
     std::stringstream& msg, uint32_t ftrl_size, T& weights)
 {
   uint64_t length = static_cast<uint64_t>(1) << all.num_bits;
@@ -1072,9 +1068,10 @@ void save_load_online_state_weights(VW::workspace& all, VW::io_buf& model_file,
     }
   }
 }
+}  // namespace
 
-void save_load_online_state(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, double& total_weight,
-    double& normalized_sum_norm_x, gd* g, uint32_t ftrl_size)
+void VW::details::save_load_online_state_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, double& total_weight,
+    double& normalized_sum_norm_x, VW::reductions::gd* g, uint32_t ftrl_size)
 {
   std::stringstream msg;
 
@@ -1219,7 +1216,8 @@ void save_load_online_state(VW::workspace& all, VW::io_buf& model_file, bool rea
   else { save_load_online_state_weights(all, model_file, read, text, g, msg, ftrl_size, all.weights.dense_weights); }
 }
 
-void save_load(gd& g, VW::io_buf& model_file, bool read, bool text)
+namespace {
+void save_load(VW::reductions::gd& g, VW::io_buf& model_file, bool read, bool text)
 {
   VW::workspace& all = *g.all;
   if (read)
@@ -1263,13 +1261,13 @@ void save_load(gd& g, VW::io_buf& model_file, bool read, bool text)
             "save_resume functionality is known to have inaccuracy in model files version less than '{}'",
             VW::version_definitions::VERSION_SAVE_RESUME_FIX.to_string());
       }
-      save_load_online_state(all, model_file, read, text, g.per_model_states[0].total_weight,
+      VW::details::save_load_online_state_gd(all, model_file, read, text, g.per_model_states[0].total_weight,
           g.per_model_states[0].normalized_sum_norm_x, &g);
     }
     else
     {
       if (!all.weights.not_null()) { THROW("Model weights not initialized."); }
-      save_load_regressor(all, model_file, read, text);
+      VW::details::save_load_regressor_gd(all, model_file, read, text);
     }
   }
   if (!all.training)
@@ -1281,7 +1279,7 @@ void save_load(gd& g, VW::io_buf& model_file, bool read, bool text)
 
 template <bool sparse_l2, bool invariant, bool sqrt_rate, bool feature_mask_off, uint64_t adaptive, uint64_t normalized,
     uint64_t spare, uint64_t next>
-uint64_t set_learn(VW::workspace& all, gd& g)
+uint64_t set_learn(VW::workspace& all, VW::reductions::gd& g)
 {
   all.normalized_idx = normalized;
   if (g.adax)
@@ -1302,7 +1300,7 @@ uint64_t set_learn(VW::workspace& all, gd& g)
 
 template <bool sparse_l2, bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare,
     uint64_t next>
-uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
+uint64_t set_learn(VW::workspace& all, bool feature_mask_off, VW::reductions::gd& g)
 {
   all.normalized_idx = normalized;
   if (feature_mask_off)
@@ -1313,7 +1311,7 @@ uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
 }
 
 template <bool invariant, bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
-uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
+uint64_t set_learn(VW::workspace& all, bool feature_mask_off, VW::reductions::gd& g)
 {
   if (g.sparse_l2 > 0.f)
   {
@@ -1323,7 +1321,7 @@ uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
 }
 
 template <bool sqrt_rate, uint64_t adaptive, uint64_t normalized, uint64_t spare, uint64_t next>
-uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
+uint64_t set_learn(VW::workspace& all, bool feature_mask_off, VW::reductions::gd& g)
 {
   if (all.invariant_updates)
   {
@@ -1333,7 +1331,7 @@ uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
 }
 
 template <bool sqrt_rate, uint64_t adaptive, uint64_t spare>
-uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
+uint64_t set_learn(VW::workspace& all, bool feature_mask_off, VW::reductions::gd& g)
 {
   // select the appropriate learn function based on adaptive, normalization, and feature mask
   if (all.weights.normalized)
@@ -1344,7 +1342,7 @@ uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
 }
 
 template <bool sqrt_rate>
-uint64_t set_learn(VW::workspace& all, bool feature_mask_off, gd& g)
+uint64_t set_learn(VW::workspace& all, bool feature_mask_off, VW::reductions::gd& g)
 {
   if (all.weights.adaptive) { return set_learn<sqrt_rate, 1, 2>(all, feature_mask_off, g); }
   else { return set_learn<sqrt_rate, 0, 0>(all, feature_mask_off, g); }
@@ -1356,14 +1354,14 @@ uint64_t ceil_log_2(uint64_t v)
   else { return 1 + ceil_log_2(v >> 1); }
 }
 
-}  // namespace GD
+}
 
 base_learner* VW::reductions::gd_setup(VW::setup_base_i& stack_builder)
 {
   options_i& options = *stack_builder.get_options();
   VW::workspace& all = *stack_builder.get_all_pointer();
 
-  auto g = VW::make_unique<GD::gd>();
+  auto g = VW::make_unique<VW::reductions::gd>();
 
   bool sgd = false;
   bool adaptive = false;
@@ -1399,7 +1397,7 @@ base_learner* VW::reductions::gd_setup(VW::setup_base_i& stack_builder)
   if (options.was_supplied("l2_state")) { all.sd->contraction = local_contraction; }
 
   g->all = &all;
-  auto single_model_state = GD::per_model_state();
+  auto single_model_state = details::per_model_state();
   single_model_state.normalized_sum_norm_x = 0;
   single_model_state.total_weight = 0.;
   g->per_model_states.emplace_back(single_model_state);
@@ -1473,34 +1471,34 @@ base_learner* VW::reductions::gd_setup(VW::setup_base_i& stack_builder)
   {
     if (all.audit || all.hash_inv)
     {
-      g->predict = GD::predict<true, true>;
-      g->multipredict = GD::multipredict<true, true>;
+      g->predict = ::predict<true, true>;
+      g->multipredict = ::multipredict<true, true>;
     }
     else
     {
-      g->predict = GD::predict<true, false>;
-      g->multipredict = GD::multipredict<true, false>;
+      g->predict = ::predict<true, false>;
+      g->multipredict = ::multipredict<true, false>;
     }
   }
   else if (all.audit || all.hash_inv)
   {
-    g->predict = GD::predict<false, true>;
-    g->multipredict = GD::multipredict<false, true>;
+    g->predict = ::predict<false, true>;
+    g->multipredict = ::multipredict<false, true>;
   }
   else
   {
-    g->predict = GD::predict<false, false>;
-    g->multipredict = GD::multipredict<false, false>;
+    g->predict = ::predict<false, false>;
+    g->multipredict = ::multipredict<false, false>;
   }
 
   uint64_t stride;
-  if (all.power_t == 0.5) { stride = GD::set_learn<true>(all, feature_mask_off, *g.get()); }
-  else { stride = GD::set_learn<false>(all, feature_mask_off, *g.get()); }
+  if (all.power_t == 0.5) { stride = ::set_learn<true>(all, feature_mask_off, *g.get()); }
+  else { stride = ::set_learn<false>(all, feature_mask_off, *g.get()); }
 
-  all.weights.stride_shift(static_cast<uint32_t>(GD::ceil_log_2(stride - 1)));
+  all.weights.stride_shift(static_cast<uint32_t>(::ceil_log_2(stride - 1)));
 
   auto* bare = g.get();
-  learner<GD::gd, VW::example>* l =
+  learner<VW::reductions::gd, VW::example>* l =
       make_base_learner(std::move(g), g->learn, bare->predict, stack_builder.get_setupfn_name(gd_setup),
           VW::prediction_type_t::SCALAR, VW::label_type_t::SIMPLE)
           .set_learn_returns_prediction(true)
@@ -1508,14 +1506,14 @@ base_learner* VW::reductions::gd_setup(VW::setup_base_i& stack_builder)
           .set_sensitivity(bare->sensitivity)
           .set_multipredict(bare->multipredict)
           .set_update(bare->update)
-          .set_save_load(GD::save_load)
-          .set_end_pass(GD::end_pass)
-          .set_merge_with_all(GD::merge)
-          .set_add_with_all(GD::add)
-          .set_subtract_with_all(GD::subtract)
-          .set_output_example_prediction(VW::details::output_example_prediction_simple_label<GD::gd>)
-          .set_update_stats(VW::details::update_stats_simple_label<GD::gd>)
-          .set_print_update(VW::details::print_update_simple_label<GD::gd>)
+          .set_save_load(::save_load)
+          .set_end_pass(::end_pass)
+          .set_merge_with_all(::merge)
+          .set_add_with_all(::add)
+          .set_subtract_with_all(::subtract)
+          .set_output_example_prediction(VW::details::output_example_prediction_simple_label<VW::reductions::gd>)
+          .set_update_stats(VW::details::update_stats_simple_label<VW::reductions::gd>)
+          .set_print_update(VW::details::print_update_simple_label<VW::reductions::gd>)
           .build();
   return make_base(*l);
 }
diff --git a/vowpalwabbit/core/src/reductions/gd_mf.cc b/vowpalwabbit/core/src/reductions/gd_mf.cc
index 545afa402d6..63260c9e8a8 100644
--- a/vowpalwabbit/core/src/reductions/gd_mf.cc
+++ b/vowpalwabbit/core/src/reductions/gd_mf.cc
@@ -124,7 +124,7 @@ float mf_predict(gdmf& d, VW::example& ec, T& weights)
   float linear_prediction = 0.;
   // linear terms
 
-  for (VW::features& fs : ec) { GD::foreach_feature<float, GD::vec_add, T>(weights, fs, linear_prediction); }
+  for (VW::features& fs : ec) { VW::foreach_feature<float, VW::details::vec_add, T>(weights, fs, linear_prediction); }
 
   // store constant + linear prediction
   // note: constant is now automatically added
@@ -144,13 +144,13 @@ float mf_predict(gdmf& d, VW::example& ec, T& weights)
         // l^k is from index+1 to index+d.rank
         // float x_dot_l = sd_offset_add(weights, ec.atomics[(int)(*i)[0]].begin(), ec.atomics[(int)(*i)[0]].end(), k);
         pred_offset x_dot_l = {0., k};
-        GD::foreach_feature<pred_offset, offset_add, T>(weights, ec.feature_space[static_cast<int>(i[0])], x_dot_l);
+        VW::foreach_feature<pred_offset, offset_add, T>(weights, ec.feature_space[static_cast<int>(i[0])], x_dot_l);
         // x_r * r^k
         // r^k is from index+d.rank+1 to index+2*d.rank
         // float x_dot_r = sd_offset_add(weights, ec.atomics[(int)(*i)[1]].begin(), ec.atomics[(int)(*i)[1]].end(),
         // k+d.rank);
         pred_offset x_dot_r = {0., k + d.rank};
-        GD::foreach_feature<pred_offset, offset_add, T>(weights, ec.feature_space[static_cast<int>(i[1])], x_dot_r);
+        VW::foreach_feature<pred_offset, offset_add, T>(weights, ec.feature_space[static_cast<int>(i[1])], x_dot_r);
 
         prediction += x_dot_l.p * x_dot_r.p;
 
@@ -167,7 +167,7 @@ float mf_predict(gdmf& d, VW::example& ec, T& weights)
 
   all.set_minmax(all.sd, ec.l.simple.label);
 
-  ec.pred.scalar = GD::finalize_prediction(all.sd, all.logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(all.sd, all.logger, ec.partial_prediction);
 
   if (ec.l.simple.label != FLT_MAX)
   {
diff --git a/vowpalwabbit/core/src/reductions/lda_core.cc b/vowpalwabbit/core/src/reductions/lda_core.cc
index 30e20725c33..3c7b6b8d040 100644
--- a/vowpalwabbit/core/src/reductions/lda_core.cc
+++ b/vowpalwabbit/core/src/reductions/lda_core.cc
@@ -936,7 +936,7 @@ void learn_batch(lda& l)
   for (size_t d = 0; d < batch_size; d++)
   {
     float score = lda_loop(l, l.Elogtheta, &(l.v[d * l.all->lda]), l.examples[d], l.all->power_t);
-    if (l.all->audit) { GD::print_audit_features(*l.all, *l.examples[d]); }
+    if (l.all->audit) { VW::details::print_audit_features(*l.all, *l.examples[d]); }
     // If the doc is empty, give it loss of 0.
     if (l.doc_lengths[d] > 0)
     {
diff --git a/vowpalwabbit/core/src/reductions/mf.cc b/vowpalwabbit/core/src/reductions/mf.cc
index 568b8bd86d4..e06e1e77939 100644
--- a/vowpalwabbit/core/src/reductions/mf.cc
+++ b/vowpalwabbit/core/src/reductions/mf.cc
@@ -100,7 +100,7 @@ void predict(mf& data, single_learner& base, VW::example& ec)
 
   // finalize prediction
   ec.partial_prediction = prediction;
-  ec.pred.scalar = GD::finalize_prediction(data.all->sd, data.all->logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(data.all->sd, data.all->logger, ec.partial_prediction);
 }
 
 void learn(mf& data, single_learner& base, VW::example& ec)
diff --git a/vowpalwabbit/core/src/reductions/mwt.cc b/vowpalwabbit/core/src/reductions/mwt.cc
index 8eb7f73aa2b..969ad2639cd 100644
--- a/vowpalwabbit/core/src/reductions/mwt.cc
+++ b/vowpalwabbit/core/src/reductions/mwt.cc
@@ -98,7 +98,7 @@ void predict_or_learn(mwt& c, single_learner& base, VW::example& ec)
     // For each nonzero feature in observed namespaces, check it's value.
     for (unsigned char ns : ec.indices)
     {
-      if (c.namespaces[ns]) { GD::foreach_feature<mwt, value_policy>(c.all, ec.feature_space[ns], c); }
+      if (c.namespaces[ns]) { VW::foreach_feature<mwt, value_policy>(c.all, ec.feature_space[ns], c); }
     }
     for (uint64_t policy : c.policies)
     {
diff --git a/vowpalwabbit/core/src/reductions/nn.cc b/vowpalwabbit/core/src/reductions/nn.cc
index 0b05343489d..dd0ee5bbf51 100644
--- a/vowpalwabbit/core/src/reductions/nn.cc
+++ b/vowpalwabbit/core/src/reductions/nn.cc
@@ -317,7 +317,7 @@ void predict_or_learn_multi(nn& n, single_learner& base, VW::example& ec)
       else { base.predict(n.output_layer, n.k); }
     }
 
-    n.prediction = GD::finalize_prediction(n.all->sd, n.all->logger, n.output_layer.partial_prediction);
+    n.prediction = VW::details::finalize_prediction(n.all->sd, n.all->logger, n.output_layer.partial_prediction);
 
     if (should_output)
     {
@@ -355,7 +355,7 @@ void predict_or_learn_multi(nn& n, single_learner& base, VW::example& ec)
               float nu = n.outputweight.pred.scalar;
               float gradhw = 0.5f * nu * gradient * sigmahprime;
 
-              ec.l.simple.label = GD::finalize_prediction(n.all->sd, n.all->logger, hidden_units[i].scalar - gradhw);
+              ec.l.simple.label = VW::details::finalize_prediction(n.all->sd, n.all->logger, hidden_units[i].scalar - gradhw);
               ec.pred.scalar = hidden_units[i].scalar;
               if (ec.l.simple.label != hidden_units[i].scalar) { base.update(ec, i); }
             }
diff --git a/vowpalwabbit/core/src/reductions/oja_newton.cc b/vowpalwabbit/core/src/reductions/oja_newton.cc
index d6a20c4f5fa..ecbbc334e7b 100644
--- a/vowpalwabbit/core/src/reductions/oja_newton.cc
+++ b/vowpalwabbit/core/src/reductions/oja_newton.cc
@@ -348,9 +348,9 @@ void make_pred(oja_n_update_data& data, float x, float& wref)
 void predict(OjaNewton& oja_newton_ptr, base_learner&, VW::example& ec)
 {
   oja_newton_ptr.data.prediction = 0;
-  GD::foreach_feature<oja_n_update_data, make_pred>(*oja_newton_ptr.all, ec, oja_newton_ptr.data);
+  VW::foreach_feature<oja_n_update_data, make_pred>(*oja_newton_ptr.all, ec, oja_newton_ptr.data);
   ec.partial_prediction = oja_newton_ptr.data.prediction;
-  ec.pred.scalar = GD::finalize_prediction(oja_newton_ptr.all->sd, oja_newton_ptr.all->logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(oja_newton_ptr.all->sd, oja_newton_ptr.all->logger, ec.partial_prediction);
 }
 
 void update_Z_and_wbar(oja_n_update_data& data, float x, float& wref)  // NOLINT
@@ -408,7 +408,7 @@ void NO_SANITIZE_UNDEFINED learn(OjaNewton& oja_newton_ptr, base_learner& base,
 
   if (oja_newton_ptr.normalize)
   {
-    GD::foreach_feature<oja_n_update_data, update_normalization>(*oja_newton_ptr.all, ec, data);
+    VW::foreach_feature<oja_n_update_data, update_normalization>(*oja_newton_ptr.all, ec, data);
   }
 
   VW::example* next_in_batch = nullptr;
@@ -440,7 +440,7 @@ void NO_SANITIZE_UNDEFINED learn(OjaNewton& oja_newton_ptr, base_learner& base,
 
       data.norm2_x = 0;
       std::fill(data.Zx.begin(), data.Zx.end(), 0.f);
-      GD::foreach_feature<oja_n_update_data, compute_Zx_and_norm>(*oja_newton_ptr.all, ex, data);
+      VW::foreach_feature<oja_n_update_data, compute_Zx_and_norm>(*oja_newton_ptr.all, ex, data);
       oja_newton_ptr.compute_AZx();
 
       oja_newton_ptr.update_eigenvalues();
@@ -448,7 +448,7 @@ void NO_SANITIZE_UNDEFINED learn(OjaNewton& oja_newton_ptr, base_learner& base,
 
       oja_newton_ptr.update_K();
 
-      GD::foreach_feature<oja_n_update_data, update_Z_and_wbar>(*oja_newton_ptr.all, ex, data);
+      VW::foreach_feature<oja_n_update_data, update_Z_and_wbar>(*oja_newton_ptr.all, ex, data);
     }
 
     oja_newton_ptr.update_A();
@@ -459,7 +459,7 @@ void NO_SANITIZE_UNDEFINED learn(OjaNewton& oja_newton_ptr, base_learner& base,
   }
 
   std::fill(data.Zx.begin(), data.Zx.end(), 0.f);
-  GD::foreach_feature<oja_n_update_data, update_wbar_and_Zx>(*oja_newton_ptr.all, ec, data);
+  VW::foreach_feature<oja_n_update_data, update_wbar_and_Zx>(*oja_newton_ptr.all, ec, data);
   oja_newton_ptr.compute_AZx();
 
   oja_newton_ptr.update_b();
@@ -485,8 +485,8 @@ void save_load(OjaNewton& oja_newton_ptr, VW::io_buf& model_file, bool read, boo
 
     double temp = 0.;
     double temp_normalized_sum_norm_x = 0.;
-    if (resume) { GD::save_load_online_state(all, model_file, read, text, temp, temp_normalized_sum_norm_x); }
-    else { GD::save_load_regressor(all, model_file, read, text); }
+    if (resume) { VW::details::save_load_online_state_gd(all, model_file, read, text, temp, temp_normalized_sum_norm_x); }
+    else { VW::details::save_load_regressor_gd(all, model_file, read, text); }
   }
 }
 }  // namespace
diff --git a/vowpalwabbit/core/src/reductions/print.cc b/vowpalwabbit/core/src/reductions/print.cc
index 7877c8e8aa6..9f9a99a5bf2 100644
--- a/vowpalwabbit/core/src/reductions/print.cc
+++ b/vowpalwabbit/core/src/reductions/print.cc
@@ -49,7 +49,7 @@ void learn(print& p, VW::LEARNER::base_learner&, VW::example& ec)
     (*all.trace_message).write(ec.tag.begin(), ec.tag.size());
   }
   (*all.trace_message) << "| ";
-  GD::foreach_feature<VW::workspace, uint64_t, print_feature>(*(p.all), ec, *p.all);
+  VW::foreach_feature<VW::workspace, uint64_t, print_feature>(*(p.all), ec, *p.all);
   (*all.trace_message) << std::endl;
 }
 }  // namespace
diff --git a/vowpalwabbit/core/src/reductions/search/search.cc b/vowpalwabbit/core/src/reductions/search/search.cc
index b9186a7afaa..7fd2b402375 100644
--- a/vowpalwabbit/core/src/reductions/search/search.cc
+++ b/vowpalwabbit/core/src/reductions/search/search.cc
@@ -16,7 +16,7 @@
 #include "vw/core/rand_state.h"
 #include "vw/core/reductions/active.h"
 #include "vw/core/reductions/csoaa.h"
-#include "vw/core/reductions/gd.h"  // for GD::foreach_feature
+#include "vw/core/reductions/gd.h"  // for VW::foreach_feature
 #include "vw/core/reductions/search/search_dep_parser.h"
 #include "vw/core/reductions/search/search_entityrelationtask.h"
 #include "vw/core/reductions/search/search_graph.h"
@@ -651,7 +651,7 @@ void add_neighbor_features(search_private& priv, VW::multi_ex& ec_seq)
       else  // this is actually a neighbor
       {
         VW::example& other = *ec_seq[n + offset];
-        GD::foreach_feature<search_private, add_new_feature>(priv.all, other.feature_space[ns], priv, me.ft_offset);
+        VW::foreach_feature<search_private, add_new_feature>(priv.all, other.feature_space[ns], priv, me.ft_offset);
       }
     }
 
@@ -819,7 +819,7 @@ void add_example_conditioning(search_private& priv, VW::example& ec, size_t cond
       // add the quadratic features
       if (n < priv.acset.max_quad_ngram_length)
       {
-        GD::foreach_feature<search_private, uint64_t, add_new_feature>(*priv.all, ec, priv);
+        VW::foreach_feature<search_private, uint64_t, add_new_feature>(*priv.all, ec, priv);
       }
     }
   }
diff --git a/vowpalwabbit/core/src/reductions/search/search_graph.cc b/vowpalwabbit/core/src/reductions/search/search_graph.cc
index 8a48ab80f3e..50ba95fad58 100644
--- a/vowpalwabbit/core/src/reductions/search/search_graph.cc
+++ b/vowpalwabbit/core/src/reductions/search/search_graph.cc
@@ -327,11 +327,11 @@ void add_edge_features(Search::search& sch, task_data& D, size_t n, VW::multi_ex
     if (pred_total <= 1.)  // single edge
     {
       D.neighbor_predictions[0] = static_cast<float>(last_pred);
-      GD::foreach_feature<task_data, uint64_t, add_edge_features_single_fn>(sch.get_vw_pointer_unsafe(), edge, D);
+      VW::foreach_feature<task_data, uint64_t, add_edge_features_single_fn>(sch.get_vw_pointer_unsafe(), edge, D);
     }
     else
     {  // lots of edges
-      GD::foreach_feature<task_data, uint64_t, add_edge_features_group_fn>(sch.get_vw_pointer_unsafe(), edge, D);
+      VW::foreach_feature<task_data, uint64_t, add_edge_features_group_fn>(sch.get_vw_pointer_unsafe(), edge, D);
     }
   }
   ec[n]->indices.push_back(VW::details::NEIGHBOR_NAMESPACE);
diff --git a/vowpalwabbit/core/src/reductions/stagewise_poly.cc b/vowpalwabbit/core/src/reductions/stagewise_poly.cc
index be0043cde92..815bd72b501 100644
--- a/vowpalwabbit/core/src/reductions/stagewise_poly.cc
+++ b/vowpalwabbit/core/src/reductions/stagewise_poly.cc
@@ -469,7 +469,7 @@ void synthetic_create_rec(stagewise_poly& poly, float v, uint64_t findex)
 #ifdef DEBUG
       poly.max_depth = (poly.max_depth > poly.cur_depth) ? poly.max_depth : poly.cur_depth;
 #endif  // DEBUG
-      GD::foreach_feature<stagewise_poly, uint64_t, synthetic_create_rec>(*(poly.all), *(poly.original_ec), poly);
+      VW::foreach_feature<stagewise_poly, uint64_t, synthetic_create_rec>(*(poly.all), *(poly.original_ec), poly);
       --poly.cur_depth;
       poly.synth_rec_f = parent_f;
     }
@@ -490,7 +490,7 @@ void synthetic_create(stagewise_poly& poly, VW::example& ec, bool training)
    * parent, and recurse just on that feature (which arguably correctly interprets poly.cur_depth).
    * Problem with this is if there is a collision with the root...
    */
-  GD::foreach_feature<stagewise_poly, uint64_t, synthetic_create_rec>(*poly.all, *poly.original_ec, poly);
+  VW::foreach_feature<stagewise_poly, uint64_t, synthetic_create_rec>(*poly.all, *poly.original_ec, poly);
   synthetic_decycle(poly);
 
   if (training)
diff --git a/vowpalwabbit/core/src/reductions/svrg.cc b/vowpalwabbit/core/src/reductions/svrg.cc
index f8805fdcaa9..01ba72d1bfb 100644
--- a/vowpalwabbit/core/src/reductions/svrg.cc
+++ b/vowpalwabbit/core/src/reductions/svrg.cc
@@ -43,7 +43,7 @@ class svrg
   svrg(VW::workspace* all) : all(all) {}
 };
 
-// Mimic GD::inline_predict but with offset for predicting with either
+// Mimic VW::inline_predict but with offset for predicting with either
 // stable versus inner weights.
 
 template <int offset>
@@ -58,7 +58,7 @@ inline float inline_predict(VW::workspace& all, VW::example& ec)
 {
   const auto& simple_red_features = ec.ex_reduction_features.template get<VW::simple_label_reduction_features>();
   float acc = simple_red_features.initial;
-  GD::foreach_feature<float, vec_add<offset> >(all, ec, acc);
+  VW::foreach_feature<float, vec_add<offset> >(all, ec, acc);
   return acc;
 }
 
@@ -66,13 +66,13 @@ inline float inline_predict(VW::workspace& all, VW::example& ec)
 
 float predict_stable(const svrg& s, VW::example& ec)
 {
-  return GD::finalize_prediction(s.all->sd, s.all->logger, inline_predict<W_STABLE>(*s.all, ec));
+  return VW::details::finalize_prediction(s.all->sd, s.all->logger, inline_predict<W_STABLE>(*s.all, ec));
 }
 
 void predict(svrg& s, base_learner&, VW::example& ec)
 {
   ec.partial_prediction = inline_predict<W_INNER>(*s.all, ec);
-  ec.pred.scalar = GD::finalize_prediction(s.all->sd, s.all->logger, ec.partial_prediction);
+  ec.pred.scalar = VW::details::finalize_prediction(s.all->sd, s.all->logger, ec.partial_prediction);
 }
 
 float gradient_scalar(const svrg& s, const VW::example& ec, float pred)
@@ -111,13 +111,13 @@ void update_inner(const svrg& s, VW::example& ec)
   u.g_scalar_stable = gradient_scalar(s, ec, predict_stable(s, ec));
   u.eta = s.all->eta;
   u.norm = static_cast<float>(s.stable_grad_count);
-  GD::foreach_feature<update, update_inner_feature>(*s.all, ec, u);
+  VW::foreach_feature<update, update_inner_feature>(*s.all, ec, u);
 }
 
 void update_stable(const svrg& s, VW::example& ec)
 {
   float g = gradient_scalar(s, ec, predict_stable(s, ec));
-  GD::foreach_feature<float, update_stable_feature>(*s.all, ec, g);
+  VW::foreach_feature<float, update_stable_feature>(*s.all, ec, g);
 }
 
 void learn(svrg& s, base_learner& base, VW::example& ec)
@@ -169,8 +169,8 @@ void save_load(svrg& s, VW::io_buf& model_file, bool read, bool text)
 
     double temp = 0.;
     double temp_normalized_sum_norm_x = 0.;
-    if (resume) { GD::save_load_online_state(*s.all, model_file, read, text, temp, temp_normalized_sum_norm_x); }
-    else { GD::save_load_regressor(*s.all, model_file, read, text); }
+    if (resume) { VW::details::save_load_online_state_gd(*s.all, model_file, read, text, temp, temp_normalized_sum_norm_x); }
+    else { VW::details::save_load_regressor_gd(*s.all, model_file, read, text); }
   }
 }
 }  // namespace
diff --git a/vowpalwabbit/core/src/vw.cc b/vowpalwabbit/core/src/vw.cc
index c7f4ad5b6a7..61d3f044e0a 100644
--- a/vowpalwabbit/core/src/vw.cc
+++ b/vowpalwabbit/core/src/vw.cc
@@ -781,7 +781,7 @@ VW::feature* VW::get_features(VW::workspace& all, example* ec, size_t& feature_n
   features_and_source fs;
   fs.stride_shift = all.weights.stride_shift();
   fs.mask = all.weights.mask() >> all.weights.stride_shift();
-  GD::foreach_feature<::features_and_source, uint64_t, vec_store>(all, *ec, fs);
+  VW::foreach_feature<::features_and_source, uint64_t, vec_store>(all, *ec, fs);
 
   auto* features_array = new feature[fs.feature_map.size()];
   std::memcpy(features_array, fs.feature_map.data(), fs.feature_map.size() * sizeof(feature));
diff --git a/vowpalwabbit/slim/include/vw/slim/vw_slim_predict.h b/vowpalwabbit/slim/include/vw/slim/vw_slim_predict.h
index 2480be6294c..0d58099c214 100644
--- a/vowpalwabbit/slim/include/vw/slim/vw_slim_predict.h
+++ b/vowpalwabbit/slim/include/vw/slim/vw_slim_predict.h
@@ -269,13 +269,13 @@ class vw_predict
       // permutations is not supported by slim so we can just use combinations!
       _generate_interactions.update_interactions_if_new_namespace_seen<
           VW::details::generate_namespace_combinations_with_repetition, false>(_interactions, ex.indices);
-      score = GD::inline_predict<W>(*_weights, false, _ignore_linear, _generate_interactions.generated_interactions,
+      score = VW::inline_predict<W>(*_weights, false, _ignore_linear, _generate_interactions.generated_interactions,
           _unused_extent_interactions,
           /* permutations */ false, ex, _generate_interactions_object_cache);
     }
     else
     {
-      score = GD::inline_predict<W>(*_weights, false, _ignore_linear, _interactions, _unused_extent_interactions,
+      score = VW::inline_predict<W>(*_weights, false, _ignore_linear, _interactions, _unused_extent_interactions,
           /* permutations */ false, ex, _generate_interactions_object_cache);
     }
     return S_VW_PREDICT_OK;

From 9f4b54871b6281b5979f5fabe98a9932736bc384 Mon Sep 17 00:00:00 2001
From: Jack Gerrits <jackgerrits95@gmail.com>
Date: Fri, 6 Jan 2023 15:24:16 -0500
Subject: [PATCH 2/4] formatting

---
 .../core/include/vw/core/gd_predict.h         | 30 ++++++-----
 vowpalwabbit/core/src/reductions/ftrl.cc      | 13 +++--
 vowpalwabbit/core/src/reductions/gd.cc        | 52 ++++++++++---------
 vowpalwabbit/core/src/reductions/nn.cc        |  3 +-
 .../core/src/reductions/oja_newton.cc         |  8 ++-
 vowpalwabbit/core/src/reductions/svrg.cc      |  5 +-
 6 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/vowpalwabbit/core/include/vw/core/gd_predict.h b/vowpalwabbit/core/include/vw/core/gd_predict.h
index b2117a3bd71..879d622ef28 100644
--- a/vowpalwabbit/core/include/vw/core/gd_predict.h
+++ b/vowpalwabbit/core/include/vw/core/gd_predict.h
@@ -13,16 +13,16 @@
 
 namespace VW
 {
-  namespace details
-  {
-    template <class DataT>
+namespace details
+{
+template <class DataT>
 inline void dummy_func(DataT&, const VW::audit_strings*)
 {
 }  // should never be called due to call_audit overload
 
 inline void vec_add(float& p, float fx, float fw) { p += fw * fx; }
 
-  }
+}  // namespace details
 // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_index)
 template <class DataT, void (*FuncT)(DataT&, float feature_value, uint64_t feature_index), class WeightsT>
 void foreach_feature(WeightsT& /*weights*/, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
@@ -105,8 +105,6 @@ inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
       extent_interactions, permutations, ec, dat, num_interacted_features_ignored, cache);
 }
 
-
-
 template <class WeightsT>
 inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
@@ -130,7 +128,7 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
       extent_interactions, permutations, ec, initial, num_interacted_features, cache);
   return initial;
 }
-}
+}  // namespace VW
 
 // namespace GD
 // {
@@ -146,7 +144,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 // // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
 // template <class DataT, void (*FuncT)(DataT&, const float feature_value, float& weight_reference), class WeightsT>
 // VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
+// inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult
+// = 1.)
 // {
 //   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
 // }
@@ -170,7 +169,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 //     VW::details::generate_interactions_object_cache& cache)  // default value removed to eliminate
 //                                                              // ambiguity in old complers
 // {
-//   VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(interactions, extent_interactions, permutations, ec,
+//   VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(interactions, extent_interactions, permutations,
+//   ec,
 //       dat, weights, num_interacted_features, cache);
 // }
 
@@ -184,7 +184,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 //     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
 //     DataT& dat, size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache)
 // {
-//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
+//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
+//   interactions,
 //       extent_interactions, permutations, ec, dat, num_interacted_features, cache);
 // }
 
@@ -196,7 +197,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 //     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
 //     DataT& dat, VW::details::generate_interactions_object_cache& cache)
 // {
-//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
+//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
+//   interactions,
 //       extent_interactions, permutations, ec, dat, cache);
 // }
 
@@ -208,7 +210,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 //     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
 //     VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
 // {
-//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec,
+//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
+//   permutations, ec,
 //       cache, initial);
 // }
 
@@ -220,7 +223,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 //     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
 //     size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
 // {
-//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec,
+//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
+//   permutations, ec,
 //       num_interacted_features, cache, initial);
 // }
 // }  // namespace GD
\ No newline at end of file
diff --git a/vowpalwabbit/core/src/reductions/ftrl.cc b/vowpalwabbit/core/src/reductions/ftrl.cc
index 7b44a810058..e4c7b3cc40b 100644
--- a/vowpalwabbit/core/src/reductions/ftrl.cc
+++ b/vowpalwabbit/core/src/reductions/ftrl.cc
@@ -114,15 +114,15 @@ void multipredict(ftrl& b, base_learner&, VW::example& ec, size_t count, size_t
   {
     VW::details::multipredict_info<VW::sparse_parameters> mp = {
         count, step, pred, all.weights.sparse_weights, static_cast<float>(all.sd->gravity)};
-    VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t, VW::details::vec_add_multipredict>(
-        all, ec, mp, num_features_from_interactions);
+    VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t,
+        VW::details::vec_add_multipredict>(all, ec, mp, num_features_from_interactions);
   }
   else
   {
     VW::details::multipredict_info<VW::dense_parameters> mp = {
         count, step, pred, all.weights.dense_weights, static_cast<float>(all.sd->gravity)};
-    VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t, VW::details::vec_add_multipredict>(
-        all, ec, mp, num_features_from_interactions);
+    VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t,
+        VW::details::vec_add_multipredict>(all, ec, mp, num_features_from_interactions);
   }
   ec.num_features_from_interactions = num_features_from_interactions;
   if (all.sd->contraction != 1.)
@@ -131,7 +131,10 @@ void multipredict(ftrl& b, base_learner&, VW::example& ec, size_t count, size_t
   }
   if (finalize_predictions)
   {
-    for (size_t c = 0; c < count; c++) { pred[c].scalar = VW::details::finalize_prediction(all.sd, all.logger, pred[c].scalar); }
+    for (size_t c = 0; c < count; c++)
+    {
+      pred[c].scalar = VW::details::finalize_prediction(all.sd, all.logger, pred[c].scalar);
+    }
   }
   if (audit)
   {
diff --git a/vowpalwabbit/core/src/reductions/gd.cc b/vowpalwabbit/core/src/reductions/gd.cc
index ed7be22f68a..7fc131df30a 100644
--- a/vowpalwabbit/core/src/reductions/gd.cc
+++ b/vowpalwabbit/core/src/reductions/gd.cc
@@ -12,9 +12,8 @@
 #include "vw/core/prediction_type.h"
 #include "vw/core/setup_base.h"
 
-#include <cfloat>
 #include <algorithm>
-
+#include <cfloat>
 
 #if !defined(VW_NO_INLINE_SIMD)
 #  if !defined(__SSE2__) && (defined(_M_AMD64) || defined(_M_X64))
@@ -97,7 +96,6 @@ void copy_weights(WeightsT& dest, const WeightsT& source, size_t length)
   for (size_t i = 0; i < full_weights_size; i++) { dest[i] = source[i]; }
 }
 
-
 void sync_weights(VW::workspace& all)
 {
   // todo, fix length dependence
@@ -204,7 +202,8 @@ void train(VW::reductions::gd& g, VW::example& ec, float update)
 {
   if VW_STD17_CONSTEXPR (normalized != 0) { update *= g.update_multiplier; }
   VW_DBG(ec) << "gd: train() spare=" << spare << std::endl;
-  VW::foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare>>(*g.all, ec, update);
+  VW::foreach_feature<float, update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare>>(
+      *g.all, ec, update);
 }
 
 void end_pass(VW::reductions::gd& g)
@@ -275,8 +274,8 @@ void merge(const std::vector<float>& per_model_weighting, const std::vector<cons
   }
 }
 
-void add(const VW::workspace& /* ws1 */, const VW::reductions::gd& data1, const VW::workspace& ws2, VW::reductions::gd& data2,
-    VW::workspace& ws_out, VW::reductions::gd& data_out)
+void add(const VW::workspace& /* ws1 */, const VW::reductions::gd& data1, const VW::workspace& ws2,
+    VW::reductions::gd& data2, VW::workspace& ws_out, VW::reductions::gd& data_out)
 {
   const size_t length = static_cast<size_t>(1) << ws_out.num_bits;
   // When adding, output the weights from the model delta (2nd arugment to addition)
@@ -294,8 +293,8 @@ void add(const VW::workspace& /* ws1 */, const VW::reductions::gd& data1, const
   }
 }
 
-void subtract(const VW::workspace& ws1, const VW::reductions::gd& data1, const VW::workspace& /* ws2 */, VW::reductions::gd& data2,
-    VW::workspace& ws_out, VW::reductions::gd& data_out)
+void subtract(const VW::workspace& ws1, const VW::reductions::gd& data1, const VW::workspace& /* ws2 */,
+    VW::reductions::gd& data2, VW::workspace& ws_out, VW::reductions::gd& data_out)
 {
   const size_t length = static_cast<size_t>(1) << ws_out.num_bits;
   // When subtracting, output the weights from the newer model (1st arugment to subtraction)
@@ -403,7 +402,7 @@ void print_lda_features(VW::workspace& all, VW::example& ec)
   }
   std::cout << " total of " << count << " features." << std::endl;
 }
-}
+}  // namespace
 
 void VW::details::print_features(VW::workspace& all, VW::example& ec)
 {
@@ -465,7 +464,6 @@ float VW::details::finalize_prediction(VW::shared_data* sd, VW::io::logger& logg
   return ret;
 }
 
-
 namespace
 {
 class trunc_data
@@ -519,8 +517,8 @@ inline void vec_add_trunc_multipredict(VW::details::multipredict_info<T>& mp, co
 }
 
 template <bool l1, bool audit>
-void multipredict(VW::reductions::gd& g, base_learner&, VW::example& ec, size_t count, size_t step, VW::polyprediction* pred,
-    bool finalize_predictions)
+void multipredict(VW::reductions::gd& g, base_learner&, VW::example& ec, size_t count, size_t step,
+    VW::polyprediction* pred, bool finalize_predictions)
 {
   VW::workspace& all = *g.all;
   for (size_t c = 0; c < count; c++)
@@ -541,8 +539,8 @@ void multipredict(VW::reductions::gd& g, base_learner&, VW::example& ec, size_t
     }
     else
     {
-      VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t, VW::details::vec_add_multipredict>(
-          all, ec, mp, num_features_from_interactions);
+      VW::foreach_feature<VW::details::multipredict_info<VW::sparse_parameters>, uint64_t,
+          VW::details::vec_add_multipredict>(all, ec, mp, num_features_from_interactions);
     }
   }
   else
@@ -556,8 +554,8 @@ void multipredict(VW::reductions::gd& g, base_learner&, VW::example& ec, size_t
     }
     else
     {
-      VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t, VW::details::vec_add_multipredict>(
-          all, ec, mp, num_features_from_interactions);
+      VW::foreach_feature<VW::details::multipredict_info<VW::dense_parameters>, uint64_t,
+          VW::details::vec_add_multipredict>(all, ec, mp, num_features_from_interactions);
     }
   }
   ec.num_features_from_interactions = num_features_from_interactions;
@@ -568,7 +566,10 @@ void multipredict(VW::reductions::gd& g, base_learner&, VW::example& ec, size_t
   }
   if (finalize_predictions)
   {
-    for (size_t c = 0; c < count; c++) { pred[c].scalar = VW::details::finalize_prediction(all.sd, all.logger, pred[c].scalar); }
+    for (size_t c = 0; c < count; c++)
+    {
+      pred[c].scalar = VW::details::finalize_prediction(all.sd, all.logger, pred[c].scalar);
+    }
   }
   if (audit)
   {
@@ -928,7 +929,7 @@ void save_load_regressor(VW::workspace& all, VW::io_buf& model_file, bool read,
     }
   }
 }
-}
+}  // namespace
 
 void VW::details::save_load_regressor_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text)
 {
@@ -939,8 +940,8 @@ void VW::details::save_load_regressor_gd(VW::workspace& all, VW::io_buf& model_f
 namespace
 {
 template <class T>
-void save_load_online_state_weights(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, VW::reductions::gd* g,
-    std::stringstream& msg, uint32_t ftrl_size, T& weights)
+void save_load_online_state_weights(VW::workspace& all, VW::io_buf& model_file, bool read, bool text,
+    VW::reductions::gd* g, std::stringstream& msg, uint32_t ftrl_size, T& weights)
 {
   uint64_t length = static_cast<uint64_t>(1) << all.num_bits;
 
@@ -1068,10 +1069,10 @@ void save_load_online_state_weights(VW::workspace& all, VW::io_buf& model_file,
     }
   }
 }
-}
+}  // namespace
 
-void VW::details::save_load_online_state_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text, double& total_weight,
-    double& normalized_sum_norm_x, VW::reductions::gd* g, uint32_t ftrl_size)
+void VW::details::save_load_online_state_gd(VW::workspace& all, VW::io_buf& model_file, bool read, bool text,
+    double& total_weight, double& normalized_sum_norm_x, VW::reductions::gd* g, uint32_t ftrl_size)
 {
   std::stringstream msg;
 
@@ -1216,7 +1217,8 @@ void VW::details::save_load_online_state_gd(VW::workspace& all, VW::io_buf& mode
   else { save_load_online_state_weights(all, model_file, read, text, g, msg, ftrl_size, all.weights.dense_weights); }
 }
 
-namespace {
+namespace
+{
 void save_load(VW::reductions::gd& g, VW::io_buf& model_file, bool read, bool text)
 {
   VW::workspace& all = *g.all;
@@ -1354,7 +1356,7 @@ uint64_t ceil_log_2(uint64_t v)
   else { return 1 + ceil_log_2(v >> 1); }
 }
 
-}
+}  // namespace
 
 base_learner* VW::reductions::gd_setup(VW::setup_base_i& stack_builder)
 {
diff --git a/vowpalwabbit/core/src/reductions/nn.cc b/vowpalwabbit/core/src/reductions/nn.cc
index dd0ee5bbf51..b22c8bb235a 100644
--- a/vowpalwabbit/core/src/reductions/nn.cc
+++ b/vowpalwabbit/core/src/reductions/nn.cc
@@ -355,7 +355,8 @@ void predict_or_learn_multi(nn& n, single_learner& base, VW::example& ec)
               float nu = n.outputweight.pred.scalar;
               float gradhw = 0.5f * nu * gradient * sigmahprime;
 
-              ec.l.simple.label = VW::details::finalize_prediction(n.all->sd, n.all->logger, hidden_units[i].scalar - gradhw);
+              ec.l.simple.label =
+                  VW::details::finalize_prediction(n.all->sd, n.all->logger, hidden_units[i].scalar - gradhw);
               ec.pred.scalar = hidden_units[i].scalar;
               if (ec.l.simple.label != hidden_units[i].scalar) { base.update(ec, i); }
             }
diff --git a/vowpalwabbit/core/src/reductions/oja_newton.cc b/vowpalwabbit/core/src/reductions/oja_newton.cc
index ecbbc334e7b..51ef91f7f37 100644
--- a/vowpalwabbit/core/src/reductions/oja_newton.cc
+++ b/vowpalwabbit/core/src/reductions/oja_newton.cc
@@ -350,7 +350,8 @@ void predict(OjaNewton& oja_newton_ptr, base_learner&, VW::example& ec)
   oja_newton_ptr.data.prediction = 0;
   VW::foreach_feature<oja_n_update_data, make_pred>(*oja_newton_ptr.all, ec, oja_newton_ptr.data);
   ec.partial_prediction = oja_newton_ptr.data.prediction;
-  ec.pred.scalar = VW::details::finalize_prediction(oja_newton_ptr.all->sd, oja_newton_ptr.all->logger, ec.partial_prediction);
+  ec.pred.scalar =
+      VW::details::finalize_prediction(oja_newton_ptr.all->sd, oja_newton_ptr.all->logger, ec.partial_prediction);
 }
 
 void update_Z_and_wbar(oja_n_update_data& data, float x, float& wref)  // NOLINT
@@ -485,7 +486,10 @@ void save_load(OjaNewton& oja_newton_ptr, VW::io_buf& model_file, bool read, boo
 
     double temp = 0.;
     double temp_normalized_sum_norm_x = 0.;
-    if (resume) { VW::details::save_load_online_state_gd(all, model_file, read, text, temp, temp_normalized_sum_norm_x); }
+    if (resume)
+    {
+      VW::details::save_load_online_state_gd(all, model_file, read, text, temp, temp_normalized_sum_norm_x);
+    }
     else { VW::details::save_load_regressor_gd(all, model_file, read, text); }
   }
 }
diff --git a/vowpalwabbit/core/src/reductions/svrg.cc b/vowpalwabbit/core/src/reductions/svrg.cc
index 01ba72d1bfb..01ca6c61712 100644
--- a/vowpalwabbit/core/src/reductions/svrg.cc
+++ b/vowpalwabbit/core/src/reductions/svrg.cc
@@ -169,7 +169,10 @@ void save_load(svrg& s, VW::io_buf& model_file, bool read, bool text)
 
     double temp = 0.;
     double temp_normalized_sum_norm_x = 0.;
-    if (resume) { VW::details::save_load_online_state_gd(*s.all, model_file, read, text, temp, temp_normalized_sum_norm_x); }
+    if (resume)
+    {
+      VW::details::save_load_online_state_gd(*s.all, model_file, read, text, temp, temp_normalized_sum_norm_x);
+    }
     else { VW::details::save_load_regressor_gd(*s.all, model_file, read, text); }
   }
 }

From 0b8f857d7378a15a40e760d361c06e4594bf7dc2 Mon Sep 17 00:00:00 2001
From: Jack Gerrits <jackgerrits95@gmail.com>
Date: Fri, 6 Jan 2023 15:28:11 -0500
Subject: [PATCH 3/4] uncomment compat definitions

---
 .../core/include/vw/core/gd_predict.h         | 196 +++++++++---------
 .../core/include/vw/core/reductions/gd.h      | 138 ++++++------
 2 files changed, 167 insertions(+), 167 deletions(-)

diff --git a/vowpalwabbit/core/include/vw/core/gd_predict.h b/vowpalwabbit/core/include/vw/core/gd_predict.h
index 879d622ef28..93f7cfe609e 100644
--- a/vowpalwabbit/core/include/vw/core/gd_predict.h
+++ b/vowpalwabbit/core/include/vw/core/gd_predict.h
@@ -130,101 +130,101 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
 }
 }  // namespace VW
 
-// namespace GD
-// {
-
-// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_index)
-// template <class DataT, void (*FuncT)(DataT&, float feature_value, uint64_t feature_index), class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
-// {
-//   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
-// }
-
-// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
-// template <class DataT, void (*FuncT)(DataT&, const float feature_value, float& weight_reference), class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult
-// = 1.)
-// {
-//   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
-// }
-
-// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
-// template <class DataT, void (*FuncT)(DataT&, float, float), class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(
-//     const WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
-// {
-//   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
-// }
-
-// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT),
-//     class WeightsT>  // nullptr func can't be used as template param in old
-//                      // compilers
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void generate_interactions(const std::vector<std::vector<VW::namespace_index>>& interactions,
-//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
-//     DataT& dat, WeightsT& weights, size_t& num_interacted_features,
-//     VW::details::generate_interactions_object_cache& cache)  // default value removed to eliminate
-//                                                              // ambiguity in old complers
-// {
-//   VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(interactions, extent_interactions, permutations,
-//   ec,
-//       dat, weights, num_interacted_features, cache);
-// }
-
-// // iterate through all namespaces and quadratic&cubic features, callback function FuncT(some_data_R, feature_value_x,
-// // WeightOrIndexT) where WeightOrIndexT is EITHER float& feature_weight OR uint64_t feature_index
-// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
-//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
-//     const std::vector<std::vector<VW::namespace_index>>& interactions,
-//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
-//     DataT& dat, size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache)
-// {
-//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
-//   interactions,
-//       extent_interactions, permutations, ec, dat, num_interacted_features, cache);
-// }
-
-// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
-//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
-//     const std::vector<std::vector<VW::namespace_index>>& interactions,
-//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
-//     DataT& dat, VW::details::generate_interactions_object_cache& cache)
-// {
-//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
-//   interactions,
-//       extent_interactions, permutations, ec, dat, cache);
-// }
-
-// template <class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
-//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
-//     const std::vector<std::vector<VW::namespace_index>>& interactions,
-//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
-//     VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
-// {
-//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
-//   permutations, ec,
-//       cache, initial);
-// }
-
-// template <class WeightsT>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
-//     std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
-//     const std::vector<std::vector<VW::namespace_index>>& interactions,
-//     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
-//     size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
-// {
-//   return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
-//   permutations, ec,
-//       num_interacted_features, cache, initial);
-// }
-// }  // namespace GD
\ No newline at end of file
+namespace GD
+{
+
+// iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_index)
+template <class DataT, void (*FuncT)(DataT&, float feature_value, uint64_t feature_index), class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
+{
+  VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
+}
+
+// iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+template <class DataT, void (*FuncT)(DataT&, const float feature_value, float& weight_reference), class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult
+= 1.)
+{
+  VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
+}
+
+// iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+template <class DataT, void (*FuncT)(DataT&, float, float), class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(
+    const WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
+{
+  VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
+}
+
+template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT),
+    class WeightsT>  // nullptr func can't be used as template param in old
+                     // compilers
+VW_DEPRECATED("Moved to VW namespace")
+inline void generate_interactions(const std::vector<std::vector<VW::namespace_index>>& interactions,
+    const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+    DataT& dat, WeightsT& weights, size_t& num_interacted_features,
+    VW::details::generate_interactions_object_cache& cache)  // default value removed to eliminate
+                                                             // ambiguity in old complers
+{
+  VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(interactions, extent_interactions, permutations,
+  ec,
+      dat, weights, num_interacted_features, cache);
+}
+
+// iterate through all namespaces and quadratic&cubic features, callback function FuncT(some_data_R, feature_value_x,
+// WeightOrIndexT) where WeightOrIndexT is EITHER float& feature_weight OR uint64_t feature_index
+template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
+    std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+    const std::vector<std::vector<VW::namespace_index>>& interactions,
+    const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+    DataT& dat, size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache)
+{
+  VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
+  interactions,
+      extent_interactions, permutations, ec, dat, num_interacted_features, cache);
+}
+
+template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT), class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
+    std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+    const std::vector<std::vector<VW::namespace_index>>& interactions,
+    const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+    DataT& dat, VW::details::generate_interactions_object_cache& cache)
+{
+  VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
+  interactions,
+      extent_interactions, permutations, ec, dat, cache);
+}
+
+template <class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
+    std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+    const std::vector<std::vector<VW::namespace_index>>& interactions,
+    const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+    VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
+{
+  return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
+  permutations, ec,
+      cache, initial);
+}
+
+template <class WeightsT>
+VW_DEPRECATED("Moved to VW namespace")
+inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
+    std::array<bool, VW::NUM_NAMESPACES>& ignore_linear,
+    const std::vector<std::vector<VW::namespace_index>>& interactions,
+    const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
+    size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
+{
+  return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
+  permutations, ec,
+      num_interacted_features, cache, initial);
+}
+}  // namespace GD
\ No newline at end of file
diff --git a/vowpalwabbit/core/include/vw/core/reductions/gd.h b/vowpalwabbit/core/include/vw/core/reductions/gd.h
index d2eae5e2f00..61ec296937a 100644
--- a/vowpalwabbit/core/include/vw/core/reductions/gd.h
+++ b/vowpalwabbit/core/include/vw/core/reductions/gd.h
@@ -247,72 +247,72 @@ inline void generate_interactions(VW::workspace& all, VW::example_predict& ec, R
 
 }  // namespace INTERACTIONS
 
-// namespace GD
-// {
-
-// using gd = VW::reductions::gd;
-
-// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
-// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
-// {
-//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT>(all, ec, dat);
-// }
-
-// // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
-// template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
-// {
-//   VW::foreach_feature<DataT, WeightOrIndexT, FuncT>(all, ec, dat, num_interacted_features);
-// }
-
-// // iterate through all namespaces and quadratic&cubic features, callback function T(some_data_R, feature_value_x,
-// // feature_weight)
-// template <class DataT, void (*FuncT)(DataT&, float, float&)>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
-// {
-//   VW::foreach_feature<DataT, float&, FuncT>(all, ec, dat);
-// }
-
-// template <class DataT, void (*FuncT)(DataT&, float, float)>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
-// {
-//   VW::foreach_feature<DataT, float, FuncT>(all, ec, dat);
-// }
-
-// template <class DataT, void (*FuncT)(DataT&, float, float&)>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
-// {
-//   VW::foreach_feature<DataT, float&, FuncT>(all, ec, dat, num_interacted_features);
-// }
-
-// template <class DataT, void (*FuncT)(DataT&, float, const float&)>
-// VW_DEPRECATED("Moved to VW namespace")
-// inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
-// {
-//   VW::foreach_feature<DataT, const float&, FuncT>(all, ec, dat, num_interacted_features);
-// }
-
-// VW_DEPRECATED("Moved to VW namespace")
-// inline float inline_predict(VW::workspace& all, VW::example& ec)
-// {
-//   return VW::inline_predict(all, ec);
-// }
-
-// VW_DEPRECATED("Moved to VW namespace")
-// inline float inline_predict(VW::workspace& all, VW::example& ec, size_t& num_generated_features)
-// {
-//   return VW::inline_predict(all, ec, num_generated_features);
-// }
-
-// VW_DEPRECATED("Moved to VW namespace")
-// inline float trunc_weight(const float w, const float gravity)
-// {
-//   return VW::trunc_weight(w, gravity);
-// }
-// }
\ No newline at end of file
+namespace GD
+{
+
+using gd = VW::reductions::gd;
+
+// iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
+{
+  VW::foreach_feature<DataT, WeightOrIndexT, FuncT>(all, ec, dat);
+}
+
+// iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
+template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT)>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
+{
+  VW::foreach_feature<DataT, WeightOrIndexT, FuncT>(all, ec, dat, num_interacted_features);
+}
+
+// iterate through all namespaces and quadratic&cubic features, callback function T(some_data_R, feature_value_x,
+// feature_weight)
+template <class DataT, void (*FuncT)(DataT&, float, float&)>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
+{
+  VW::foreach_feature<DataT, float&, FuncT>(all, ec, dat);
+}
+
+template <class DataT, void (*FuncT)(DataT&, float, float)>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat)
+{
+  VW::foreach_feature<DataT, float, FuncT>(all, ec, dat);
+}
+
+template <class DataT, void (*FuncT)(DataT&, float, float&)>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
+{
+  VW::foreach_feature<DataT, float&, FuncT>(all, ec, dat, num_interacted_features);
+}
+
+template <class DataT, void (*FuncT)(DataT&, float, const float&)>
+VW_DEPRECATED("Moved to VW namespace")
+inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, size_t& num_interacted_features)
+{
+  VW::foreach_feature<DataT, const float&, FuncT>(all, ec, dat, num_interacted_features);
+}
+
+VW_DEPRECATED("Moved to VW namespace")
+inline float inline_predict(VW::workspace& all, VW::example& ec)
+{
+  return VW::inline_predict(all, ec);
+}
+
+VW_DEPRECATED("Moved to VW namespace")
+inline float inline_predict(VW::workspace& all, VW::example& ec, size_t& num_generated_features)
+{
+  return VW::inline_predict(all, ec, num_generated_features);
+}
+
+VW_DEPRECATED("Moved to VW namespace")
+inline float trunc_weight(const float w, const float gravity)
+{
+  return VW::trunc_weight(w, gravity);
+}
+}
\ No newline at end of file

From e4c97f1c7733bd77db192c188eff6e6d4a9ec603 Mon Sep 17 00:00:00 2001
From: Jack Gerrits <jackgerrits95@gmail.com>
Date: Fri, 6 Jan 2023 16:37:15 -0500
Subject: [PATCH 4/4] format

---
 .../core/include/vw/core/gd_predict.h         | 30 ++++++++-----------
 .../core/include/vw/core/reductions/gd.h      | 12 ++------
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/vowpalwabbit/core/include/vw/core/gd_predict.h b/vowpalwabbit/core/include/vw/core/gd_predict.h
index 93f7cfe609e..868756250f8 100644
--- a/vowpalwabbit/core/include/vw/core/gd_predict.h
+++ b/vowpalwabbit/core/include/vw/core/gd_predict.h
@@ -144,8 +144,7 @@ void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint
 // iterate through one namespace (or its part), callback function FuncT(some_data_R, feature_value_x, feature_weight)
 template <class DataT, void (*FuncT)(DataT&, const float feature_value, float& weight_reference), class WeightsT>
 VW_DEPRECATED("Moved to VW namespace")
-inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult
-= 1.)
+inline void foreach_feature(WeightsT& weights, const VW::features& fs, DataT& dat, uint64_t offset = 0, float mult = 1.)
 {
   VW::foreach_feature<DataT, FuncT, WeightsT>(weights, fs, dat, offset, mult);
 }
@@ -162,16 +161,15 @@ inline void foreach_feature(
 template <class DataT, class WeightOrIndexT, void (*FuncT)(DataT&, float, WeightOrIndexT),
     class WeightsT>  // nullptr func can't be used as template param in old
                      // compilers
-VW_DEPRECATED("Moved to VW namespace")
-inline void generate_interactions(const std::vector<std::vector<VW::namespace_index>>& interactions,
+VW_DEPRECATED("Moved to VW namespace") inline void generate_interactions(
+    const std::vector<std::vector<VW::namespace_index>>& interactions,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     DataT& dat, WeightsT& weights, size_t& num_interacted_features,
     VW::details::generate_interactions_object_cache& cache)  // default value removed to eliminate
                                                              // ambiguity in old complers
 {
-  VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(interactions, extent_interactions, permutations,
-  ec,
-      dat, weights, num_interacted_features, cache);
+  VW::generate_interactions<DataT, WeightOrIndexT, FuncT, WeightsT>(
+      interactions, extent_interactions, permutations, ec, dat, weights, num_interacted_features, cache);
 }
 
 // iterate through all namespaces and quadratic&cubic features, callback function FuncT(some_data_R, feature_value_x,
@@ -184,8 +182,7 @@ inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     DataT& dat, size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache)
 {
-  VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
-  interactions,
+  VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear, interactions,
       extent_interactions, permutations, ec, dat, num_interacted_features, cache);
 }
 
@@ -197,9 +194,8 @@ inline void foreach_feature(WeightsT& weights, bool ignore_some_linear,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     DataT& dat, VW::details::generate_interactions_object_cache& cache)
 {
-  VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(weights, ignore_some_linear, ignore_linear,
-  interactions,
-      extent_interactions, permutations, ec, dat, cache);
+  VW::foreach_feature<DataT, WeightOrIndexT, FuncT, WeightsT>(
+      weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec, dat, cache);
 }
 
 template <class WeightsT>
@@ -210,9 +206,8 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
 {
-  return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
-  permutations, ec,
-      cache, initial);
+  return VW::inline_predict(
+      weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations, ec, cache, initial);
 }
 
 template <class WeightsT>
@@ -223,8 +218,7 @@ inline float inline_predict(WeightsT& weights, bool ignore_some_linear,
     const std::vector<std::vector<VW::extent_term>>& extent_interactions, bool permutations, VW::example_predict& ec,
     size_t& num_interacted_features, VW::details::generate_interactions_object_cache& cache, float initial = 0.f)
 {
-  return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions,
-  permutations, ec,
-      num_interacted_features, cache, initial);
+  return VW::inline_predict(weights, ignore_some_linear, ignore_linear, interactions, extent_interactions, permutations,
+      ec, num_interacted_features, cache, initial);
 }
 }  // namespace GD
\ No newline at end of file
diff --git a/vowpalwabbit/core/include/vw/core/reductions/gd.h b/vowpalwabbit/core/include/vw/core/reductions/gd.h
index 61ec296937a..4039d665096 100644
--- a/vowpalwabbit/core/include/vw/core/reductions/gd.h
+++ b/vowpalwabbit/core/include/vw/core/reductions/gd.h
@@ -299,10 +299,7 @@ inline void foreach_feature(VW::workspace& all, VW::example& ec, DataT& dat, siz
 }
 
 VW_DEPRECATED("Moved to VW namespace")
-inline float inline_predict(VW::workspace& all, VW::example& ec)
-{
-  return VW::inline_predict(all, ec);
-}
+inline float inline_predict(VW::workspace& all, VW::example& ec) { return VW::inline_predict(all, ec); }
 
 VW_DEPRECATED("Moved to VW namespace")
 inline float inline_predict(VW::workspace& all, VW::example& ec, size_t& num_generated_features)
@@ -311,8 +308,5 @@ inline float inline_predict(VW::workspace& all, VW::example& ec, size_t& num_gen
 }
 
 VW_DEPRECATED("Moved to VW namespace")
-inline float trunc_weight(const float w, const float gravity)
-{
-  return VW::trunc_weight(w, gravity);
-}
-}
\ No newline at end of file
+inline float trunc_weight(const float w, const float gravity) { return VW::trunc_weight(w, gravity); }
+}  // namespace GD
\ No newline at end of file