Clean up gradient checker #2374

Merged
merged 1 commit on Jan 25, 2024
5 changes: 3 additions & 2 deletions include/lbann/models/model.hpp
@@ -399,9 +399,10 @@ class model
void reset_epoch_statistics(execution_mode mode);

/** @brief Forward propagation step. */
void forward_prop(execution_mode mode);
void forward_prop(execution_mode mode, bool skip_callbacks = false);
/** @brief Backward propagation step. */
void backward_prop(bool compute_weight_grads_only = true);
void backward_prop(bool compute_weight_grads_only = true,
bool skip_callbacks = false);
/** Evaluate any metrics in the model */
void evaluate_metrics(execution_mode mode, uint64_t current_mini_batch_size);
/** @brief Clear each optimizer's gradient.
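For orientation, here is a minimal sketch (not part of the diff) of how a callback can use the two new `skip_callbacks` flags to drive a full pass without re-triggering model or layer callbacks. The helper name `run_silent_pass` and the namespace qualification are assumptions for illustration only.

```cpp
#include "lbann/models/model.hpp"

// Hypothetical helper, for illustration only: run one forward/backward
// pass from inside a callback while suppressing the model's callback
// hooks, using the new default arguments declared above.
void run_silent_pass(lbann::model& m, lbann::execution_mode mode)
{
  // Forward prop with callbacks skipped, so a callback calling this does
  // not recursively invoke itself or other registered callbacks.
  m.forward_prop(mode, /*skip_callbacks=*/true);

  // Backward prop for all gradients (activations and weights), again
  // with callbacks skipped.
  m.backward_prop(/*compute_weight_grads_only=*/false,
                  /*skip_callbacks=*/true);
}
```

This is exactly the pattern the updated gradient checker uses in src/callbacks/check_gradients.cpp below.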
81 changes: 15 additions & 66 deletions src/callbacks/check_gradients.cpp
@@ -58,30 +58,12 @@ namespace {
EvalType compute_objective_function(model& m)
{
const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
m.get_activation_reference_counter().clear();

// Forward prop, skipping input layers

if (m.is_subgraph_parallelism_enabled()) {
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) == nullptr &&
l->get_run_layer_in_subgraph()) {
l->forward_prop();
}
}
}
else // sub-graph parallelism not enabled
{
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) == nullptr) {
l->forward_prop();
}
}
}
const auto mode = c.get_execution_mode();

// Get objective function value
m.forward_prop(mode, true);
auto&& obj = m.get_objective_function();
const auto mode = c.get_execution_mode();

const auto mini_batch_size = m.get_current_mini_batch_size();
obj->start_evaluation(mode, mini_batch_size);
return obj->finish_evaluation(mode, mini_batch_size);
@@ -134,6 +116,7 @@ struct CheckWeightsFunctor : DefaultErrorReporter
// Get weights matrix and gradient
auto const& weights_matrix = dtw.get_values_sharded();
auto const& gradient = dtw.get_optimizer()->get_gradient_sharded();

// Iterate through weights matrix entries
for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
@@ -275,40 +258,24 @@ void check_gradients::do_check_gradients(model& m) const
for (auto&& met : m.get_metrics()) {
met->reset_statistics(mode);
}
for (auto&& w : m.get_weights()) {
auto&& opt = w->get_optimizer();
if (opt != nullptr) {
opt->clear_gradient();
}
}
m.get_activation_reference_counter().clear();
m.clear_gradients();

// Load data in input layers
data_coordinator& dc = get_trainer().get_data_coordinator();
dc.fetch_active_batch_synchronous(mode);
El::Int current_mini_batch_size = dc.get_current_mini_batch_size(mode);
m.set_current_mini_batch_size(current_mini_batch_size);

// checking subgraph parallelism
if (m.is_subgraph_parallelism_enabled()) {
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) != nullptr &&
l->get_run_layer_in_subgraph()) {
l->forward_prop();
}
}
}
else {
for (auto&& l : m.get_layers()) {
if (dynamic_cast<input_layer<DataType>*>(l) != nullptr) {
l->forward_prop();
}
}
}

// Compute objective function
const EvalType objective = compute_objective_function(m);

// Compute gradients
m.get_objective_function()->differentiate();
m.get_objective_function()->compute_weight_regularization();

// Compute analytical gradients through model
m.backward_prop(false, /*skip_callbacks=*/true);

// Choose finite difference step
// Note: Consider a central difference scheme:
// f'(x) ~ ( - f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
@@ -323,31 +290,14 @@
// epsilon based on the minimum step size of the float data type
const EvalType epsilon =
std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
const EvalType step_size =
const EvalType step_size = std::max(
std::numeric_limits<EvalType>::epsilon(),
(m_step_size > EvalType{0} ? m_step_size
: std::fabs(objective) * El::Sqrt(epsilon));
: std::fabs(objective) * El::Sqrt(epsilon)));
EvalType expected_error =
std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
0.9);

// Compute gradients
m.get_objective_function()->differentiate();
m.get_objective_function()->compute_weight_regularization();

// checking subgraph parallelism
if (m.is_subgraph_parallelism_enabled()) {
for (El::Int i = layers.size() - 1; i >= 0; --i) {
if (layers[i]->get_run_layer_in_subgraph()) {
layers[i]->back_prop();
}
}
}
else {
for (El::Int i = layers.size() - 1; i >= 0; --i) {
layers[i]->back_prop();
}
}

// Print objective function value
if (comm.am_world_master()) {
std::cout << std::string(64, '-') << "\n"
@@ -383,7 +333,6 @@ void check_gradients::do_check_gradients(model& m) const
}

// Clean up
// TODO: Why
auto&& dataset = dc.get_dataset(mode);
dataset.set_initial_position();
m.get_objective_function()->reset_statistics(mode);
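As a self-contained illustration (not code from this PR), the per-entry check performed by `CheckWeightsFunctor` boils down to the four-point central difference named in the comment above, compared against the `expected_error` heuristic computed in `do_check_gradients`. In the sketch below, `objective_at` is a hypothetical stand-in for perturbing one weight entry and re-running `compute_objective_function`; the explicit `m_step_size` override is omitted, and `float` mirrors the usual `DataType`.

```cpp
#include <cmath>
#include <functional>
#include <limits>

// Standalone sketch of the gradient checker's per-entry comparison, using
// the same step-size and expected-error heuristics as the code above.
// `objective_at(shift)` returns the objective after adding `shift` to one
// weight entry and re-running forward prop.
bool entry_gradient_matches(double analytical_gradient,
                            double objective,
                            const std::function<double(double)>& objective_at)
{
  const double epsilon =
    std::pow(std::numeric_limits<float>::epsilon(), 0.9);
  const double step_size =
    std::max(std::numeric_limits<double>::epsilon(),
             std::fabs(objective) * std::sqrt(epsilon));
  const double expected_error = std::pow(
    epsilon * objective / step_size + std::pow(step_size, 4) / 18, 0.9);

  // f'(x) ~ ( -f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
  const double h = step_size;
  const double numerical_gradient =
    (-objective_at(2 * h) + 8 * objective_at(h)
     - 8 * objective_at(-h) + objective_at(-2 * h)) / (12 * h);

  return std::fabs(numerical_gradient - analytical_gradient) <= expected_error;
}
```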
40 changes: 26 additions & 14 deletions src/models/model.cpp
@@ -1573,10 +1573,11 @@ void model::clear_gradients()
}
}

void model::forward_prop(execution_mode mode)
void model::forward_prop(execution_mode mode, bool skip_callbacks)
{
LBANN_CALIPER_MARK_FUNCTION;
do_model_forward_prop_begin_cbs(mode);
if (!skip_callbacks)
do_model_forward_prop_begin_cbs(mode);

// Clear activations in reference counter
m_activation_refcnt.clear();
@@ -1586,25 +1587,30 @@ void model::forward_prop(execution_mode mode)

if (this->is_subgraph_parallelism_enabled()) {
if (l.get_run_layer_in_subgraph()) {
do_layer_forward_prop_begin_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_begin_cbs(mode, &l);
l.forward_prop();
do_layer_forward_prop_end_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_end_cbs(mode, &l);
}
else {
// To Do: Fix last batch problem in sub-graph parallelism
// experimental code to fix last batch problem in subgraph parallelism
}
}
else {
do_layer_forward_prop_begin_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_begin_cbs(mode, &l);
l.forward_prop();
do_layer_forward_prop_end_cbs(mode, &l);
if (!skip_callbacks)
do_layer_forward_prop_end_cbs(mode, &l);
}
}
do_model_forward_prop_end_cbs(mode);
if (!skip_callbacks)
do_model_forward_prop_end_cbs(mode);
}

void model::backward_prop(bool compute_weight_grads_only)
void model::backward_prop(bool compute_weight_grads_only, bool skip_callbacks)
{
LBANN_CALIPER_MARK_FUNCTION;

@@ -1614,7 +1620,8 @@ void model::backward_prop(bool compute_weight_grads_only)
bool const envvar_disable_layers =
!arg_parser.get<bool>(LBANN_OPTION_NO_BACKPROP_DISABLE);

do_model_backward_prop_begin_cbs();
if (!skip_callbacks)
do_model_backward_prop_begin_cbs();

for (El::Int i = get_num_layers() - 1; i >= 0; --i) {

@@ -1644,21 +1651,25 @@

if (this->is_subgraph_parallelism_enabled()) {
if (l.get_run_layer_in_subgraph()) {
do_layer_backward_prop_begin_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_begin_cbs(&l);
if (enable_layer)
l.back_prop();
do_layer_backward_prop_end_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_end_cbs(&l);
}
else {
// To Do: Fix last batch problem in sub-graph parallelism
// experimental code to fix last batch problem in subgraph parallelism
}
}
else {
do_layer_backward_prop_begin_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_begin_cbs(&l);
if (enable_layer)
l.back_prop();
do_layer_backward_prop_end_cbs(&l);
if (!skip_callbacks)
do_layer_backward_prop_end_cbs(&l);
}

// Terminate early if all gradients have been computed
@@ -1683,7 +1694,8 @@
}
}

do_model_backward_prop_end_cbs();
if (!skip_callbacks)
do_model_backward_prop_end_cbs();
}

void model::update_weights()
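One possible follow-up, sketched here purely as an illustration and not something this PR adds: the repeated `if (!skip_callbacks)` guards in forward_prop and backward_prop could be funneled through a small helper so the check lives in one place.

```cpp
#include <utility>

// Illustrative only: centralize the skip_callbacks check. Each callback
// hook is passed as a callable and invoked only when callbacks are enabled.
template <typename CallbackHook>
void maybe_invoke(bool skip_callbacks, CallbackHook&& hook)
{
  if (!skip_callbacks) {
    std::forward<CallbackHook>(hook)();
  }
}

// Hypothetical usage inside model::forward_prop:
//   maybe_invoke(skip_callbacks,
//                [&] { do_layer_forward_prop_begin_cbs(mode, &l); });
//   l.forward_prop();
//   maybe_invoke(skip_callbacks,
//                [&] { do_layer_forward_prop_end_cbs(mode, &l); });
```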