Commit f9ea28b

Use model methods for gradient checking
1 parent cebacc2 commit f9ea28b

3 files changed: +44 -82 lines changed

Diff for: include/lbann/models/model.hpp (+3 -2)

@@ -399,9 +399,10 @@ class model
   void reset_epoch_statistics(execution_mode mode);

   /** @brief Forward propagation step. */
-  void forward_prop(execution_mode mode);
+  void forward_prop(execution_mode mode, bool skip_callbacks = false);
   /** @brief Backward propagation step. */
-  void backward_prop(bool compute_weight_grads_only = true);
+  void backward_prop(bool compute_weight_grads_only = true,
+                     bool skip_callbacks = false);
   /** Evaluate any metrics in the model */
   void evaluate_metrics(execution_mode mode, uint64_t current_mini_batch_size);
   /** @brief Clear each optimizer's gradient.
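
For context, a minimal sketch of how a caller can now drive a "quiet" pass with the new flags; the wrapper function below is hypothetical (and assumes the lbann namespace), but the model calls mirror what the refactored check_gradients callback in this commit does:

    // Hypothetical helper (not part of this commit): run one forward/backward
    // pass without firing model or layer callbacks, as gradient checking
    // needs passes that do not disturb callback state.
    void quiet_pass(lbann::model& m, lbann::execution_mode mode)
    {
      m.clear_gradients();
      m.forward_prop(mode, /*skip_callbacks=*/true);
      m.get_objective_function()->differentiate();
      m.get_objective_function()->compute_weight_regularization();
      m.backward_prop(/*compute_weight_grads_only=*/false,
                      /*skip_callbacks=*/true);
    }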

Diff for: src/callbacks/check_gradients.cpp (+15 -66)

@@ -58,30 +58,12 @@ namespace {
 EvalType compute_objective_function(model& m)
 {
   const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
-  m.get_activation_reference_counter().clear();
-
-  // Forward prop, skipping input layers
-
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else // sub-graph parallelism not enabled
-  {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
+  const auto mode = c.get_execution_mode();

   // Get objective function value
+  m.forward_prop(mode, true);
   auto&& obj = m.get_objective_function();
-  const auto mode = c.get_execution_mode();
+
   const auto mini_batch_size = m.get_current_mini_batch_size();
   obj->start_evaluation(mode, mini_batch_size);
   return obj->finish_evaluation(mode, mini_batch_size);
@@ -134,6 +116,7 @@ struct CheckWeightsFunctor : DefaultErrorReporter
   // Get weights matrix and gradient
   auto const& weights_matrix = dtw.get_values_sharded();
   auto const& gradient = dtw.get_optimizer()->get_gradient_sharded();
+
   // Iterate through weights matrix entries
   for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
     for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
@@ -275,40 +258,24 @@ void check_gradients::do_check_gradients(model& m) const
   for (auto&& met : m.get_metrics()) {
     met->reset_statistics(mode);
   }
-  for (auto&& w : m.get_weights()) {
-    auto&& opt = w->get_optimizer();
-    if (opt != nullptr) {
-      opt->clear_gradient();
-    }
-  }
-  m.get_activation_reference_counter().clear();
+  m.clear_gradients();

   // Load data in input layers
   data_coordinator& dc = get_trainer().get_data_coordinator();
   dc.fetch_active_batch_synchronous(mode);
   El::Int current_mini_batch_size = dc.get_current_mini_batch_size(mode);
   m.set_current_mini_batch_size(current_mini_batch_size);

-  // checking subgrpah parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
-
   // Compute objective function
   const EvalType objective = compute_objective_function(m);

+  // Compute gradients
+  m.get_objective_function()->differentiate();
+  m.get_objective_function()->compute_weight_regularization();
+
+  // Compute analytical gradients through model
+  m.backward_prop(false, /*skip_callbacks=*/true);
+
   // Choose finite difference step
   // Note: Consider a central difference scheme:
   //   f'(x) ~ ( - f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
@@ -323,31 +290,14 @@ void check_gradients::do_check_gradients(model& m) const
   // epsilon based on the minimum step size of the float data type
   const EvalType epsilon =
     std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
-  const EvalType step_size =
+  const EvalType step_size = std::max(
+    std::numeric_limits<EvalType>::epsilon(),
     (m_step_size > EvalType{0} ? m_step_size
-                               : std::fabs(objective) * El::Sqrt(epsilon));
+                               : std::fabs(objective) * El::Sqrt(epsilon)));
   EvalType expected_error =
     std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
              0.9);

-  // Compute gradients
-  m.get_objective_function()->differentiate();
-  m.get_objective_function()->compute_weight_regularization();
-
-  // checking subgraph parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      if (layers[i]->get_run_layer_in_subgraph()) {
-        layers[i]->back_prop();
-      }
-    }
-  }
-  else {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      layers[i]->back_prop();
-    }
-  }
-
   // Print objective function value
   if (comm.am_world_master()) {
     std::cout << std::string(64, '-') << "\n"
@@ -383,7 +333,6 @@ void check_gradients::do_check_gradients(model& m) const
   }

   // Clean up
-  // TODO: Why
   auto&& dataset = dc.get_dataset(mode);
   dataset.set_initial_position();
   m.get_objective_function()->reset_statistics(mode);
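
The check above compares each analytical gradient entry against the central-difference scheme quoted in the hunk comments, with the finite-difference step clamped to at least machine epsilon. A self-contained, LBANN-independent sketch of that stencil and step-size choice (purely illustrative, not code from this repository):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <limits>

    // Fourth-order central difference from the comment above:
    //   f'(x) ~ ( -f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / (12h)
    template <typename F>
    double central_diff(F f, double x, double h)
    {
      return (-f(x + 2 * h) + 8 * f(x + h) - 8 * f(x - h) + f(x - 2 * h)) /
             (12 * h);
    }

    int main()
    {
      // Step-size choice analogous to the callback: scale by the objective
      // value, but never let the step fall below machine epsilon.
      const double objective = 1.0; // stand-in for the model's objective value
      const double eps = std::pow(std::numeric_limits<double>::epsilon(), 0.9);
      const double h = std::max(std::numeric_limits<double>::epsilon(),
                                std::fabs(objective) * std::sqrt(eps));
      const auto f = [](double x) { return std::sin(x); };
      std::printf("numerical %.12f vs analytical %.12f\n",
                  central_diff(f, 0.5, h), std::cos(0.5));
      return 0;
    }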

Diff for: src/models/model.cpp (+26 -14)

@@ -1573,10 +1573,11 @@ void model::clear_gradients()
   }
 }

-void model::forward_prop(execution_mode mode)
+void model::forward_prop(execution_mode mode, bool skip_callbacks)
 {
   LBANN_CALIPER_MARK_FUNCTION;
-  do_model_forward_prop_begin_cbs(mode);
+  if (!skip_callbacks)
+    do_model_forward_prop_begin_cbs(mode);

   // Clear activations in reference counter
   m_activation_refcnt.clear();
@@ -1586,25 +1587,30 @@ void model::forward_prop(execution_mode mode)

     if (this->is_subgraph_parallelism_enabled()) {
       if (l.get_run_layer_in_subgraph()) {
-        do_layer_forward_prop_begin_cbs(mode, &l);
+        if (!skip_callbacks)
+          do_layer_forward_prop_begin_cbs(mode, &l);
         l.forward_prop();
-        do_layer_forward_prop_end_cbs(mode, &l);
+        if (!skip_callbacks)
+          do_layer_forward_prop_end_cbs(mode, &l);
       }
       else {
         // To Do: Fix last batch problem in sub-graph parallelism
         // experimental code to fix last batch problem in subgraph parallelism
       }
     }
     else {
-      do_layer_forward_prop_begin_cbs(mode, &l);
+      if (!skip_callbacks)
+        do_layer_forward_prop_begin_cbs(mode, &l);
       l.forward_prop();
-      do_layer_forward_prop_end_cbs(mode, &l);
+      if (!skip_callbacks)
+        do_layer_forward_prop_end_cbs(mode, &l);
     }
   }
-  do_model_forward_prop_end_cbs(mode);
+  if (!skip_callbacks)
+    do_model_forward_prop_end_cbs(mode);
 }

-void model::backward_prop(bool compute_weight_grads_only)
+void model::backward_prop(bool compute_weight_grads_only, bool skip_callbacks)
 {
   LBANN_CALIPER_MARK_FUNCTION;

@@ -1614,7 +1620,8 @@ void model::backward_prop(bool compute_weight_grads_only)
   bool const envvar_disable_layers =
     !arg_parser.get<bool>(LBANN_OPTION_NO_BACKPROP_DISABLE);

-  do_model_backward_prop_begin_cbs();
+  if (!skip_callbacks)
+    do_model_backward_prop_begin_cbs();

   for (El::Int i = get_num_layers() - 1; i >= 0; --i) {

@@ -1644,21 +1651,25 @@ void model::backward_prop(bool compute_weight_grads_only)

     if (this->is_subgraph_parallelism_enabled()) {
       if (l.get_run_layer_in_subgraph()) {
-        do_layer_backward_prop_begin_cbs(&l);
+        if (!skip_callbacks)
+          do_layer_backward_prop_begin_cbs(&l);
         if (enable_layer)
           l.back_prop();
-        do_layer_backward_prop_end_cbs(&l);
+        if (!skip_callbacks)
+          do_layer_backward_prop_end_cbs(&l);
       }
       else {
         // To Do: Fix last batch problem in sub-graph parallelism
         // experimental code to fix last batch problem in subgraph parallelism
       }
     }
     else {
-      do_layer_backward_prop_begin_cbs(&l);
+      if (!skip_callbacks)
+        do_layer_backward_prop_begin_cbs(&l);
       if (enable_layer)
         l.back_prop();
-      do_layer_backward_prop_end_cbs(&l);
+      if (!skip_callbacks)
+        do_layer_backward_prop_end_cbs(&l);
     }

     // Terminate early if all gradients have been computed
@@ -1683,7 +1694,8 @@ void model::backward_prop(bool compute_weight_grads_only)
     }
   }

-  do_model_backward_prop_end_cbs();
+  if (!skip_callbacks)
+    do_model_backward_prop_end_cbs();
 }

 void model::update_weights()
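
Throughout these hunks the callback hooks are guarded rather than removed, so a normal training pass is unchanged while a pass with skip_callbacks=true stays silent. A toy, LBANN-independent sketch of that guard pattern (all names here are invented for illustration):

    #include <cstdio>
    #include <vector>

    // Toy stand-in for a model with begin/end hooks around propagation.
    struct ToyModel {
      std::vector<int> layers{1, 2, 3};

      void on_forward_begin() { std::puts("forward-begin callback"); }
      void on_forward_end() { std::puts("forward-end callback"); }

      // skip_callbacks lets utilities such as gradient checking reuse the
      // same propagation path without triggering observer side effects.
      void forward_prop(bool skip_callbacks = false)
      {
        if (!skip_callbacks)
          on_forward_begin();
        for (int l : layers)
          std::printf("forward layer %d\n", l);
        if (!skip_callbacks)
          on_forward_end();
      }
    };

    int main()
    {
      ToyModel m;
      m.forward_prop();                        // normal pass: hooks fire
      m.forward_prop(/*skip_callbacks=*/true); // quiet pass: hooks suppressed
      return 0;
    }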
