Commit 4615528

Use model methods for gradient checking
1 parent 2756920 commit 4615528

3 files changed (+44 lines, -82 lines)
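
This commit replaces the hand-rolled per-layer forward/backward loops in the check_gradients callback with the model's own forward_prop and backward_prop methods, both of which gain a skip_callbacks flag so that the extra passes run during gradient checking do not re-fire callback hooks. A minimal sketch of the resulting call pattern, assuming the LBANN model API shown in the diffs below (the helper name run_silent_pass and the include path are illustrative, not part of this commit):

#include "lbann/models/model.hpp"

// Hypothetical helper: drive one forward/backward pass without firing callback
// hooks, mirroring what check_gradients::do_check_gradients now does.
void run_silent_pass(lbann::model& m, lbann::execution_mode mode)
{
  m.clear_gradients();                                  // replaces the manual per-optimizer loop
  m.forward_prop(mode, /*skip_callbacks=*/true);        // forward pass, hooks suppressed
  m.get_objective_function()->differentiate();          // seed error signals at the objective
  m.get_objective_function()->compute_weight_regularization();
  m.backward_prop(/*compute_weight_grads_only=*/false,  // propagate through every layer
                  /*skip_callbacks=*/true);             // hooks suppressed here as well
}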

Diff for: include/lbann/models/model.hpp (+3, -2)

@@ -399,9 +399,10 @@ class model
   void reset_epoch_statistics(execution_mode mode);

   /** @brief Forward propagation step. */
-  void forward_prop(execution_mode mode);
+  void forward_prop(execution_mode mode, bool skip_callbacks = false);
   /** @brief Backward propagation step. */
-  void backward_prop(bool compute_weight_grads_only = true);
+  void backward_prop(bool compute_weight_grads_only = true,
+                     bool skip_callbacks = false);
   /** Evaluate any metrics in the model */
   void evaluate_metrics(execution_mode mode, uint64_t current_mini_batch_size);
   /** @brief Clear each optimizer's gradient.
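
Both new parameters are defaulted in the declarations above, so existing call sites compile unchanged and only the gradient-checking callback opts in. A quick illustration of the two call forms (assuming a model m and an execution_mode mode already in scope):

m.forward_prop(mode);                             // unchanged callers: callbacks fire as before
m.forward_prop(mode, /*skip_callbacks=*/true);    // opt-in: callback hooks suppressed
m.backward_prop();                                // defaults: weight grads only, callbacks fire
m.backward_prop(false, /*skip_callbacks=*/true);  // full backprop with callback hooks suppressed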

Diff for: src/callbacks/check_gradients.cpp (+15, -66)

@@ -58,30 +58,12 @@ namespace {
 EvalType compute_objective_function(model& m)
 {
   const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
-  m.get_activation_reference_counter().clear();
-
-  // Forward prop, skipping input layers
-
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else // sub-graph parallelism not enabled
-  {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
+  const auto mode = c.get_execution_mode();

   // Get objective function value
+  m.forward_prop(mode, true);
   auto&& obj = m.get_objective_function();
-  const auto mode = c.get_execution_mode();
+
   const auto mini_batch_size = m.get_current_mini_batch_size();
   obj->start_evaluation(mode, mini_batch_size);
   return obj->finish_evaluation(mode, mini_batch_size);
@@ -134,6 +116,7 @@ struct CheckWeightsFunctor : DefaultErrorReporter
   // Get weights matrix and gradient
   auto const& weights_matrix = dtw.get_values_sharded();
   auto const& gradient = dtw.get_optimizer()->get_gradient_sharded();
+
   // Iterate through weights matrix entries
   for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
     for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
@@ -275,40 +258,24 @@ void check_gradients::do_check_gradients(model& m) const
   for (auto&& met : m.get_metrics()) {
     met->reset_statistics(mode);
   }
-  for (auto&& w : m.get_weights()) {
-    auto&& opt = w->get_optimizer();
-    if (opt != nullptr) {
-      opt->clear_gradient();
-    }
-  }
-  m.get_activation_reference_counter().clear();
+  m.clear_gradients();

   // Load data in input layers
   data_coordinator& dc = get_trainer().get_data_coordinator();
   dc.fetch_active_batch_synchronous(mode);
   El::Int current_mini_batch_size = dc.get_current_mini_batch_size(mode);
   m.set_current_mini_batch_size(current_mini_batch_size);

-  // checking subgrpah parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
-
   // Compute objective function
   const EvalType objective = compute_objective_function(m);

+  // Compute gradients
+  m.get_objective_function()->differentiate();
+  m.get_objective_function()->compute_weight_regularization();
+
+  // Compute analytical gradients through model
+  m.backward_prop(false, /*skip_callbacks=*/true);
+
   // Choose finite difference step
   // Note: Consider a central difference scheme:
   //   f'(x) ~ ( - f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
@@ -323,31 +290,14 @@ void check_gradients::do_check_gradients(model& m) const
   // epsilon based on the minimum step size of the float data type
   const EvalType epsilon =
     std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
-  const EvalType step_size =
+  const EvalType step_size = std::max(
+    std::numeric_limits<EvalType>::epsilon(),
     (m_step_size > EvalType{0} ? m_step_size
-                               : std::fabs(objective) * El::Sqrt(epsilon));
+                               : std::fabs(objective) * El::Sqrt(epsilon)));
   EvalType expected_error =
     std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
              0.9);

-  // Compute gradients
-  m.get_objective_function()->differentiate();
-  m.get_objective_function()->compute_weight_regularization();
-
-  // checking subgraph parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      if (layers[i]->get_run_layer_in_subgraph()) {
-        layers[i]->back_prop();
-      }
-    }
-  }
-  else {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      layers[i]->back_prop();
-    }
-  }
-
   // Print objective function value
   if (comm.am_world_master()) {
     std::cout << std::string(64, '-') << "\n"
@@ -383,7 +333,6 @@ void check_gradients::do_check_gradients(model& m) const
   }

   // Clean up
-  // TODO: Why
   auto&& dataset = dc.get_dataset(mode);
   dataset.set_initial_position();
   m.get_objective_function()->reset_statistics(mode);
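
The comments retained above describe the numerics the callback relies on: a fourth-order central difference f'(x) ~ ( -f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h, a step size that falls back to |f(x)| * sqrt(epsilon) and is now clamped away from zero, and an expected-error bound. A small standalone C++ sketch of the same scheme on a scalar function (the test function and the float/double stand-ins for DataType/EvalType are assumptions for illustration, not LBANN code):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

// Fourth-order central difference, as in the callback's comment:
//   f'(x) ~ ( -f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
template <typename F>
double central_difference(F f, double x, double h)
{
  return (-f(x + 2 * h) + 8 * f(x + h) - 8 * f(x - h) + f(x - 2 * h)) / (12 * h);
}

int main()
{
  auto f = [](double x) { return std::sin(x); }; // illustrative objective
  const double x = 0.7;
  const double objective = f(x);

  // Mirror the callback's choices: epsilon derived from the lower-precision type,
  // step size |f(x)| * sqrt(epsilon), clamped to at least machine epsilon.
  const double epsilon = std::pow(std::numeric_limits<float>::epsilon(), 0.9);
  const double step_size =
    std::max(std::numeric_limits<double>::epsilon(),
             std::fabs(objective) * std::sqrt(epsilon));
  const double expected_error =
    std::pow(epsilon * objective / step_size + std::pow(step_size, 4) / 18, 0.9);

  const double numerical = central_difference(f, x, step_size);
  const double analytical = std::cos(x); // known derivative of the test function
  std::printf("analytical %.12f  numerical %.12f  error %.3e  bound %.3e\n",
              analytical,
              numerical,
              std::fabs(numerical - analytical),
              expected_error);
  return 0;
}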

Diff for: src/models/model.cpp (+26, -14)

@@ -1555,10 +1555,11 @@ void model::clear_gradients()
   }
 }

-void model::forward_prop(execution_mode mode)
+void model::forward_prop(execution_mode mode, bool skip_callbacks)
 {
   LBANN_CALIPER_MARK_FUNCTION;
-  do_model_forward_prop_begin_cbs(mode);
+  if (!skip_callbacks)
+    do_model_forward_prop_begin_cbs(mode);

   // Clear activations in reference counter
   m_activation_refcnt.clear();
@@ -1568,25 +1569,30 @@ void model::forward_prop(execution_mode mode)

     if (this->is_subgraph_parallelism_enabled()) {
       if (l.get_run_layer_in_subgraph() || l.get_name() == "layer1") {
-        do_layer_forward_prop_begin_cbs(mode, &l);
+        if (!skip_callbacks)
+          do_layer_forward_prop_begin_cbs(mode, &l);
         l.forward_prop();
-        do_layer_forward_prop_end_cbs(mode, &l);
+        if (!skip_callbacks)
+          do_layer_forward_prop_end_cbs(mode, &l);
       }
       else {
         // To Do: Fix last batch problem in sub-graph parallelism
         // experimental code to fix last batch problem in subgraph parallelism
       }
     }
     else {
-      do_layer_forward_prop_begin_cbs(mode, &l);
+      if (!skip_callbacks)
+        do_layer_forward_prop_begin_cbs(mode, &l);
       l.forward_prop();
-      do_layer_forward_prop_end_cbs(mode, &l);
+      if (!skip_callbacks)
+        do_layer_forward_prop_end_cbs(mode, &l);
     }
   }
-  do_model_forward_prop_end_cbs(mode);
+  if (!skip_callbacks)
+    do_model_forward_prop_end_cbs(mode);
 }

-void model::backward_prop(bool compute_weight_grads_only)
+void model::backward_prop(bool compute_weight_grads_only, bool skip_callbacks)
 {
   LBANN_CALIPER_MARK_FUNCTION;

@@ -1596,7 +1602,8 @@ void model::backward_prop(bool compute_weight_grads_only)
   bool const envvar_disable_layers =
     !arg_parser.get<bool>(LBANN_OPTION_NO_BACKPROP_DISABLE);

-  do_model_backward_prop_begin_cbs();
+  if (!skip_callbacks)
+    do_model_backward_prop_begin_cbs();

   for (El::Int i = get_num_layers() - 1; i >= 0; --i) {

@@ -1626,21 +1633,25 @@ void model::backward_prop(bool compute_weight_grads_only)

     if (this->is_subgraph_parallelism_enabled()) {
       if (l.get_run_layer_in_subgraph()) {
-        do_layer_backward_prop_begin_cbs(&l);
+        if (!skip_callbacks)
+          do_layer_backward_prop_begin_cbs(&l);
         if (enable_layer)
           l.back_prop();
-        do_layer_backward_prop_end_cbs(&l);
+        if (!skip_callbacks)
+          do_layer_backward_prop_end_cbs(&l);
       }
       else {
         // To Do: Fix last batch problem in sub-graph parallelism
         // experimental code to fix last batch problem in subgraph parallelism
       }
     }
     else {
-      do_layer_backward_prop_begin_cbs(&l);
+      if (!skip_callbacks)
+        do_layer_backward_prop_begin_cbs(&l);
       if (enable_layer)
         l.back_prop();
-      do_layer_backward_prop_end_cbs(&l);
+      if (!skip_callbacks)
+        do_layer_backward_prop_end_cbs(&l);
     }

     // Terminate early if all gradients have been computed
@@ -1665,7 +1676,8 @@ void model::backward_prop(bool compute_weight_grads_only)
     }
   }

-  do_model_backward_prop_end_cbs();
+  if (!skip_callbacks)
+    do_model_backward_prop_end_cbs();
 }

 void model::update_weights()
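
The change applied throughout model.cpp is uniform: every callback hook is wrapped in if (!skip_callbacks), so a pass driven from inside a callback (as check_gradients now does) cannot re-enter the callback machinery. A tiny standalone illustration of that hook-skipping pattern (the types and names here are invented for the sketch, not LBANN API):

#include <functional>
#include <iostream>
#include <vector>

// Stand-in for a training step that can optionally suppress its hooks,
// mirroring the if (!skip_callbacks) guards added to forward_prop/backward_prop.
struct StepRunner
{
  std::vector<std::function<void()>> begin_hooks, end_hooks;

  void run(bool skip_callbacks = false)
  {
    if (!skip_callbacks)
      for (auto& hook : begin_hooks)
        hook();

    std::cout << "doing the actual work\n"; // layer compute would go here

    if (!skip_callbacks)
      for (auto& hook : end_hooks)
        hook();
  }
};

int main()
{
  StepRunner step;
  step.begin_hooks.push_back([] { std::cout << "begin hook\n"; });
  step.end_hooks.push_back([] { std::cout << "end hook\n"; });

  step.run();     // normal pass: hooks fire
  step.run(true); // callback-driven pass: hooks suppressed
  return 0;
}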
