Commit 9c33434

Use model methods for gradient checking
1 parent 4e92814 commit 9c33434

3 files changed, +44 −82 lines


Diff for: include/lbann/models/model.hpp

+3 −2

@@ -399,9 +399,10 @@ class model
   void reset_epoch_statistics(execution_mode mode);

   /** @brief Forward propagation step. */
-  void forward_prop(execution_mode mode);
+  void forward_prop(execution_mode mode, bool skip_callbacks = false);
   /** @brief Backward propagation step. */
-  void backward_prop(bool compute_weight_grads_only = true);
+  void backward_prop(bool compute_weight_grads_only = true,
+                     bool skip_callbacks = false);
   /** Evaluate any metrics in the model */
   void evaluate_metrics(execution_mode mode, size_t current_mini_batch_size);
   /** @brief Clear each optimizer's gradient.

Diff for: src/callbacks/check_gradients.cpp

+15 −66

@@ -58,30 +58,12 @@ namespace {
 EvalType compute_objective_function(model& m)
 {
   const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
-  m.get_activation_reference_counter().clear();
-
-  // Forward prop, skipping input layers
-
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else // sub-graph parallelism not enabled
-  {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
+  const auto mode = c.get_execution_mode();

   // Get objective function value
+  m.forward_prop(mode, true);
   auto&& obj = m.get_objective_function();
-  const auto mode = c.get_execution_mode();
+
   const auto mini_batch_size = m.get_current_mini_batch_size();
   obj->start_evaluation(mode, mini_batch_size);
   return obj->finish_evaluation(mode, mini_batch_size);

@@ -134,6 +116,7 @@ struct CheckWeightsFunctor : DefaultErrorReporter
     // Get weights matrix and gradient
     auto const& weights_matrix = dtw.get_values_sharded();
     auto const& gradient = dtw.get_optimizer()->get_gradient_sharded();
+
     // Iterate through weights matrix entries
     for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
       for (El::Int row = 0; row < weights_matrix.Height(); ++row) {

@@ -275,40 +258,24 @@ void check_gradients::do_check_gradients(model& m) const
   for (auto&& met : m.get_metrics()) {
     met->reset_statistics(mode);
   }
-  for (auto&& w : m.get_weights()) {
-    auto&& opt = w->get_optimizer();
-    if (opt != nullptr) {
-      opt->clear_gradient();
-    }
-  }
-  m.get_activation_reference_counter().clear();
+  m.clear_gradients();

   // Load data in input layers
   data_coordinator& dc = get_trainer().get_data_coordinator();
   dc.fetch_active_batch_synchronous(mode);
   El::Int current_mini_batch_size = dc.get_current_mini_batch_size(mode);
   m.set_current_mini_batch_size(current_mini_batch_size);

-  // checking subgrpah parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
-
   // Compute objective function
   const EvalType objective = compute_objective_function(m);

+  // Compute gradients
+  m.get_objective_function()->differentiate();
+  m.get_objective_function()->compute_weight_regularization();
+
+  // Compute analytical gradients through model
+  m.backward_prop(false, /*skip_callbacks=*/true);
+
   // Choose finite difference step
   // Note: Consider a central difference scheme:
   //   f'(x) ~ ( - f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h

@@ -323,31 +290,14 @@ void check_gradients::do_check_gradients(model& m) const
   // epsilon based on the minimum step size of the float data type
   const EvalType epsilon =
     std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
-  const EvalType step_size =
+  const EvalType step_size = std::max(
+    std::numeric_limits<EvalType>::epsilon(),
     (m_step_size > EvalType{0} ? m_step_size
-                               : std::fabs(objective) * El::Sqrt(epsilon));
+                               : std::fabs(objective) * El::Sqrt(epsilon)));
   EvalType expected_error =
     std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
              0.9);

-  // Compute gradients
-  m.get_objective_function()->differentiate();
-  m.get_objective_function()->compute_weight_regularization();
-
-  // checking subgraph parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      if (layers[i]->get_run_layer_in_subgraph()) {
-        layers[i]->back_prop();
-      }
-    }
-  }
-  else {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      layers[i]->back_prop();
-    }
-  }
-
   // Print objective function value
   if (comm.am_world_master()) {
     std::cout << std::string(64, '-') << "\n"

@@ -383,7 +333,6 @@ void check_gradients::do_check_gradients(model& m) const
   }

   // Clean up
-  // TODO: Why
   auto&& dataset = dc.get_dataset(mode);
   dataset.set_initial_position();
   m.get_objective_function()->reset_statistics(mode);
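The comments in the hunk above describe the finite-difference scheme the callback relies on: a 4-point central difference with a step size scaled to the objective value and floored at machine epsilon. The snippet below is a minimal, self-contained sketch of that scheme on a toy quadratic objective; the function f and the use of double for both DataType and EvalType are illustrative assumptions, not LBANN code.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
  using EvalType = double; // stands in for both DataType and EvalType here

  // Toy objective f(x) = 0.5 * x^2, so the analytical gradient at x is x.
  const auto f = [](EvalType x) { return 0.5 * x * x; };
  const EvalType x = 1.25;
  const EvalType analytical_gradient = x;

  // Step-size heuristic mirroring the diff: |f(x)| * sqrt(epsilon),
  // floored at machine epsilon so the step can never be zero.
  const EvalType objective = f(x);
  const EvalType epsilon =
    std::pow(std::numeric_limits<EvalType>::epsilon(), 0.9);
  const EvalType h = std::max(std::numeric_limits<EvalType>::epsilon(),
                              std::fabs(objective) * std::sqrt(epsilon));

  // Central difference: f'(x) ~ ( -f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
  const EvalType numerical_gradient =
    (-f(x + 2 * h) + 8 * f(x + h) - 8 * f(x - h) + f(x - 2 * h)) / (12 * h);

  // Error bound from the diff: round-off (epsilon * f / h) plus truncation (h^4 / 18).
  const EvalType expected_error =
    std::pow(epsilon * objective / h + std::pow(h, 4) / 18, 0.9);

  std::printf("analytical %.12f  numerical %.12f  error %g  (tolerance %g)\n",
              analytical_gradient, numerical_gradient,
              std::fabs(analytical_gradient - numerical_gradient),
              expected_error);
  return 0;
}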

Diff for: src/models/model.cpp

+26 −14

@@ -1550,10 +1550,11 @@ void model::clear_gradients()
   }
 }

-void model::forward_prop(execution_mode mode)
+void model::forward_prop(execution_mode mode, bool skip_callbacks)
 {
   LBANN_CALIPER_MARK_FUNCTION;
-  do_model_forward_prop_begin_cbs(mode);
+  if (!skip_callbacks)
+    do_model_forward_prop_begin_cbs(mode);

   // Clear activations in reference counter
   m_activation_refcnt.clear();

@@ -1563,25 +1564,30 @@ void model::forward_prop(execution_mode mode)

     if (this->is_subgraph_parallelism_enabled()) {
       if (l.get_run_layer_in_subgraph() || l.get_name() == "layer1") {
-        do_layer_forward_prop_begin_cbs(mode, &l);
+        if (!skip_callbacks)
+          do_layer_forward_prop_begin_cbs(mode, &l);
         l.forward_prop();
-        do_layer_forward_prop_end_cbs(mode, &l);
+        if (!skip_callbacks)
+          do_layer_forward_prop_end_cbs(mode, &l);
       }
       else {
         // To Do: Fix last batch problem in sub-graph parallelism
         // experimental code to fix last batch problem in subgraph parallelism
       }
     }
     else {
-      do_layer_forward_prop_begin_cbs(mode, &l);
+      if (!skip_callbacks)
+        do_layer_forward_prop_begin_cbs(mode, &l);
       l.forward_prop();
-      do_layer_forward_prop_end_cbs(mode, &l);
+      if (!skip_callbacks)
+        do_layer_forward_prop_end_cbs(mode, &l);
     }
   }
-  do_model_forward_prop_end_cbs(mode);
+  if (!skip_callbacks)
+    do_model_forward_prop_end_cbs(mode);
 }

-void model::backward_prop(bool compute_weight_grads_only)
+void model::backward_prop(bool compute_weight_grads_only, bool skip_callbacks)
 {
   LBANN_CALIPER_MARK_FUNCTION;

@@ -1591,7 +1597,8 @@ void model::backward_prop(bool compute_weight_grads_only)
   bool const envvar_disable_layers =
     !arg_parser.get<bool>(LBANN_OPTION_NO_BACKPROP_DISABLE);

-  do_model_backward_prop_begin_cbs();
+  if (!skip_callbacks)
+    do_model_backward_prop_begin_cbs();

   for (El::Int i = get_num_layers() - 1; i >= 0; --i) {

@@ -1621,21 +1628,25 @@ void model::backward_prop(bool compute_weight_grads_only)

     if (this->is_subgraph_parallelism_enabled()) {
       if (l.get_run_layer_in_subgraph()) {
-        do_layer_backward_prop_begin_cbs(&l);
+        if (!skip_callbacks)
+          do_layer_backward_prop_begin_cbs(&l);
         if (enable_layer)
           l.back_prop();
-        do_layer_backward_prop_end_cbs(&l);
+        if (!skip_callbacks)
+          do_layer_backward_prop_end_cbs(&l);
       }
       else {
         // To Do: Fix last batch problem in sub-graph parallelism
         // experimental code to fix last batch problem in subgraph parallelism
       }
     }
     else {
-      do_layer_backward_prop_begin_cbs(&l);
+      if (!skip_callbacks)
+        do_layer_backward_prop_begin_cbs(&l);
       if (enable_layer)
         l.back_prop();
-      do_layer_backward_prop_end_cbs(&l);
+      if (!skip_callbacks)
+        do_layer_backward_prop_end_cbs(&l);
     }

     // Terminate early if all gradients have been computed

@@ -1660,7 +1671,8 @@ void model::backward_prop(bool compute_weight_grads_only)
     }
   }

-  do_model_backward_prop_end_cbs();
+  if (!skip_callbacks)
+    do_model_backward_prop_end_cbs();
 }

 void model::update_weights()
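Taken together, the two defaulted skip_callbacks parameters let a caller run a complete forward and backward pass without firing any callback hooks, which is what the gradient-checking callback now relies on. The sketch below is a hypothetical helper, not LBANN code: it only strings together the calls that appear in the check_gradients.cpp hunks above.

// Hypothetical helper combining the new skip_callbacks flags the way the
// gradient-check callback does; assumes the LBANN model/objective headers.
EvalType compute_objective_and_gradients(model& m, execution_mode mode)
{
  m.clear_gradients();

  // Forward pass through the model with callbacks suppressed.
  m.forward_prop(mode, /*skip_callbacks=*/true);

  // Evaluate the objective for the current mini-batch.
  auto&& obj = m.get_objective_function();
  const auto mini_batch_size = m.get_current_mini_batch_size();
  obj->start_evaluation(mode, mini_batch_size);
  const EvalType objective = obj->finish_evaluation(mode, mini_batch_size);

  // Analytical gradients: differentiate the objective, add weight
  // regularization, then back-propagate through every layer
  // (not just weight gradients), again without callbacks.
  obj->differentiate();
  obj->compute_weight_regularization();
  m.backward_prop(/*compute_weight_grads_only=*/false,
                  /*skip_callbacks=*/true);

  return objective;
}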
