@@ -58,30 +58,12 @@ namespace {
 EvalType compute_objective_function(model& m)
 {
   const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
-  m.get_activation_reference_counter().clear();
-
-  // Forward prop, skipping input layers
-
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else // sub-graph parallelism not enabled
-  {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) == nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
+  const auto mode = c.get_execution_mode();

   // Get objective function value
+  m.forward_prop(mode, true);
   auto&& obj = m.get_objective_function();
-  const auto mode = c.get_execution_mode();
+
   const auto mini_batch_size = m.get_current_mini_batch_size();
   obj->start_evaluation(mode, mini_batch_size);
   return obj->finish_evaluation(mode, mini_batch_size);
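For reference, here is how the helper reads once this hunk is applied. This is a sketch assembled from the context and added lines above (plus the closing brace outside the hunk), not a verbatim excerpt; the second argument to forward_prop is presumably the skip-callbacks flag, mirroring the backward_prop call later in this commit.

EvalType compute_objective_function(model& m)
{
  const auto& c = static_cast<SGDExecutionContext&>(m.get_execution_context());
  const auto mode = c.get_execution_mode();

  // Get objective function value
  m.forward_prop(mode, true);
  auto&& obj = m.get_objective_function();

  const auto mini_batch_size = m.get_current_mini_batch_size();
  obj->start_evaluation(mode, mini_batch_size);
  return obj->finish_evaluation(mode, mini_batch_size);
}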
@@ -134,6 +116,7 @@ struct CheckWeightsFunctor : DefaultErrorReporter
     // Get weights matrix and gradient
     auto const& weights_matrix = dtw.get_values_sharded();
     auto const& gradient = dtw.get_optimizer()->get_gradient_sharded();
+
     // Iterate through weights matrix entries
     for (El::Int col = 0; col < weights_matrix.Width(); ++col) {
       for (El::Int row = 0; row < weights_matrix.Height(); ++row) {
@@ -275,40 +258,24 @@ void check_gradients::do_check_gradients(model& m) const
   for (auto&& met : m.get_metrics()) {
     met->reset_statistics(mode);
   }
-  for (auto&& w : m.get_weights()) {
-    auto&& opt = w->get_optimizer();
-    if (opt != nullptr) {
-      opt->clear_gradient();
-    }
-  }
-  m.get_activation_reference_counter().clear();
+  m.clear_gradients();

   // Load data in input layers
   data_coordinator& dc = get_trainer().get_data_coordinator();
   dc.fetch_active_batch_synchronous(mode);
   El::Int current_mini_batch_size = dc.get_current_mini_batch_size(mode);
   m.set_current_mini_batch_size(current_mini_batch_size);

-  // checking subgrpah parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr &&
-          l->get_run_layer_in_subgraph()) {
-        l->forward_prop();
-      }
-    }
-  }
-  else {
-    for (auto&& l : m.get_layers()) {
-      if (dynamic_cast<input_layer<DataType>*>(l) != nullptr) {
-        l->forward_prop();
-      }
-    }
-  }
-
   // Compute objective function
   const EvalType objective = compute_objective_function(m);

+  // Compute gradients
+  m.get_objective_function()->differentiate();
+  m.get_objective_function()->compute_weight_regularization();
+
+  // Compute analytical gradients through model
+  m.backward_prop(false, /*skip_callbacks=*/true);
+
   // Choose finite difference step
   // Note: Consider a central difference scheme:
   //   f'(x) ~ ( - f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h) ) / 12h
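Put together, the gradient-check setup after this hunk follows a short sequence. The lines below are assembled from the added and context lines of the hunk (not a standalone program; model, data_coordinator, and EvalType are the library's own types):

  // Reset gradients and load a batch
  m.clear_gradients();
  data_coordinator& dc = get_trainer().get_data_coordinator();
  dc.fetch_active_batch_synchronous(mode);
  m.set_current_mini_batch_size(dc.get_current_mini_batch_size(mode));

  // Objective value, then analytical gradients
  const EvalType objective = compute_objective_function(m);
  m.get_objective_function()->differentiate();
  m.get_objective_function()->compute_weight_regularization();
  m.backward_prop(false, /*skip_callbacks=*/true);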
@@ -323,31 +290,14 @@ void check_gradients::do_check_gradients(model& m) const
   // epsilon based on the minimum step size of the float data type
   const EvalType epsilon =
     std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
-  const EvalType step_size =
+  const EvalType step_size = std::max(
+    std::numeric_limits<EvalType>::epsilon(),
     (m_step_size > EvalType{0} ? m_step_size
-                               : std::fabs(objective) * El::Sqrt(epsilon));
+                               : std::fabs(objective) * El::Sqrt(epsilon)));
   EvalType expected_error =
     std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
              0.9);

-  // Compute gradients
-  m.get_objective_function()->differentiate();
-  m.get_objective_function()->compute_weight_regularization();
-
-  // checking subgraph parallelism
-  if (m.is_subgraph_parallelism_enabled()) {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      if (layers[i]->get_run_layer_in_subgraph()) {
-        layers[i]->back_prop();
-      }
-    }
-  }
-  else {
-    for (El::Int i = layers.size() - 1; i >= 0; --i) {
-      layers[i]->back_prop();
-    }
-  }
-
   // Print objective function value
   if (comm.am_world_master()) {
     std::cout << std::string(64, '-') << "\n"
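As a quick numeric sanity check of the step-size guard introduced here: the snippet below evaluates the same formulas standalone, with DataType = float, EvalType = double, m_step_size left at zero, and an assumed objective value of 0.5 (none of these values come from the code); El::Sqrt is swapped for std::sqrt only to keep it self-contained.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

int main()
{
  using DataType = float;            // assumed
  using EvalType = double;           // assumed
  const EvalType objective = 0.5;    // assumed objective value
  const EvalType m_step_size = 0.0;  // "choose step size automatically"

  const EvalType epsilon =
    std::pow(std::numeric_limits<DataType>::epsilon(), 0.9);
  // The std::max guard keeps step_size strictly positive even when the
  // objective is zero and m_step_size is unset.
  const EvalType step_size = std::max(
    std::numeric_limits<EvalType>::epsilon(),
    (m_step_size > EvalType{0} ? m_step_size
                               : std::fabs(objective) * std::sqrt(epsilon)));
  const EvalType expected_error =
    std::pow((epsilon * objective / step_size + std::pow(step_size, 4) / 18),
             0.9);

  // With these inputs: epsilon ~ 6e-7, step_size ~ 4e-4, expected_error ~ 2e-3.
  std::cout << epsilon << " " << step_size << " " << expected_error << "\n";
  return 0;
}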
@@ -383,7 +333,6 @@ void check_gradients::do_check_gradients(model& m) const
   }

   // Clean up
-  // TODO: Why
   auto&& dataset = dc.get_dataset(mode);
   dataset.set_initial_position();
   m.get_objective_function()->reset_statistics(mode);