diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index 6223c4ca319..1c1b63c2773 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -152,13 +152,23 @@ exit 0 %WER 19.4 | 2628 21594 | 82.7 12.0 5.3 2.1 19.4 54.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 20.8 | 2628 21594 | 81.3 13.1 5.6 2.2 20.8 56.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -# bidirectional LSTM with the same configuration as the above experiment, plus self-repair of all nonliearities and clipgradient activated -%WER 10.4 | 1831 21395 | 90.5 6.2 3.3 0.9 10.4 44.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 11.3 | 1831 21395 | 89.8 6.8 3.3 1.1 11.3 46.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# bidirectional LSTM with the same configuration as the above experiment, with self-repair of all nonlinearities and clipgradient, and max-change-per-component activated +%WER 14.9 | 4459 42989 | 86.7 9.0 4.3 1.6 14.9 50.5 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 15.9 | 4459 42989 | 85.7 9.8 4.5 1.7 15.9 52.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 10.2 | 1831 21395 | 90.8 6.1 3.2 1.0 10.2 44.4 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.2 | 1831 21395 | 89.9 6.8 3.3 1.1 11.2 46.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 19.4 | 2628 21594 | 82.7 11.8 5.4 2.2 19.4 54.5 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 20.6 | 2628 21594 | 81.5 12.8 5.7 2.2 20.6 56.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys + +( +# bidirectional LSTM with the same configuration as the above experiment, with self-repair of all nonlinearities and clipgradient activated %WER 15.0 | 4459 42989 | 86.5 9.1 4.5 1.5 15.0 50.4 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys %WER 16.0 | 4459 42989 | 85.6 9.9 4.5 1.6 16.0 52.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 10.4 | 1831 21395 | 90.5 6.2 3.3 0.9 10.4 44.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.3 | 1831 21395 | 89.8 6.8 3.3 1.1 11.3 46.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 19.6 | 2628 21594 | 82.5 12.1 5.5 2.1 19.6 54.8 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 20.7 | 2628 21594 | 81.4 12.9 5.7 2.2 20.7 56.8 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +) # results with nnet3 tdnn: local/nnet3/run_tdnn.sh (11.10.2015) (2 epoch training on speed-perturbed and volume perturbed data) %WER 12.1 | 1831 21395 | 89.1 7.1 3.8 1.3 12.1 48.1 | exp/nnet3/tdnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh old mode
100644 new mode 100755 index 5379149c9bd..26cdaed29d7 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -130,6 +130,7 @@ if [ $stage -le 12 ]; then --recurrent-projection-dim 256 \ --non-recurrent-projection-dim 256 \ --label-delay $label_delay \ + --self-repair-scale-clipgradient 1.0 \ $dir/configs || exit 1; fi diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh old mode 100644 new mode 100755 diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 9b9ce4a54ad..cf755a8d2ec 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -90,23 +90,29 @@ def AddPermuteLayer(config_lines, name, input, column_map): return {'descriptor': '{0}_permute'.format(name), 'dimension': input['dimension']} -def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): +def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "", max_change_per_component = 0.75): components = config_lines['components'] component_nodes = config_lines['component-nodes'] - components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) + # Per-component max-change option + max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' + + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) return {'descriptor': '{0}_affine'.format(name), 'dimension': output_dim} -def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): +def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None, max_change_per_component = 0.75): components = config_lines['components'] component_nodes = config_lines['component-nodes'] # self_repair_scale is a constant scaling the self-repair vector computed in RectifiedLinearComponent self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' - components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) + # Per-component max-change option + max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' + + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) @@ -244,6 +250,7 @@ def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objecti def AddFinalLayer(config_lines, input, output_dim, 
ng_affine_options = " param-stddev=0 bias-stddev=0 ", + max_change_per_component = 1.5, label_delay=None, use_presoftmax_prior_scale = False, prior_scale_file = None, @@ -261,7 +268,7 @@ def AddFinalLayer(config_lines, input, output_dim, prev_layer_output = AddAffineLayer(config_lines, final_node_prefix , input, output_dim, - ng_affine_options) + ng_affine_options, max_change_per_component) if include_log_softmax: if use_presoftmax_prior_scale : components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) @@ -288,7 +295,8 @@ def AddLstmLayer(config_lines, ng_affine_options = "", lstm_delay = -1, self_repair_scale_nonlinearity = None, - self_repair_scale_clipgradient = None): + self_repair_scale_clipgradient = None, + max_change_per_component = 0.75): assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -316,24 +324,26 @@ def AddLstmLayer(config_lines, self_repair_clipgradient_string = "self-repair-scale={0:.2f}".format(self_repair_scale_clipgradient) if self_repair_scale_clipgradient is not None else '' # Natural gradient per element scale parameters ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + # Per-component max-change option + max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' # Parameter Definitions W*(* replaced by - to have valid names) components.append("# Input gate control : W_i* matrices") - components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options)) components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2} {3}".format(name, cell_dim, ng_per_element_scale_options, max_change_options)) components.append("# Forget gate control : W_f* matrices") - components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options)) components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2} {3}".format(name, cell_dim, ng_per_element_scale_options, max_change_options)) components.append("# Output gate control : W_o* matrices") - components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + 
recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options)) components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2} {3}".format(name, cell_dim, ng_per_element_scale_options, max_change_options)) components.append("# Cell input matrices : W_c* matrices") - components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options, max_change_options)) components.append("# Defining the non-linearities") @@ -385,7 +395,7 @@ def AddLstmLayer(config_lines, # add the recurrent connections if (add_recurrent_projection and add_non_recurrent_projection): components.append("# projection matrices : Wrm and Wpm") - components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) + components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options, max_change_options)) components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) component_nodes.append("# r_t and p_t") component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) @@ -396,7 +406,7 @@ def AddLstmLayer(config_lines, elif add_recurrent_projection: components.append("# projection matrices : Wrm") - components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) + components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options, max_change_options)) components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) component_nodes.append("# r_t") component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) @@ -425,7 +435,8 @@ def AddBLstmLayer(config_lines, ng_affine_options = "", lstm_delay = [-1,1], self_repair_scale_nonlinearity = None, - self_repair_scale_clipgradient = None): + self_repair_scale_clipgradient = None, + max_change_per_component = 0.75): assert(len(lstm_delay) == 2 and lstm_delay[0] < 0 and lstm_delay[1] > 0) output_forward = AddLstmLayer(config_lines, 
"{0}_forward".format(name), input, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, @@ -433,14 +444,16 @@ def AddBLstmLayer(config_lines, ng_per_element_scale_options, ng_affine_options, lstm_delay = lstm_delay[0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient) + self_repair_scale_clipgradient = self_repair_scale_clipgradient, + max_change_per_component = max_change_per_component) output_backward = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, lstm_delay = lstm_delay[1], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient) + self_repair_scale_clipgradient = self_repair_scale_clipgradient, + max_change_per_component = max_change_per_component) output_descriptor = 'Append({0}, {1})'.format(output_forward['descriptor'], output_backward['descriptor']) output_dim = output_forward['dimension'] + output_backward['dimension'] diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..8e6e3d8e0e2 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -50,6 +50,12 @@ def GetArgs(): default=0.0) parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--max-change-per-component", type=float, + help="Enforces per-component max change (except for the final affine layer). " + "if 0 it would not be enforced.", default=0.75) + parser.add_argument("--max-change-per-component-final", type=float, + help="Enforces per-component max change for the final affine layer. 
" + "if 0 it would not be enforced.", default=1.5) # LSTM options parser.add_argument("--num-lstm-layers", type=int, @@ -122,6 +128,9 @@ def CheckArgs(args): if not args.ivector_dim >= 0: raise Exception("ivector-dim has to be non-negative") + if not args.max_change_per_component >= 0 or not args.max_change_per_component_final >= 0: + raise Exception("max-change-per-component and max_change-per-component-final should be non-negative") + if (args.num_lstm_layers < 1): sys.exit("--num-lstm-layers has to be a positive integer") if (args.clipping_threshold < 0): @@ -215,7 +224,8 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, label_delay, include_log_softmax, xent_regularize, - self_repair_scale_nonlinearity, self_repair_scale_clipgradient): + self_repair_scale_nonlinearity, self_repair_scale_clipgradient, + max_change_per_component, max_change_per_component_final): config_lines = {'components':[], 'component-nodes':[]} @@ -238,22 +248,27 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + lstm_delay = lstm_delay[i], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient, + max_change_per_component = max_change_per_component) else: # add a uni-directional LSTM layer prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + lstm_delay = lstm_delay[i][0], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient, + max_change_per_component = max_change_per_component) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, max_change_per_component = max_change_per_component_final, label_delay = label_delay, include_log_softmax = include_log_softmax) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, include_log_softmax = True, label_delay = label_delay, + max_change_per_component = max_change_per_component_final, name_affix = 'xent') config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines @@ -262,14 +277,15 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, for i in range(num_lstm_layers, num_hidden_layers): prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), prev_layer_output, hidden_dim, - ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) + ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity, max_change_per_component = max_change_per_component) # make the intermediate config 
file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, max_change_per_component = max_change_per_component_final, label_delay = label_delay, include_log_softmax = include_log_softmax) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, include_log_softmax = True, label_delay = label_delay, + max_change_per_component = max_change_per_component_final, name_affix = 'xent') config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines @@ -326,7 +342,9 @@ def Main(): include_log_softmax = args.include_log_softmax, xent_regularize = args.xent_regularize, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) + self_repair_scale_clipgradient = args.self_repair_scale_clipgradient, + max_change_per_component = args.max_change_per_component, + max_change_per_component_final = args.max_change_per_component_final) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index bac260e93bc..60b291c9481 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -95,6 +95,12 @@ def GetArgs(): parser.add_argument("--final-layer-normalize-target", type=float, help="RMS target for final layer (set to <1 if final layer learns too fast", default=1.0) + parser.add_argument("--max-change-per-component", type=float, + help="Enforces per-component max change (except for the final affine layer). " + "If set to 0, it will not be enforced.", default=0.75) + parser.add_argument("--max-change-per-component-final", type=float, + help="Enforces per-component max change for the final affine layer. 
" + "If set to 0, it will not be enforced.", default=1.5) parser.add_argument("--subset-dim", type=int, default=0, help="dimension of the subset of units to be sent to the central frame") parser.add_argument("--pnorm-input-dim", type=int, @@ -204,6 +210,9 @@ def CheckArgs(args): args.add_lda = False warnings.warn("--add-lda is set to false as CNN layers are used.") + if not args.max_change_per_component >= 0 or not args.max_change_per_component_final >= 0: + raise Exception("max-change-per-component and max-change-per-component-final should be non-negative") + return args def AddConvMaxpLayer(config_lines, name, input, args): @@ -333,6 +342,7 @@ def MakeConfigs(config_dir, splice_indexes_string, xent_regularize, xent_separate_forward_affine, self_repair_scale, + max_change_per_component, max_change_per_component_final, objective_type): parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) @@ -426,13 +436,15 @@ def MakeConfigs(config_dir, splice_indexes_string, if nonlin_type == "relu" : prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", prev_layer_output, nonlin_output_dim, + norm_target_rms = final_layer_normalize_target, self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) + max_change_per_component = max_change_per_component) prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", prev_layer_output, nonlin_output_dim, + norm_target_rms = final_layer_normalize_target, self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) + max_change_per_component = max_change_per_component) elif nonlin_type == "pnorm" : prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", prev_layer_output, nonlin_input_dim, nonlin_output_dim, @@ -445,6 +457,7 @@ def MakeConfigs(config_dir, splice_indexes_string, raise Exception("Unknown nonlinearity type") nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + max_change_per_component = max_change_per_component_final, use_presoftmax_prior_scale = use_presoftmax_prior_scale, prior_scale_file = prior_scale_file, include_log_softmax = include_log_softmax) @@ -452,6 +465,7 @@ def MakeConfigs(config_dir, splice_indexes_string, nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( 0.5 / xent_regularize), + max_change_per_component = max_change_per_component_final, use_presoftmax_prior_scale = use_presoftmax_prior_scale, prior_scale_file = prior_scale_file, include_log_softmax = True, @@ -460,8 +474,9 @@ def MakeConfigs(config_dir, splice_indexes_string, if nonlin_type == "relu": prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), prev_layer_output, nonlin_output_dims[i], + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target, self_repair_scale = self_repair_scale, - norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + max_change_per_component = max_change_per_component) elif nonlin_type == "pnorm": prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i), prev_layer_output, nonlin_input_dim, nonlin_output_dim, @@ -478,6 +493,7 @@ def MakeConfigs(config_dir, splice_indexes_string, # Usually used with an objective-type such as "quadratic". # Applications are k-binary classification such Ideal Ratio Mask prediction. 
nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + max_change_per_component = max_change_per_component_final, use_presoftmax_prior_scale = use_presoftmax_prior_scale, prior_scale_file = prior_scale_file, include_log_softmax = include_log_softmax, @@ -487,6 +503,7 @@ def MakeConfigs(config_dir, splice_indexes_string, nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( 0.5 / xent_regularize), + max_change_per_component = max_change_per_component_final, use_presoftmax_prior_scale = use_presoftmax_prior_scale, prior_scale_file = prior_scale_file, include_log_softmax = True, @@ -538,6 +555,8 @@ def Main(): xent_regularize = args.xent_regularize, xent_separate_forward_affine = args.xent_separate_forward_affine, self_repair_scale = args.self_repair_scale_nonlinearity, + max_change_per_component = args.max_change_per_component, + max_change_per_component_final = args.max_change_per_component_final, objective_type = args.objective_type) if __name__ == "__main__": diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 1da521eebd3..c8dfff9e92a 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -34,17 +34,16 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, num_minibatches_processed_(0) { if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); - if (opts.nnet_config.momentum == 0.0 && - opts.nnet_config.max_param_change == 0.0) { - delta_nnet_= NULL; - } else { - KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && - opts.nnet_config.max_param_change >= 0.0); - delta_nnet_ = nnet_->Copy(); - bool is_gradient = false; // setting this to true would disable the - // natural-gradient updates. - SetZero(is_gradient, delta_nnet_); - } + KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && + opts.nnet_config.max_param_change >= 0.0); + delta_nnet_ = nnet_->Copy(); + bool is_gradient = false; // setting this to true would disable the + // natural-gradient updates. + SetZero(is_gradient, delta_nnet_); + const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); + num_max_change_per_component_applied_.resize(num_updatable, 0); + num_max_change_global_applied_ = 0; + if (opts.nnet_config.read_cache != "") { bool binary; try { @@ -71,8 +70,7 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { const NnetComputation *computation = compiler_.Compile(request); NnetComputer computer(nnet_config.compute_config, *computation, - *nnet_, - (delta_nnet_ == NULL ? nnet_ : delta_nnet_)); + *nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, chain_eg.inputs); computer.Forward(); @@ -80,27 +78,7 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { this->ProcessOutputs(chain_eg, &computer); computer.Backward(); - if (delta_nnet_ != NULL) { - BaseFloat scale = (1.0 - nnet_config.momentum); - if (nnet_config.max_param_change != 0.0) { - BaseFloat param_delta = - std::sqrt(DotProduct(*delta_nnet_, *delta_nnet_)) * scale; - if (param_delta > nnet_config.max_param_change) { - if (param_delta - param_delta != 0.0) { - KALDI_WARN << "Infinite parameter change, will not apply."; - SetZero(false, delta_nnet_); - } else { - scale *= nnet_config.max_param_change / param_delta; - KALDI_LOG << "Parameter change too big: " << param_delta << " > " - << "--max-param-change=" << nnet_config.max_param_change - << ", scaling by " - << nnet_config.max_param_change / param_delta; - } - } - } - AddNnet(*delta_nnet_, scale, nnet_); - ScaleNnet(nnet_config.momentum, delta_nnet_); - } + UpdateParamsWithMaxChange(); } @@ -169,6 +147,88 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg, } } +void NnetChainTrainer::UpdateParamsWithMaxChange() { + KALDI_ASSERT(delta_nnet_ != NULL); + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // computes scaling factors for per-component max-change + const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); + Vector<BaseFloat> scale_factors = Vector<BaseFloat>(num_updatable); + BaseFloat param_delta_squared = 0.0; + int32 num_max_change_per_component_applied_per_minibatch = 0; + BaseFloat min_scale = 1.0; + std::string component_name_with_min_scale; + BaseFloat max_change_with_min_scale; + int32 i = 0; + for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { + Component *comp = delta_nnet_->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + BaseFloat max_param_change_per_comp = uc->MaxChange(); + KALDI_ASSERT(max_param_change_per_comp >= 0.0); + BaseFloat dot_prod = uc->DotProduct(*uc); + if (max_param_change_per_comp != 0.0 && + std::sqrt(dot_prod) > max_param_change_per_comp) { + scale_factors(i) = max_param_change_per_comp / std::sqrt(dot_prod); + num_max_change_per_component_applied_[i]++; + num_max_change_per_component_applied_per_minibatch++; + KALDI_VLOG(2) << "Parameters in " << delta_nnet_->GetComponentName(c) + << " change too big: " << std::sqrt(dot_prod) << " > " + << "max-change=" << max_param_change_per_comp + << ", scaling by " << scale_factors(i); + } else { + scale_factors(i) = 1.0; + } + if (i == 0 || scale_factors(i) < min_scale) { + min_scale = scale_factors(i); + component_name_with_min_scale = delta_nnet_->GetComponentName(c); + max_change_with_min_scale = max_param_change_per_comp; + } + param_delta_squared += std::pow(scale_factors(i), 2.0) * dot_prod; + i++; + } + } + KALDI_ASSERT(i == scale_factors.Dim()); + BaseFloat param_delta = std::sqrt(param_delta_squared); + // computes the scale for global max-change (with momentum) + BaseFloat scale = (1.0 - nnet_config.momentum); + if (nnet_config.max_param_change != 0.0) { + param_delta *= scale; + if (param_delta > nnet_config.max_param_change) { + if (param_delta - param_delta != 0.0) { + KALDI_WARN << "Infinite parameter change, will not apply."; + SetZero(false, delta_nnet_); + } else { + scale *= nnet_config.max_param_change / param_delta; + num_max_change_global_applied_++; + } + } + } + 
if ((nnet_config.max_param_change != 0.0 && + param_delta > nnet_config.max_param_change && + param_delta - param_delta == 0.0) || min_scale < 1.0) { + std::ostringstream ostr; + if (min_scale < 1.0) + ostr << "Per-component max-change active on " + << num_max_change_per_component_applied_per_minibatch + << " / " << num_updatable << " Updatable Components. " + << "(smallest factor=" << min_scale << " on " + << component_name_with_min_scale + << " with max-change=" << max_change_with_min_scale <<"). "; + if (param_delta > nnet_config.max_param_change) + ostr << "Global max-change factor was " + << nnet_config.max_param_change / param_delta + << " with max-change=" << nnet_config.max_param_change << "."; + KALDI_LOG << ostr.str(); + } + // applies both of the max-change scalings all at once, component by component + // and updates parameters + scale_factors.Scale(scale); + AddNnetComponents(*delta_nnet_, scale_factors, scale, nnet_); + ScaleNnet(nnet_config.momentum, delta_nnet_); +} bool NnetChainTrainer::PrintTotalStats() const { unordered_map<std::string, ObjectiveFunctionInfo, StringHasher>::const_iterator @@ -183,6 +243,29 @@ bool NnetChainTrainer::PrintTotalStats() const { return ans; } +void NnetChainTrainer::PrintMaxChangeStats() const { + KALDI_ASSERT(delta_nnet_ != NULL); + int32 i = 0; + for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { + Component *comp = delta_nnet_->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied_[i] > 0) + KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) + << ", per-component max-change was enforced " + << (100.0 * num_max_change_per_component_applied_[i]) / + num_minibatches_processed_ << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied_ > 0) + KALDI_LOG << "The global max-change was enforced " + << (100.0 * num_max_change_global_applied_) / + num_minibatches_processed_ << " \% of the time."; +} NnetChainTrainer::~NnetChainTrainer() { if (opts_.nnet_config.write_cache != "") { diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index a4810fe16c6..4a3d84353d9 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -64,11 +64,20 @@ class NnetChainTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + ~NnetChainTrainer(); private: void ProcessOutputs(const NnetChainExample &eg, NnetComputer *computer); + // Applies per-component max-change and global max-change to all updatable + // components in *delta_nnet_, and uses *delta_nnet_ to update parameters + // in *nnet_. + void UpdateParamsWithMaxChange(); + const NnetChainTrainingOptions opts_; chain::DenominatorGraph den_graph_; @@ -85,6 +94,10 @@ class NnetChainTrainer { // So we store the objective functions per output layer. int32 num_minibatches_processed_; + // stats for max-change. 
+ std::vector<int32> num_max_change_per_component_applied_; + int32 num_max_change_global_applied_; + unordered_map<std::string, ObjectiveFunctionInfo, StringHasher> objf_info_; }; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 168a2a5350a..cfc28242156 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -183,7 +183,9 @@ bool Component::IsComputable(const MiscComputationInfo &misc_info, void UpdatableComponent::InitLearningRatesFromConfig(ConfigLine *cfl) { cfl->GetValue("learning-rate", &learning_rate_); cfl->GetValue("learning-rate-factor", &learning_rate_factor_); - if (learning_rate_ < 0.0 || learning_rate_factor_ < 0.0) + max_change_ = 0.0; + cfl->GetValue("max-change", &max_change_); + if (learning_rate_ < 0.0 || learning_rate_factor_ < 0.0 || max_change_ < 0.0) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); } @@ -210,6 +212,12 @@ void UpdatableComponent::ReadUpdatableCommon(std::istream &is, bool binary) { } else { is_gradient_ = false; } + if (token == "<MaxChange>") { + ReadBasicType(is, binary, &max_change_); + ReadToken(is, binary, &token); + } else { + max_change_ = 0.0; + } if (token == "<LearningRate>") { ReadBasicType(is, binary, &learning_rate_); } else { @@ -232,6 +240,10 @@ void UpdatableComponent::WriteUpdatableCommon(std::ostream &os, WriteToken(os, binary, "<IsGradient>"); WriteBasicType(os, binary, is_gradient_); } + if (max_change_ > 0.0) { + WriteToken(os, binary, "<MaxChange>"); + WriteBasicType(os, binary, max_change_); + } WriteToken(os, binary, "<LearningRate>"); WriteBasicType(os, binary, learning_rate_); } @@ -246,6 +258,8 @@ std::string UpdatableComponent::Info() const { stream << ", is-gradient=true"; if (learning_rate_factor_ != 1.0) stream << ", learning-rate-factor=" << learning_rate_factor_; + if (max_change_ > 0.0) + stream << ", max-change=" << max_change_; return stream.str(); } diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 164f9d056e7..e1391630c9f 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -373,7 +373,7 @@ class UpdatableComponent: public Component { UpdatableComponent(const UpdatableComponent &other): learning_rate_(other.learning_rate_), learning_rate_factor_(other.learning_rate_factor_), - is_gradient_(other.is_gradient_) { } + is_gradient_(other.is_gradient_), max_change_(other.max_change_) { } /// \brief Sets parameters to zero, and if treat_as_gradient is true, /// sets is_gradient_ to true and sets learning_rate_ to 1, ignoring @@ -381,7 +381,7 @@ virtual void SetZero(bool treat_as_gradient) = 0; UpdatableComponent(): learning_rate_(0.001), learning_rate_factor_(1.0), - is_gradient_(false) { } + is_gradient_(false), max_change_(0.0) { } virtual ~UpdatableComponent() { } @@ -408,6 +408,12 @@ class UpdatableComponent: public Component { /// a different value than x will returned. BaseFloat LearningRate() const { return learning_rate_; } + /// Gets per-component max-change value. Note: the components themselves do + /// not enforce the per-component max-change; it's enforced in class + /// NnetTrainer by querying the max-changes for each component. + /// See NnetTrainer::UpdateParamsWithMaxChange() in nnet3/nnet-training.cc. + BaseFloat MaxChange() const { return max_change_; } + virtual std::string Info() const; /// The following new virtual function returns the total dimension of @@ -446,6 +452,7 @@ ///< than as parameters. 
Its main effect is that we disable ///< any natural-gradient update and just compute the standard ///< gradient. + BaseFloat max_change_; ///< configuration value for imposing max-change private: const UpdatableComponent &operator = (const UpdatableComponent &other); // Disallow. diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 6940ba8302a..ba352af19be 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2671,9 +2671,10 @@ void NaturalGradientAffineComponent::Init( SetNaturalGradientConfigs(); if (max_change_per_sample > 0.0) KALDI_WARN << "You are setting a positive max_change_per_sample for " - << "NaturalGradientAffineComponent. But the per-component " - << "gradient clipping mechansim has been removed. Instead it's currently " - << "done at the whole model level."; + << "NaturalGradientAffineComponent. But it has been deprecated. " + << "Please use max_change for all updatable components instead " + << "to activate the per-component max change mechanism."; + KALDI_ASSERT(max_change_per_sample >= 0.0); max_change_per_sample_ = max_change_per_sample; is_gradient_ = false; // not configurable; there's no reason you'd want this update_count_ = 0.0; @@ -3366,9 +3367,9 @@ void NaturalGradientPerElementScaleComponent::Init( max_change_per_minibatch_ = max_change_per_minibatch; if (max_change_per_minibatch > 0.0) KALDI_WARN << "You are setting a positive max_change_per_minibatch for " - << "NaturalGradientPerElementScaleComponent. But the per-component " - << "gradient clipping mechansim has been removed. Instead it's currently " - << "done at the whole model level."; + << "NaturalGradientPerElementScaleComponent. But it has been deprecated. " + << "Please use max_change for all updatable components instead " + << "to activate the per-component max change mechanism."; } void NaturalGradientPerElementScaleComponent::Init( diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 037bc45013b..7251e4bc54b 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -32,16 +32,16 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, num_minibatches_processed_(0) { if (config.zero_component_stats) ZeroComponentStats(nnet); - if (config.momentum == 0.0 && config.max_param_change == 0.0) { - delta_nnet_= NULL; - } else { - KALDI_ASSERT(config.momentum >= 0.0 && - config.max_param_change >= 0.0); - delta_nnet_ = nnet_->Copy(); - bool is_gradient = false; // setting this to true would disable the - // natural-gradient updates. - SetZero(is_gradient, delta_nnet_); - } + KALDI_ASSERT(config.momentum >= 0.0 && + config.max_param_change >= 0.0); + delta_nnet_ = nnet_->Copy(); + bool is_gradient = false; // setting this to true would disable the + // natural-gradient updates. + SetZero(is_gradient, delta_nnet_); + const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); + num_max_change_per_component_applied_.resize(num_updatable, 0); + num_max_change_global_applied_ = 0; + if (config_.read_cache != "") { bool binary; try { @@ -65,8 +65,7 @@ void NnetTrainer::Train(const NnetExample &eg) { const NnetComputation *computation = compiler_.Compile(request); NnetComputer computer(config_.compute_config, *computation, - *nnet_, - (delta_nnet_ == NULL ? nnet_ : delta_nnet_)); + *nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.io); computer.Forward(); @@ -74,26 +73,7 @@ void NnetTrainer::Train(const NnetExample &eg) { this->ProcessOutputs(eg, &computer); computer.Backward(); - if (delta_nnet_ != NULL) { - BaseFloat scale = (1.0 - config_.momentum); - if (config_.max_param_change != 0.0) { - BaseFloat param_delta = - std::sqrt(DotProduct(*delta_nnet_, *delta_nnet_)) * scale; - if (param_delta > config_.max_param_change) { - if (param_delta - param_delta != 0.0) { - KALDI_WARN << "Infinite parameter change, will not apply."; - SetZero(false, delta_nnet_); - } else { - scale *= config_.max_param_change / param_delta; - KALDI_LOG << "Parameter change too big: " << param_delta << " > " - << "--max-param-change=" << config_.max_param_change - << ", scaling by " << config_.max_param_change / param_delta; - } - } - } - AddNnet(*delta_nnet_, scale, nnet_); - ScaleNnet(config_.momentum, delta_nnet_); - } + UpdateParamsWithMaxChange(); } void NnetTrainer::ProcessOutputs(const NnetExample &eg, @@ -118,6 +98,88 @@ void NnetTrainer::ProcessOutputs(const NnetExample &eg, } } +void NnetTrainer::UpdateParamsWithMaxChange() { + KALDI_ASSERT(delta_nnet_ != NULL); + // computes scaling factors for per-component max-change + const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); + Vector<BaseFloat> scale_factors = Vector<BaseFloat>(num_updatable); + BaseFloat param_delta_squared = 0.0; + int32 num_max_change_per_component_applied_per_minibatch = 0; + BaseFloat min_scale = 1.0; + std::string component_name_with_min_scale; + BaseFloat max_change_with_min_scale; + int32 i = 0; + for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { + Component *comp = delta_nnet_->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + BaseFloat max_param_change_per_comp = uc->MaxChange(); + KALDI_ASSERT(max_param_change_per_comp >= 0.0); + BaseFloat dot_prod = uc->DotProduct(*uc); + if (max_param_change_per_comp != 0.0 && + std::sqrt(dot_prod) > max_param_change_per_comp) { + scale_factors(i) = max_param_change_per_comp / std::sqrt(dot_prod); + num_max_change_per_component_applied_[i]++; + num_max_change_per_component_applied_per_minibatch++; + KALDI_VLOG(2) << "Parameters in " << delta_nnet_->GetComponentName(c) + << " change too big: " << std::sqrt(dot_prod) << " > " + << "max-change=" << max_param_change_per_comp + << ", scaling by " << scale_factors(i); + } else { + scale_factors(i) = 1.0; + } + if (i == 0 || scale_factors(i) < min_scale) { + min_scale = scale_factors(i); + component_name_with_min_scale = delta_nnet_->GetComponentName(c); + max_change_with_min_scale = max_param_change_per_comp; + } + param_delta_squared += std::pow(scale_factors(i), 2.0) * dot_prod; + i++; + } + } + KALDI_ASSERT(i == scale_factors.Dim()); + BaseFloat param_delta = std::sqrt(param_delta_squared); + // computes the scale for global max-change (with momentum) + BaseFloat scale = (1.0 - config_.momentum); + if (config_.max_param_change != 0.0) { + param_delta *= scale; + if (param_delta > config_.max_param_change) { + if (param_delta - param_delta != 0.0) { + KALDI_WARN << "Infinite parameter change, will not apply."; + SetZero(false, delta_nnet_); + } else { + scale *= config_.max_param_change / param_delta; + num_max_change_global_applied_++; + } + } + } + if ((config_.max_param_change != 0.0 && + param_delta > config_.max_param_change && + param_delta 
- param_delta == 0.0) || min_scale < 1.0) { + std::ostringstream ostr; + if (min_scale < 1.0) + ostr << "Per-component max-change active on " + << num_max_change_per_component_applied_per_minibatch + << " / " << num_updatable << " Updatable Components. " + << "(smallest factor=" << min_scale << " on " + << component_name_with_min_scale + << " with max-change=" << max_change_with_min_scale <<"). "; + if (param_delta > config_.max_param_change) + ostr << "Global max-change factor was " + << config_.max_param_change / param_delta + << " with max-change=" << config_.max_param_change << "."; + KALDI_LOG << ostr.str(); + } + // applies both of the max-change scalings all at once, component by component + // and updates parameters + scale_factors.Scale(scale); + AddNnetComponents(*delta_nnet_, scale_factors, scale, nnet_); + ScaleNnet(config_.momentum, delta_nnet_); +} + bool NnetTrainer::PrintTotalStats() const { unordered_map<std::string, ObjectiveFunctionInfo, StringHasher>::const_iterator iter = objf_info_.begin(), @@ -128,9 +190,34 @@ bool NnetTrainer::PrintTotalStats() const { const ObjectiveFunctionInfo &info = iter->second; ans = ans || info.PrintTotalStats(name); } + PrintMaxChangeStats(); return ans; } +void NnetTrainer::PrintMaxChangeStats() const { + KALDI_ASSERT(delta_nnet_ != NULL); + int32 i = 0; + for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { + Component *comp = delta_nnet_->GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied_[i] > 0) + KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) + << ", per-component max-change was enforced " + << (100.0 * num_max_change_per_component_applied_[i]) / + num_minibatches_processed_ << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied_ > 0) + KALDI_LOG << "The global max-change was enforced " + << (100.0 * num_max_change_global_applied_) / + num_minibatches_processed_ << " \% of the time."; +} + void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index 2900edc1c13..70c90267c66 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -151,11 +151,20 @@ class NnetTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + ~NnetTrainer(); private: void ProcessOutputs(const NnetExample &eg, NnetComputer *computer); + // Applies per-component max-change and global max-change to all updatable + // components in *delta_nnet_, and uses *delta_nnet_ to update parameters + // in *nnet_. + void UpdateParamsWithMaxChange(); + const NnetTrainerOptions config_; Nnet *nnet_; Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != @@ -170,6 +179,10 @@ class NnetTrainer { // So we store the objective functions per output layer. int32 num_minibatches_processed_; + // stats for max-change. 
+ std::vector<int32> num_max_change_per_component_applied_; + int32 num_max_change_global_applied_; + unordered_map<std::string, ObjectiveFunctionInfo, StringHasher> objf_info_; }; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 955e200d072..d65193d9a54 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -351,6 +351,33 @@ void ScaleNnet(BaseFloat scale, Nnet *nnet) { } } +void AddNnetComponents(const Nnet &src, const Vector<BaseFloat> &alphas, + BaseFloat scale, Nnet *dest) { + if (src.NumComponents() != dest->NumComponents()) + KALDI_ERR << "Trying to add incompatible nnets."; + int32 i = 0; + for (int32 c = 0; c < src.NumComponents(); c++) { + const Component *src_comp = src.GetComponent(c); + Component *dest_comp = dest->GetComponent(c); + if (src_comp->Properties() & kUpdatableComponent) { + // For now all updatable components inherit from class UpdatableComponent. + // If that changes in future, we will change this code. + const UpdatableComponent *src_uc = + dynamic_cast<const UpdatableComponent*>(src_comp); + UpdatableComponent *dest_uc = + dynamic_cast<UpdatableComponent*>(dest_comp); + if (src_uc == NULL || dest_uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + "UpdatableComponent; change this code."; + KALDI_ASSERT(i < alphas.Dim()); + dest_uc->Add(alphas(i++), *src_uc); + } else { // add stored stats + dest_comp->Add(scale, *src_comp); + } + } + KALDI_ASSERT(i == alphas.Dim()); +} + void AddNnet(const Nnet &src, BaseFloat alpha, Nnet *dest) { if (src.NumComponents() != dest->NumComponents()) KALDI_ERR << "Trying to add incompatible nnets."; diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 9606bd5d5b7..1e0dcefd703 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -145,6 +145,12 @@ void ScaleNnetComponents(const Vector<BaseFloat> &scales, /// stored stats). void AddNnet(const Nnet &src, BaseFloat alpha, Nnet *dest); +/// Does *dest += alpha * src for updatable components (affects nnet parameters), +/// and *dest += scale * src for other components (affects stored stats). +/// Here, alphas is a vector of size equal to the number of updatable components. +void AddNnetComponents(const Nnet &src, const Vector<BaseFloat> &alphas, + BaseFloat scale, Nnet *dest); + /// Returns the total of the number of parameters in the updatable components of /// the nnet. int32 NumParameters(const Nnet &src);
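
Editor's note (illustrative, not part of the patch): UpdateParamsWithMaxChange() above applies two limits before adding *delta_nnet_ to the model. First, each updatable component's parameter delta is scaled so that its 2-norm does not exceed that component's max-change (a value of 0 disables the check). Second, the combined delta, after the (1 - momentum) factor, is scaled again if its overall 2-norm exceeds --max-param-change. The sketch below mirrors only that scaling arithmetic in plain Python/NumPy; the function name combine_max_change and its arguments are hypothetical and do not correspond to any Kaldi API, and the infinity check and the enforcement statistics are omitted.

# Illustrative sketch (hypothetical names, not part of the patch or of Kaldi):
# mirrors the per-component + global max-change scaling of UpdateParamsWithMaxChange().
import numpy as np

def combine_max_change(deltas, per_comp_max_change, global_max_change, momentum=0.0):
    # deltas: list of per-component parameter-delta vectors
    # per_comp_max_change: per-component limits; 0.0 disables the per-component check
    # returns the effective scale factor applied to each component's delta
    scale_factors = []
    param_delta_squared = 0.0
    for delta, max_change in zip(deltas, per_comp_max_change):
        norm = np.sqrt(np.dot(delta, delta))
        # per-component limit: shrink this component's delta so its norm <= max_change
        factor = max_change / norm if (max_change != 0.0 and norm > max_change) else 1.0
        scale_factors.append(factor)
        param_delta_squared += (factor * norm) ** 2
    # global limit, applied on top of the momentum factor (1 - momentum)
    scale = 1.0 - momentum
    param_delta = np.sqrt(param_delta_squared) * scale
    if global_max_change != 0.0 and param_delta > global_max_change:
        scale *= global_max_change / param_delta
    # the update actually added to the model for component i is
    # (scale * scale_factors[i]) * deltas[i], cf. AddNnetComponents()
    return [scale * f for f in scale_factors]

# Example: component 2 (delta norm 2.0) is halved by its max-change=1.0; the combined
# delta (norm ~1.12) then exceeds the global max-param-change=1.0, so both components
# are scaled by a further ~0.89.
print(combine_max_change([np.array([0.3, 0.4]), np.array([1.2, 1.6])],
                         per_comp_max_change=[0.75, 1.0], global_max_change=1.0))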