diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 9b9ce4a54ad..376ea291a28 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -10,6 +10,8 @@ def GetSumDescriptor(inputs): sum_descriptors = inputs + if len(inputs) == 1: + return inputs while len(sum_descriptors) != 1: cur_sum_descriptors = [] pair = [] @@ -52,8 +54,9 @@ def AddNoOpLayer(config_lines, name, input): components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) - return {'descriptor': '{0}_noop'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0} def AddLdaLayer(config_lines, name, input, lda_file): return AddFixedAffineLayer(config_lines, name, input, lda_file) @@ -65,8 +68,9 @@ def AddFixedAffineLayer(config_lines, name, input, matrix_file): components.append('component name={0}_fixaffine type=FixedAffineComponent matrix={1}'.format(name, matrix_file)) component_nodes.append('component-node name={0}_fixaffine component={0}_fixaffine input={1}'.format(name, input['descriptor'])) - return {'descriptor': '{0}_fixaffine'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_fixaffine'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0} def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): @@ -77,8 +81,9 @@ def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) - return {'descriptor' : '{0}_block_affine'.format(name), - 'dimension' : output_dim} + return {'output' : {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim}, + 'num_learnable_params' : (input['dimension']+1) * output_dim} def AddPermuteLayer(config_lines, name, input, column_map): components = config_lines['components'] @@ -87,8 +92,9 @@ def AddPermuteLayer(config_lines, name, input, column_map): components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) - return {'descriptor': '{0}_permute'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0 } def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] @@ -97,8 +103,9 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) - return {'descriptor': '{0}_affine'.format(name), - 'dimension': output_dim} + return {'output' : 
{'descriptor': '{0}_affine'.format(name), + 'dimension': output_dim}, + 'num_learnable_params' : (input['dimension']+1) * output_dim } def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): components = config_lines['components'] @@ -113,9 +120,9 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name)) - - return {'descriptor': '{0}_renorm'.format(name), - 'dimension': output_dim} + return {'output' : {'descriptor': '{0}_renorm'.format(name), + 'dimension': output_dim}, + 'num_learnable_params' : input['dimension'] * output_dim } def AddAffPnormLayer(config_lines, name, input, pnorm_input_dim, pnorm_output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0): components = config_lines['components'] @@ -129,8 +136,9 @@ def AddAffPnormLayer(config_lines, name, input, pnorm_input_dim, pnorm_output_di component_nodes.append("component-node name={0}_pnorm component={0}_pnorm input={0}_affine".format(name)) component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_pnorm".format(name)) - return {'descriptor': '{0}_renorm'.format(name), - 'dimension': pnorm_output_dim} + return {'output' : {'descriptor': '{0}_renorm'.format(name), + 'dimension': pnorm_output_dim}, + 'num_learnable_params' : input['dimension'] * pnorm_input_dim } def AddConvolutionLayer(config_lines, name, input, input_x_dim, input_y_dim, input_z_dim, @@ -140,7 +148,7 @@ def AddConvolutionLayer(config_lines, name, input, param_stddev = None, bias_stddev = None, filter_bias_file = None, is_updatable = True): - assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim ) components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -164,10 +172,11 @@ def AddConvolutionLayer(config_lines, name, input, num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) output_dim = num_x_steps * num_y_steps * num_filters; - return {'descriptor': '{0}_conv_t'.format(name), - 'dimension': output_dim, - '3d-dim': [num_x_steps, num_y_steps, num_filters], - 'vectorization': 'zyx'} + return {'output' : {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_x_steps, num_y_steps, num_filters], + 'vectorization': 'zyx'}, + 'num_learnable_params' : filt_x_dim * filt_y_dim * input_z_dim } # The Maxpooling component assumes input vectorizations of type zyx def AddMaxpoolingLayer(config_lines, name, input, @@ -202,11 +211,11 @@ def AddMaxpoolingLayer(config_lines, name, input, num_pools_z = 1 + (input_z_dim - pool_z_size) / pool_z_step; output_dim = num_pools_x * num_pools_y * num_pools_z; - return {'descriptor': '{0}_maxp_t'.format(name), - 'dimension': output_dim, - '3d-dim': [num_pools_x, num_pools_y, num_pools_z], - 'vectorization': 'zyx'} - + return {'output' : {'descriptor': '{0}_maxp_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_pools_x, num_pools_y, num_pools_z], + 'vectorization': 'zyx'}, + 'num_learnable_params' : 0 } def 
AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -215,8 +224,9 @@ def AddSoftmaxLayer(config_lines, name, input): components.append("component name={0}_log_softmax type=LogSoftmaxComponent dim={1}".format(name, input['dimension'])) component_nodes.append("component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])) - return {'descriptor': '{0}_log_softmax'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_log_softmax'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0 } def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None): @@ -227,8 +237,9 @@ def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None): self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' components.append("component name={0}_sigmoid type=SigmoidComponent dim={1}".format(name, input['dimension'], self_repair_string)) component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor'])) - return {'descriptor': '{0}_sigmoid'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_sigmoid'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0} def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"): components = config_lines['components'] @@ -254,29 +265,38 @@ def AddFinalLayer(config_lines, input, output_dim, components = config_lines['components'] component_nodes = config_lines['component-nodes'] + num_learnable_params = 0 if name_affix is not None: final_node_prefix = 'Final-' + str(name_affix) else: final_node_prefix = 'Final' - prev_layer_output = AddAffineLayer(config_lines, + prev_layer = AddAffineLayer(config_lines, final_node_prefix , input, output_dim, ng_affine_options) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + if include_log_softmax: if use_presoftmax_prior_scale : components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) component_nodes.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix, prev_layer_output['descriptor'])) prev_layer_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix) - prev_layer_output = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] elif add_final_sigmoid: # Useful when you need the final outputs to be probabilities # between 0 and 1. 
# Usually used with an objective-type such as "quadratic" - prev_layer_output = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] # we use the same name_affix as a prefix in for affine/scale nodes but as a # suffix for output node AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) + return num_learnable_params def AddLstmLayer(config_lines, name, input, cell_dim, @@ -289,7 +309,9 @@ def AddLstmLayer(config_lines, lstm_delay = -1, self_repair_scale_nonlinearity = None, self_repair_scale_clipgradient = None): + assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) + num_learnable_params = 0 components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -319,21 +341,29 @@ def AddLstmLayer(config_lines, # Parameter Definitions W*(* replaced by - to have valid names) components.append("# Input gate control : W_i* matrices") components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim components.append("# note : the cell outputs pass through a diagonal matrix") components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + num_learnable_params += cell_dim components.append("# Forget gate control : W_f* matrices") components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim + components.append("# note : the cell outputs pass through a diagonal matrix") components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + num_learnable_params += cell_dim components.append("# Output gate control : W_o* matrices") components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim components.append("# note : the cell outputs pass through a diagonal matrix") components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + num_learnable_params += cell_dim components.append("# Cell input matrices : W_c* matrices") components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim components.append("# Defining the non-linearities") @@ -386,6 +416,7 @@ def AddLstmLayer(config_lines, if (add_recurrent_projection and add_non_recurrent_projection): components.append("# projection matrices : Wrm and Wpm") components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} 
output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) + num_learnable_params += (cell_dim + 1) * (non_recurrent_projection_dim + recurrent_projection_dim) components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) component_nodes.append("# r_t and p_t") component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) @@ -397,6 +428,7 @@ def AddLstmLayer(config_lines, elif add_recurrent_projection: components.append("# projection matrices : Wrm") components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) + num_learnable_params += (cell_dim + 1) * recurrent_projection_dim components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) component_nodes.append("# r_t") component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) @@ -410,10 +442,9 @@ def AddLstmLayer(config_lines, output_descriptor = '{0}_r_t'.format(name) output_dim = cell_dim - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } + return {'output' : {'descriptor': output_descriptor, + 'dimension':output_dim}, + 'num_learnable_params' : num_learnable_params} def AddBLstmLayer(config_lines, name, input, cell_dim, @@ -427,25 +458,225 @@ def AddBLstmLayer(config_lines, self_repair_scale_nonlinearity = None, self_repair_scale_clipgradient = None): assert(len(lstm_delay) == 2 and lstm_delay[0] < 0 and lstm_delay[1] > 0) - output_forward = AddLstmLayer(config_lines, "{0}_forward".format(name), input, cell_dim, + num_learnable_params = 0 + prev_layer = AddLstmLayer(config_lines, "{0}_forward".format(name), input, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, lstm_delay = lstm_delay[0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) - output_backward = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, + output_forward = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + prev_layer = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, lstm_delay = lstm_delay[1], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + output_backward = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + output_descriptor = 'Append({0}, {1})'.format(output_forward['descriptor'], output_backward['descriptor']) output_dim = output_forward['dimension'] + output_backward['dimension'] - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } - + return {'output' : {'descriptor': output_descriptor, + 'dimension':output_dim}, + 'num_learnable_params' : num_learnable_params} + +def 
AddTdnnLayer(config_lines, name, input, splice_indexes, + nonlin_type, nonlin_input_dim, nonlin_output_dim, + subset_dim = 0, ng_affine_options = " bias-stddev=0 ", + self_repair_scale = 0, norm_target_rms = 1.0): + + # prepare the layer input + try: + zero_index = splice_indexes.index(0) + except ValueError: + zero_index = None + + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = input['descriptor'] + subset_output = input + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name={0}_input input-node={1} dim-offset={2} dim={3}".format(name, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : '{0}_input'.format(name), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes)): + if j == zero_index: + appended_descriptors.append(input['descriptor']) + appended_dimension += input['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + + # add the affine layer + if nonlin_type == "relu": + prev_layer = AddAffRelNormLayer(config_lines, name, + prev_layer_output, + nonlin_output_dim, + ng_affine_options = ng_affine_options, + self_repair_scale = self_repair_scale, + norm_target_rms = norm_target_rms) + prev_layer_output = prev_layer['output'] + elif nonlin_type == "pnorm": + prev_layer = AddAffPnormLayer(config_lines, name, + prev_layer_output, + nonlin_input_dim, nonlin_output_dim, + ng_affine_options = ng_affine_options, + norm_target_rms = norm_target_rms) + else: + raise Exception("Unknown nonlinearity type") + + return prev_layer + + + + +# Convenience functions + +def SpliceInput(input, splice_indexes): + appended_descriptors = [] + appended_dimension = 0 + + try: + zero_index = splice_indexes.index(0) + except ValueError: + zero_index = None + + for j in range(len(splice_indexes)): + if j == zero_index: + appended_descriptors.append(input['descriptor']) + appended_dimension += input['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(input['descriptor'], splice_indexes[j])) + appended_dimension += input['dimension'] + + return {'output' : {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension}, + 'num_learnable_params' : 0} + +# this model does not have add_final_sigmoid and objective_type options +# as this is specific to chain training and we don't have recipes +# with chain trianing + raw training +def AddFinalLayersWithXentSeperateForwardAffineRegularizer(config_lines, + input, num_targets, + nonlin_type, nonlin_input_dim, nonlin_output_dim, + use_presoftmax_prior_scale, + prior_scale_file, + include_log_softmax, + self_repair_scale, + xent_regularize, + final_layer_normalize_target, + ng_affine_options, + label_delay = None): + + num_learnable_params = 0 + num_learnable_params_xent = 0 + if nonlin_type == "relu" : + prev_layer_chain = AddAffRelNormLayer(config_lines, "Pre_final_chain", + input, nonlin_output_dim, + ng_affine_options = ng_affine_options, + self_repair_scale = self_repair_scale, + norm_target_rms = 
final_layer_normalize_target) + prev_layer_xent = AddAffRelNormLayer(config_lines, "Pre_final_xent", + input, nonlin_output_dim, + ng_affine_options = ng_affine_options, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + elif nonlin_type == "pnorm" : + prev_layer_chain = AddAffPnormLayer(config_lines, "Pre_final_chain", + input, nonlin_input_dim, nonlin_output_dim, + ng_affine_options = ng_affine_options, + norm_target_rms = final_layer_normalize_target) + + prev_layer_xent = AddAffPnormLayer(config_lines, "Pre_final_xent", + input, nonlin_input_dim, nonlin_output_dim, + ng_affine_options = ng_affine_options, + norm_target_rms = final_layer_normalize_target) + else: + raise Exception("Unknown nonlinearity type") + + prev_layer_output_chain = prev_layer_chain['output'] + prev_layer_output_xent = prev_layer_xent['output'] + + num_learnable_params += prev_layer_chain['num_learnable_params'] + num_learnable_params_xent += prev_layer_xent['num_learnable_params'] + + # we do not add the ng_affine_options here as Final layer has different defaults + num_learnable_params += AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + label_delay = label_delay) + + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + num_learnable_params_xent += AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent', + label_delay = label_delay) + + return [num_learnable_params, num_learnable_params_xent] + +def AddFinalLayerWithXentRegularizer(config_lines, input, num_targets, + use_presoftmax_prior_scale, + prior_scale_file, + include_log_softmax, + self_repair_scale, + xent_regularize, + add_final_sigmoid, + objective_type, + label_delay = None): + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. + # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. + num_learnable_params = AddFinalLayer(config_lines, input, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type, + label_delay = label_delay) + + if xent_regularize != 0.0: + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + num_learnable_params_xent = AddFinalLayer(config_lines, input, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format(0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent', + label_delay = label_delay) + + return [num_learnable_params, num_learnable_params_xent] diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..3387beb093a 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -14,8 +14,31 @@ def GetArgs(): # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", + parser = argparse.ArgumentParser(description="Writes config files and variables for LSTMs creation and training," + " it also supports adding TDNN layers before, in between and after LSTMs." + " This is done by interpreting --splice-indexes, --num-lstm-layers and --lstm-start-layer-index.\n" + " When the splicing indexes at a layer corresponding to an LSTM is not [0] a TDNN layer is added before it.\n" + " e.g.\n --splice-indexes '-2,-1,0,1,2 0 0 0' --num-lstm-layers 3 --lstm-start-layer-index 0 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " and 3 lstm layers.\n" + " --splice-index '-2,-1,0,1,2 -3,0,3 -3,0,3 0' --num-lstm-layers 3 --lstm-start-layer-index 0 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " and an LSTM layer\n" + " --splice-index '-2,-1,0,1,2 -3,0,3 -3,0,3 0 0' --num-lstm-layers 3 --lstm-start-layer-index 1 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " TDNN layer with splicing -3,0,3 \n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " LSTM layer,\n" + " and an LSTM layer\n" + " --splice-index '-2,-1,0,1,2 -3,0,3 -3,0,3 0 0 -3,0,3 -3,0,3 -3,0,3' --num-lstm-layers 3 --lstm-start-layer-index 1 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " TDNN layer with splicing -3,0,3 \n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " and an LSTM layer\n" + , epilog="See steps/nnet3/lstm/train.sh for example.") # Only one of these arguments can be specified, and one of them has to @@ -48,12 +71,20 @@ def GetArgs(): help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", 
type=float,
+                        help="RMS target for the final layer (set to a value <1 if the final layer learns too fast).",
+                        default=1.0)
     parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction,
                         help="add the final softmax layer ", default=True, choices = ["false", "true"])

     # LSTM options
     parser.add_argument("--num-lstm-layers", type=int,
                         help="Number of LSTM layers to be stacked", default=1)
+    parser.add_argument("--lstm-start-layer-index", type=int,
+                        help="Hidden-layer index at which to start the LSTM layers (1 means right after "
+                        "the LDA layer; earlier layers are TDNN/affine layers). Must be >= 1.", default=1)
     parser.add_argument("--cell-dim", type=int,
                         help="dimension of lstm-cell")
     parser.add_argument("--recurrent-projection-dim", type=int,
@@ -123,18 +154,23 @@ def CheckArgs(args):
         raise Exception("ivector-dim has to be non-negative")

     if (args.num_lstm_layers < 1):
-        sys.exit("--num-lstm-layers has to be a positive integer")
+        raise Exception("--num-lstm-layers has to be a positive integer")
+    if (args.lstm_start_layer_index < 1):
+        raise Exception("--lstm-start-layer-index has to be a positive number.")
+    elif (args.lstm_start_layer_index > 1):
+        warnings.warn("TDNN/Affine layers are going to be stacked before the LSTM layers; "
+                      "shrinkage is not supported in this scenario.")
+
     if (args.clipping_threshold < 0):
-        sys.exit("--clipping-threshold has to be a non-negative")
+        raise Exception("--clipping-threshold has to be non-negative")

     if args.lstm_delay is None:
         args.lstm_delay = [[-1]] * args.num_lstm_layers
     else:
         try:
             args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip())
         except ValueError:
-            sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay))
+            raise Exception("--lstm-delay has an incorrectly formatted value. Provided value is '{0}'".format(args.lstm_delay))
         if len(args.lstm_delay) != args.num_lstm_layers:
-            sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers")
+            raise Exception("--lstm-delay: Number of delays provided has to match --num-lstm-layers")

     return args

@@ -146,38 +182,34 @@ def PrintConfig(file_name, config_lines):
     f.close()

 def ParseSpliceString(splice_indexes, label_delay=None):
-    ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ]
-    split1 = splice_indexes.split(" ");  # we already checked the string is nonempty.
-    if len(split1) < 1:
-        splice_indexes = "0"
-
-    left_context=0
-    right_context=0
+    splice_array = []
+    left_context = 0
+    right_context = 0

     if label_delay is not None:
         left_context = -label_delay
         right_context = label_delay

-    splice_array = []
+    split1 = splice_indexes.split();  # we already checked the string is nonempty.
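For reference, a minimal standalone sketch of the splice parsing implemented by the rewritten ParseSpliceString above (the helper name and example string below are illustrative only, and label_delay handling is omitted):

def parse_splice_string(splice_indexes):
    # e.g. splice_indexes = "-2,-1,0,1,2 -3,0,3 0"
    splice_array = []
    left_context = 0
    right_context = 0
    for token in splice_indexes.split():
        indexes = [int(x) for x in token.split(",")]
        if indexes != sorted(indexes):
            raise Exception("elements of splice-indexes must be sorted: " + splice_indexes)
        left_context += -indexes[0]    # the most negative offset widens the left context
        right_context += indexes[-1]   # the largest positive offset widens the right context
        splice_array.append(indexes)
    return {'left_context': max(0, left_context),
            'right_context': max(0, right_context),
            'splice_indexes': splice_array,
            'num_hidden_layers': len(splice_array)}

# parse_splice_string("-2,-1,0,1,2 -3,0,3 0") returns left_context = 5, right_context = 5
# and three per-layer splice lists, i.e. num_hidden_layers = 3.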
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) try: - for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].strip().split(",")) - print(indexes) - if len(indexes) < 1: - raise ValueError("invalid --splice-indexes argument, too-short element: " - + splice_indexes) - - if (i > 0) and ((len(indexes) != 1) or (indexes[0] != 0)): - raise ValueError("elements of --splice-indexes splicing is only allowed initial layer.") - - if not indexes == sorted(indexes): - raise ValueError("elements of --splice-indexes must be sorted: " - + splice_indexes) - left_context += -indexes[0] - right_context += indexes[-1] - splice_array.append(indexes) + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) except ValueError as e: - raise ValueError("invalid --splice-indexes argument " + splice_indexes + str(e)) - + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) left_context = max(0, left_context) right_context = max(0, right_context) @@ -209,18 +241,30 @@ def ParseLstmDelayString(lstm_delay): def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, - splice_indexes, lstm_delay, cell_dim, hidden_dim, + splice_indexes_string, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, - num_lstm_layers, num_hidden_layers, + num_lstm_layers, lstm_start_layer_index, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, label_delay, include_log_softmax, xent_regularize, + xent_separate_forward_affine, final_layer_normalize_target, self_repair_scale_nonlinearity, self_repair_scale_clipgradient): + num_learnable_params = 0 + num_learnable_params_xent = 0 # number of parameters in the xent branch config_lines = {'components':[], 'component-nodes':[]} - config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(config_dir, + splice_indexes_string, + label_delay, + num_lstm_layers, + lstm_start_layer_index) + + + prev_layer = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = prev_layer['output'] # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -229,95 +273,239 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, nodes.AddOutputLayer(init_config_lines, prev_layer_output) config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - - for i in range(num_lstm_layers): - if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer - prev_layer_output = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = 
self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient)
-        else: # add a uni-directional LSTM layer
-            prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1),
-                                                   prev_layer_output, cell_dim,
-                                                   recurrent_projection_dim, non_recurrent_projection_dim,
-                                                   clipping_threshold, norm_based_clipping,
-                                                   ng_per_element_scale_options, ng_affine_options,
-                                                   lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient)
-        # make the intermediate config file for layerwise discriminative
-        # training
-        nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax)
-
-
-        if xent_regularize != 0.0:
-            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
-                                include_log_softmax = True, label_delay = label_delay,
-                                name_affix = 'xent')
-
-        config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
+    prev_layer = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat')
+    prev_layer_output = prev_layer['output']
+
+    # we don't want to add a simple affine layer after the input layer as in TDNN-only configs,
+    # so we reduce the number of hidden layers and splice_indexes
+    splice_indexes = splice_indexes[1:]
+    num_hidden_layers = num_hidden_layers - 1
+    lstm_start_layer_index -= 1
+
+    num_layers_added = 0
+    # stacking the TDNN/affine layers before the LSTM layers
+    for i in range(lstm_start_layer_index):
+        num_layers_added += 1
+        # we will just support Relu non-linearities
+        if splice_indexes[i] == [0]:
+            # add a normal affine layer
+            prev_layer = nodes.AddAffRelNormLayer(config_lines, 'Affine{0}'.format(num_layers_added),
+                                                  prev_layer_output,
+                                                  hidden_dim,
+                                                  self_repair_scale = self_repair_scale_nonlinearity,
+                                                  norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+            prev_layer_output = prev_layer['output']
+            num_learnable_params += prev_layer['num_learnable_params']
+        else:
+            prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(num_layers_added),
+                                            prev_layer_output,
+                                            splice_indexes = splice_indexes[i],
+                                            nonlin_type = 'relu',
+                                            nonlin_input_dim = hidden_dim,
+                                            nonlin_output_dim = hidden_dim,
+                                            self_repair_scale = self_repair_scale_nonlinearity,
+                                            norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+            prev_layer_output = prev_layer['output']
+            num_learnable_params += prev_layer['num_learnable_params']
+
+        # a final layer is added after each new layer as we are generating
+        # configs for layer-wise discriminative training
+        num_params_final, num_params_final_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines,
+                                                      input = prev_layer_output,
+                                                      num_targets = num_targets,
+                                                      use_presoftmax_prior_scale = False,
+                                                      prior_scale_file = None,
+                                                      include_log_softmax = include_log_softmax,
+                                                      self_repair_scale = self_repair_scale_nonlinearity,
+                                                      xent_regularize = xent_regularize,
+                                                      label_delay = label_delay,
+                                                      add_final_sigmoid = False,
+                                                      objective_type='linear')
+
+        config_files['{0}/layer{1}.config'.format(config_dir, num_layers_added)] = config_lines
         config_lines = {'components':[], 'component-nodes':[]}

-    for i in range(num_lstm_layers, num_hidden_layers):
-        prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1),
-                                                     prev_layer_output, hidden_dim,
-                                                     ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity)
-        # make the
intermediate config file for layerwise discriminative - # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - include_log_softmax = True, label_delay = label_delay, - name_affix = 'xent') - config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + # stacking the LSTM layers + for i in range(lstm_start_layer_index, lstm_start_layer_index + num_lstm_layers): + num_layers_added += 1 + if splice_indexes[i] != [0]: + # there is a non-zero splice-indexes at this layer index so we pre-pend a tdnn layer + warnings.warn("Adding a TDNN layer before LSTM at layer {l}" + " as splice-indexes are {s} and not 0".format(l=i, s=','.join(map(lambda x: str(x), splice_indexes[i])))) + + prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(num_layers_added), + prev_layer_output, + splice_indexes = splice_indexes[i], + nonlin_type = 'relu', + nonlin_input_dim = prev_layer_output['dimension'], + nonlin_output_dim = prev_layer_output['dimension'], + self_repair_scale = self_repair_scale_nonlinearity, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + lstm_index = i - lstm_start_layer_index + if len(lstm_delay[lstm_index]) == 2: # add a bi-directional LSTM layer + prev_layer = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(num_layers_added), + prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[lstm_index], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = self_repair_scale_clipgradient) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + else: # add a uni-directional LSTM layer + prev_layer = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(num_layers_added), + prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[lstm_index][0], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = self_repair_scale_clipgradient) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + num_params_final, num_params_final_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines, + input = prev_layer_output, + num_targets = num_targets, + use_presoftmax_prior_scale = False, + prior_scale_file = None, + include_log_softmax = include_log_softmax, + self_repair_scale = self_repair_scale_nonlinearity, + xent_regularize = xent_regularize, + label_delay = label_delay, + objective_type = 'linear', + add_final_sigmoid = False) + + + config_files['{0}/layer{1}.config'.format(config_dir, num_layers_added)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + # stacking TDNN/affine layers after the LSTM layers + for i in range(lstm_start_layer_index + num_lstm_layers, num_hidden_layers): + 
num_layers_added += 1
+        if xent_separate_forward_affine and i == num_hidden_layers - 1:
+            # xent_separate_forward_affine is only honored when adding the final hidden layer;
+            # this is the final layer, so assert that its splice index is [0]
+            assert(splice_indexes[i] == [0])
+            if xent_regularize == 0.0:
+                raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero")
+
+            # we use named arguments as we do not want argument offset errors
+            num_params_final, num_params_final_xent = nodes.AddFinalLayersWithXentSeperateForwardAffineRegularizer(config_lines,
+                                                          input = prev_layer_output,
+                                                          num_targets = num_targets,
+                                                          nonlin_type = 'relu',
+                                                          nonlin_input_dim = hidden_dim,
+                                                          nonlin_output_dim = hidden_dim,
+                                                          use_presoftmax_prior_scale = False,
+                                                          prior_scale_file = None,
+                                                          include_log_softmax = include_log_softmax,
+                                                          self_repair_scale = self_repair_scale_nonlinearity,
+                                                          xent_regularize = xent_regularize,
+                                                          label_delay = label_delay,
+                                                          final_layer_normalize_target = final_layer_normalize_target)
+        else:
+            # we will just support Relu non-linearities
+            if splice_indexes[i] == [0]:
+                # add a normal affine layer
+                prev_layer = nodes.AddAffRelNormLayer(config_lines, 'Affine{0}'.format(num_layers_added),
+                                                      prev_layer_output,
+                                                      hidden_dim,
+                                                      self_repair_scale = self_repair_scale_nonlinearity,
+                                                      norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                prev_layer_output = prev_layer['output']
+                num_learnable_params += prev_layer['num_learnable_params']
+            else:
+                prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(num_layers_added),
+                                                prev_layer_output,
+                                                splice_indexes = splice_indexes[i],
+                                                nonlin_type = 'relu',
+                                                nonlin_input_dim = hidden_dim,
+                                                nonlin_output_dim = hidden_dim,
+                                                self_repair_scale = self_repair_scale_nonlinearity,
+                                                norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                prev_layer_output = prev_layer['output']
+                num_learnable_params += prev_layer['num_learnable_params']
+
+            # a final layer is added after each new layer as we are generating
+            # configs for layer-wise discriminative training
+            num_params_final, num_params_final_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines,
+                                                          input = prev_layer_output,
+                                                          num_targets = num_targets,
+                                                          use_presoftmax_prior_scale = False,
+                                                          prior_scale_file = None,
+                                                          include_log_softmax = include_log_softmax,
+                                                          self_repair_scale = self_repair_scale_nonlinearity,
+                                                          xent_regularize = xent_regularize,
+                                                          label_delay = label_delay,
+                                                          add_final_sigmoid = False,
+                                                          objective_type='linear')
+
+        config_files['{0}/layer{1}.config'.format(config_dir, num_layers_added)] = config_lines
         config_lines = {'components':[], 'component-nodes':[]}

+    num_learnable_params += num_params_final
+    num_learnable_params_xent = num_params_final_xent
+
     # printing out the configs
     # init.config used to train lda-mllt train
     for key in config_files.keys():
         PrintConfig(key, config_files[key])

+    # write the files used by other scripts like steps/nnet3/get_egs.sh
+    f = open(config_dir + "/vars", "w")
+    print('model_left_context=' + str(left_context), file=f)
+    print('model_right_context=' + str(right_context), file=f)
+    print('num_hidden_layers=' + str(num_hidden_layers), file=f)
+    # print('initial_right_context=' + str(splice_array[0][-1]), file=f)
+    f.close()
+
+    print('This model has num_learnable_params={0:,} and num_learnable_params_xent={1:,}'.format(num_learnable_params, num_learnable_params_xent))

-def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay,
num_lstm_layers): + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers, lstm_start_layer_index): parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) left_context = parsed_splice_output['left_context'] right_context = parsed_splice_output['right_context'] num_hidden_layers = parsed_splice_output['num_hidden_layers'] splice_indexes = parsed_splice_output['splice_indexes'] - if (num_hidden_layers < num_lstm_layers): - raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() + if (num_hidden_layers < lstm_start_layer_index + num_lstm_layers): + raise Exception("num-lstm-layers : (number of lstm layers + lstm start layer index) " + " has to be smaller than number of layers determined from splice-indexes") + return [left_context, right_context, num_hidden_layers, splice_indexes] def Main(): args = GetArgs() - [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) MakeConfigs(config_dir = args.config_dir, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, + splice_indexes_string = args.splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = args.hidden_dim, recurrent_projection_dim = args.recurrent_projection_dim, non_recurrent_projection_dim = args.non_recurrent_projection_dim, num_lstm_layers = args.num_lstm_layers, - num_hidden_layers = num_hidden_layers, + lstm_start_layer_index = args.lstm_start_layer_index, norm_based_clipping = args.norm_based_clipping, clipping_threshold = args.clipping_threshold, ng_per_element_scale_options = args.ng_per_element_scale_options, @@ -325,6 +513,8 @@ def Main(): label_delay = args.label_delay, include_log_softmax = args.include_log_softmax, xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + final_layer_normalize_target = args.final_layer_normalize_target, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index bac260e93bc..0e6a24d5619 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -227,6 +227,7 @@ def AddConvMaxpLayer(config_lines, name, input, args): # The ivectors are processed through an affine layer parallel to the CNN layers, # then concatenated with the CNN output and passed to the deeper part of the network. 
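The num_learnable_params totals that these scripts now accumulate and print follow the (input_dim + 1) * output_dim convention used for the affine components above (the extra 1 is the bias). A rough worked example with assumed dimensions, purely for illustration:

# Assumed dimensions, not taken from any particular recipe.
feat_dim = 40
hidden_dim = 512
num_targets = 3000

# first layer sees 5 spliced input frames ("-2,-1,0,1,2")
layer1 = (5 * feat_dim + 1) * hidden_dim     # 102,912
# an interior TDNN layer splicing 3 frames of the previous layer ("-3,0,3")
layer2 = (3 * hidden_dim + 1) * hidden_dim   # 786,944
# final affine layer feeding the (log-)softmax output
final = (hidden_dim + 1) * num_targets       # 1,539,000

print(layer1 + layer2 + final)               # 2,428,856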
def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + num_learnable_params = 0 cnn_args = ParseCnnString(cnn_layer) num_cnn_layers = len(cnn_args) # We use an Idct layer here to convert MFCC to FBANK features @@ -244,19 +245,27 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, c 'vectorization': 'yzx'} for cl in range(0, num_cnn_layers): - prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + prev_layer = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + prev_layer_output = prev_layer['output'] + num_learnable_params = prev_layer['num_learnable_params'] if cnn_bottleneck_dim > 0: - prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + prev_layer = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + prev_layer_output = prev_layer['output'] + num_learnable_params = prev_layer['num_learnable_params'] if ivector_dim > 0: iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', 'dimension': ivector_dim} - iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + iv_layer = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + iv_layer_output = iv_layer['output'] + num_learnable_params += iv_layer['num_learnable_params'] + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] - return prev_layer_output + return {'output' : prev_layer_output, + 'num_learnable_params' : num_learnable_params} def PrintConfig(file_name, config_lines): f = open(file_name, 'w') @@ -337,8 +346,6 @@ def MakeConfigs(config_dir, splice_indexes_string, parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] num_hidden_layers = parsed_splice_output['num_hidden_layers'] splice_indexes = parsed_splice_output['splice_indexes'] input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim @@ -349,12 +356,20 @@ def MakeConfigs(config_dir, splice_indexes_string, prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + # start the config generation process config_lines = {'components':[], 'component-nodes':[]} - config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) - # Add the init config lines for estimating the preconditioning matrices + num_learnable_params = 0 + num_learnable_params_xent = 0 # number of parameters in xent-branch of chain models + + prev_layer = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = prev_layer['output'] + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + # Adding the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') @@ -362,19 +377,21 @@ def MakeConfigs(config_dir, splice_indexes_string, config_files[config_dir + '/init.config'] = init_config_lines if cnn_layer is not None: - prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + prev_layer = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - - left_context = 0 - right_context = 0 - # we moved the first splice layer to before the LDA.. - # so the input to the first affine layer is going to [0] index - splice_indexes[0] = [0] + prev_layer = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + prev_layer_output = prev_layer['output'] - if not nonlin_output_dim is None: + # generating the output-dims for each layer + if nonlin_type == "pnorm": + # we don't increase the output dimension for each layer + # we might support this in the future + nonlin_output_dims = [nonlin_output_dim] * num_hidden_layers + elif not nonlin_output_dim is None: nonlin_output_dims = [nonlin_output_dim] * num_hidden_layers elif nonlin_output_dim_init < nonlin_output_dim_final and num_hidden_layers == 1: raise Exception("num-hidden-layers has to be greater than 1 if relu-dim-init and relu-dim-final is different.") @@ -385,119 +402,73 @@ def MakeConfigs(config_dir, splice_indexes_string, assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error nonlin_output_dims[-1] = nonlin_output_dim_final # It ensures that the dim of the last hidden layer is exactly the same as what is specified - for i in range(0, num_hidden_layers): - # make the intermediate config file for layerwise discriminative training - - # prepare the spliced input - if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): - try: - zero_index = splice_indexes[i].index(0) - except ValueError: - zero_index = None - # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor - prev_layer_output_descriptor = prev_layer_output['descriptor'] - subset_output = prev_layer_output - if subset_dim > 0: - # if subset_dim is specified the script expects a zero in the splice indexes - assert(zero_index is not None) - subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) - subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), - 'dimension' : subset_dim} - config_lines['component-nodes'].append(subset_node_config) - appended_descriptors = [] - appended_dimension = 0 - for j in range(len(splice_indexes[i])): - if j == zero_index: - appended_descriptors.append(prev_layer_output['descriptor']) - appended_dimension += prev_layer_output['dimension'] - continue - appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], 
splice_indexes[i][j])) - appended_dimension += subset_output['dimension'] - prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), - 'dimension' : appended_dimension} - else: - # this is a normal affine node - pass + # Adding the TDNN layers + for i in range(0, num_hidden_layers): if xent_separate_forward_affine and i == num_hidden_layers - 1: + # xent_separate_forward_affine is only done when adding the final hidden layer + # this is the final layer so assert that splice index is [0] + assert(splice_indexes[i] == [0]) if xent_regularize == 0.0: raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") - if nonlin_type == "relu" : - prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_output_dim, - self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) - - prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_output_dim, - self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) - elif nonlin_type == "pnorm" : - prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, - norm_target_rms = final_layer_normalize_target) - - prev_layer_output_xent = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, - norm_target_rms = final_layer_normalize_target) - else: - raise Exception("Unknown nonlinearity type") - - nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = include_log_softmax) - - nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, - ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( - 0.5 / xent_regularize), - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = True, - name_affix = 'xent') + # we use named arguments as we do not want argument offset errors + num_params_fin, num_params_fin_xent = nodes.AddFinalLayersWithXentSeperateForwardAffineRegularizer(config_lines, + input = prev_layer_output, + num_targets = num_targets, + nonlin_type = nonlin_type, + nonlin_input_dim = nonlin_input_dim, + nonlin_output_dim = nonlin_output_dim, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + self_repair_scale = self_repair_scale, + xent_regularize = xent_regularize, + final_layer_normalize_target = final_layer_normalize_target) + + + else: - if nonlin_type == "relu": - prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_output_dims[i], - self_repair_scale = self_repair_scale, - norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) - elif nonlin_type == "pnorm": - prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_input_dim, nonlin_output_dim, - norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + if splice_indexes[i] == [0]: + # add a normal affine layer + prev_layer = nodes.AddAffineNonlinLayer(config_lines, 'Affine{0}'.format(i), + prev_layer_output, + 
nonlin_type, nonlin_input_dim, nonlin_output_dims[i],
+                                                        self_repair_scale = self_repair_scale,
+                                                        norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
             else:
-                raise Exception("Unknown nonlinearity type")
+                prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(i+1), prev_layer_output,
+                                                splice_indexes[i],
+                                                nonlin_type, nonlin_input_dim, nonlin_output_dims[i],
+                                                subset_dim = subset_dim,
+                                                self_repair_scale = self_repair_scale,
+                                                norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+            prev_layer_output = prev_layer['output']
+            num_learnable_params += prev_layer['num_learnable_params']
+
+
+            # a final layer is added after each new layer as we are generating
             # configs for layer-wise discriminative training
+            num_params_fin, num_params_fin_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines,
+                                                      input = prev_layer_output,
+                                                      num_targets = num_targets,
+                                                      use_presoftmax_prior_scale = use_presoftmax_prior_scale,
+                                                      prior_scale_file = prior_scale_file,
+                                                      include_log_softmax = include_log_softmax,
+                                                      self_repair_scale = self_repair_scale,
+                                                      xent_regularize = xent_regularize,
+                                                      add_final_sigmoid = add_final_sigmoid,
+                                                      objective_type = objective_type)
+
+
-            # add_final_sigmoid adds a sigmoid as a final layer as alternative
-            # to log-softmax layer.
-            # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers
-            # This is useful when you need the final outputs to be probabilities between 0 and 1.
-            # Usually used with an objective-type such as "quadratic".
-            # Applications are k-binary classification such Ideal Ratio Mask prediction.
-            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
-                                use_presoftmax_prior_scale = use_presoftmax_prior_scale,
-                                prior_scale_file = prior_scale_file,
-                                include_log_softmax = include_log_softmax,
-                                add_final_sigmoid = add_final_sigmoid,
-                                objective_type = objective_type)
-            if xent_regularize != 0.0:
-                nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
-                                    ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format(
-                                        0.5 / xent_regularize),
-                                    use_presoftmax_prior_scale = use_presoftmax_prior_scale,
-                                    prior_scale_file = prior_scale_file,
-                                    include_log_softmax = True,
-                                    name_affix = 'xent')
         config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
         config_lines = {'components':[], 'component-nodes':[]}

-    left_context += int(parsed_splice_output['left_context'])
-    right_context += int(parsed_splice_output['right_context'])
+    num_learnable_params += num_params_fin
+    num_learnable_params_xent = num_params_fin_xent
+    left_context = int(parsed_splice_output['left_context'])
+    right_context = int(parsed_splice_output['right_context'])
     # write the files used by other scripts like steps/nnet3/get_egs.sh
     f = open(config_dir + "/vars", "w")
     print('model_left_context=' + str(left_context), file=f)
@@ -514,6 +485,8 @@ def MakeConfigs(config_dir, splice_indexes_string,
     for key in config_files.keys():
         PrintConfig(key, config_files[key])

+    print('This model has num_learnable_params={0:,} and num_learnable_params_xent={1:,}'.format(num_learnable_params, num_learnable_params_xent))
+
 def Main():
     args = GetArgs()
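As a footnote on the learning-rate-factor that both scripts pass to the 'xent' output layer, the arithmetic from the comment above can be checked directly (the value of xent_regularize here is illustrative only):

xent_regularize = 0.1
lr_factor = 0.5 / xent_regularize           # 5.0

# Gradients reaching the xent branch are scaled by xent_regularize, so the
# effective update on its final affine layer is proportional to
# xent_regularize * lr_factor = 0.5, independent of the constant chosen.
print(lr_factor, xent_regularize * lr_factor)   # 5.0 0.5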