diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 9b9ce4a54ad..376ea291a28 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -10,6 +10,8 @@ def GetSumDescriptor(inputs): sum_descriptors = inputs + if len(inputs) == 1: + return inputs while len(sum_descriptors) != 1: cur_sum_descriptors = [] pair = [] @@ -52,8 +54,9 @@ def AddNoOpLayer(config_lines, name, input): components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) - return {'descriptor': '{0}_noop'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0} def AddLdaLayer(config_lines, name, input, lda_file): return AddFixedAffineLayer(config_lines, name, input, lda_file) @@ -65,8 +68,9 @@ def AddFixedAffineLayer(config_lines, name, input, matrix_file): components.append('component name={0}_fixaffine type=FixedAffineComponent matrix={1}'.format(name, matrix_file)) component_nodes.append('component-node name={0}_fixaffine component={0}_fixaffine input={1}'.format(name, input['descriptor'])) - return {'descriptor': '{0}_fixaffine'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_fixaffine'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0} def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): @@ -77,8 +81,9 @@ def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) - return {'descriptor' : '{0}_block_affine'.format(name), - 'dimension' : output_dim} + return {'output' : {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim}, + 'num_learnable_params' : (input['dimension']+1) * output_dim} def AddPermuteLayer(config_lines, name, input, column_map): components = config_lines['components'] @@ -87,8 +92,9 @@ def AddPermuteLayer(config_lines, name, input, column_map): components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) - return {'descriptor': '{0}_permute'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0 } def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] @@ -97,8 +103,9 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) - return {'descriptor': '{0}_affine'.format(name), - 'dimension': output_dim} + return {'output' : 
{'descriptor': '{0}_affine'.format(name), + 'dimension': output_dim}, + 'num_learnable_params' : (input['dimension']+1) * output_dim } def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): components = config_lines['components'] @@ -113,9 +120,9 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name)) - - return {'descriptor': '{0}_renorm'.format(name), - 'dimension': output_dim} + return {'output' : {'descriptor': '{0}_renorm'.format(name), + 'dimension': output_dim}, + 'num_learnable_params' : input['dimension'] * output_dim } def AddAffPnormLayer(config_lines, name, input, pnorm_input_dim, pnorm_output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0): components = config_lines['components'] @@ -129,8 +136,9 @@ def AddAffPnormLayer(config_lines, name, input, pnorm_input_dim, pnorm_output_di component_nodes.append("component-node name={0}_pnorm component={0}_pnorm input={0}_affine".format(name)) component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_pnorm".format(name)) - return {'descriptor': '{0}_renorm'.format(name), - 'dimension': pnorm_output_dim} + return {'output' : {'descriptor': '{0}_renorm'.format(name), + 'dimension': pnorm_output_dim}, + 'num_learnable_params' : input['dimension'] * pnorm_input_dim } def AddConvolutionLayer(config_lines, name, input, input_x_dim, input_y_dim, input_z_dim, @@ -140,7 +148,7 @@ def AddConvolutionLayer(config_lines, name, input, param_stddev = None, bias_stddev = None, filter_bias_file = None, is_updatable = True): - assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim ) components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -164,10 +172,11 @@ def AddConvolutionLayer(config_lines, name, input, num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) output_dim = num_x_steps * num_y_steps * num_filters; - return {'descriptor': '{0}_conv_t'.format(name), - 'dimension': output_dim, - '3d-dim': [num_x_steps, num_y_steps, num_filters], - 'vectorization': 'zyx'} + return {'output' : {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_x_steps, num_y_steps, num_filters], + 'vectorization': 'zyx'}, + 'num_learnable_params' : filt_x_dim * filt_y_dim * input_z_dim } # The Maxpooling component assumes input vectorizations of type zyx def AddMaxpoolingLayer(config_lines, name, input, @@ -202,11 +211,11 @@ def AddMaxpoolingLayer(config_lines, name, input, num_pools_z = 1 + (input_z_dim - pool_z_size) / pool_z_step; output_dim = num_pools_x * num_pools_y * num_pools_z; - return {'descriptor': '{0}_maxp_t'.format(name), - 'dimension': output_dim, - '3d-dim': [num_pools_x, num_pools_y, num_pools_z], - 'vectorization': 'zyx'} - + return {'output' : {'descriptor': '{0}_maxp_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_pools_x, num_pools_y, num_pools_z], + 'vectorization': 'zyx'}, + 'num_learnable_params' : 0 } def 
AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -215,8 +224,9 @@ def AddSoftmaxLayer(config_lines, name, input): components.append("component name={0}_log_softmax type=LogSoftmaxComponent dim={1}".format(name, input['dimension'])) component_nodes.append("component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])) - return {'descriptor': '{0}_log_softmax'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_log_softmax'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0 } def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None): @@ -227,8 +237,9 @@ def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None): self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' components.append("component name={0}_sigmoid type=SigmoidComponent dim={1}".format(name, input['dimension'], self_repair_string)) component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor'])) - return {'descriptor': '{0}_sigmoid'.format(name), - 'dimension': input['dimension']} + return {'output' : {'descriptor': '{0}_sigmoid'.format(name), + 'dimension': input['dimension']}, + 'num_learnable_params' : 0} def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"): components = config_lines['components'] @@ -254,29 +265,38 @@ def AddFinalLayer(config_lines, input, output_dim, components = config_lines['components'] component_nodes = config_lines['component-nodes'] + num_learnable_params = 0 if name_affix is not None: final_node_prefix = 'Final-' + str(name_affix) else: final_node_prefix = 'Final' - prev_layer_output = AddAffineLayer(config_lines, + prev_layer = AddAffineLayer(config_lines, final_node_prefix , input, output_dim, ng_affine_options) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + if include_log_softmax: if use_presoftmax_prior_scale : components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) component_nodes.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix, prev_layer_output['descriptor'])) prev_layer_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix) - prev_layer_output = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] elif add_final_sigmoid: # Useful when you need the final outputs to be probabilities # between 0 and 1. 
# Usually used with an objective-type such as "quadratic" - prev_layer_output = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] # we use the same name_affix as a prefix in for affine/scale nodes but as a # suffix for output node AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) + return num_learnable_params def AddLstmLayer(config_lines, name, input, cell_dim, @@ -289,7 +309,9 @@ def AddLstmLayer(config_lines, lstm_delay = -1, self_repair_scale_nonlinearity = None, self_repair_scale_clipgradient = None): + assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) + num_learnable_params = 0 components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -319,21 +341,29 @@ def AddLstmLayer(config_lines, # Parameter Definitions W*(* replaced by - to have valid names) components.append("# Input gate control : W_i* matrices") components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim components.append("# note : the cell outputs pass through a diagonal matrix") components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + num_learnable_params += cell_dim components.append("# Forget gate control : W_f* matrices") components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim + components.append("# note : the cell outputs pass through a diagonal matrix") components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + num_learnable_params += cell_dim components.append("# Output gate control : W_o* matrices") components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim components.append("# note : the cell outputs pass through a diagonal matrix") components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + num_learnable_params += cell_dim components.append("# Cell input matrices : W_c* matrices") components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + num_learnable_params += (input_dim + recurrent_projection_dim + 1) * cell_dim components.append("# Defining the non-linearities") @@ -386,6 +416,7 @@ def AddLstmLayer(config_lines, if (add_recurrent_projection and add_non_recurrent_projection): components.append("# projection matrices : Wrm and Wpm") components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} 
output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) + num_learnable_params += (cell_dim + 1) * (non_recurrent_projection_dim + recurrent_projection_dim) components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) component_nodes.append("# r_t and p_t") component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) @@ -397,6 +428,7 @@ def AddLstmLayer(config_lines, elif add_recurrent_projection: components.append("# projection matrices : Wrm") components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) + num_learnable_params += (cell_dim + 1) * recurrent_projection_dim components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) component_nodes.append("# r_t") component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) @@ -410,10 +442,9 @@ def AddLstmLayer(config_lines, output_descriptor = '{0}_r_t'.format(name) output_dim = cell_dim - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } + return {'output' : {'descriptor': output_descriptor, + 'dimension':output_dim}, + 'num_learnable_params' : num_learnable_params} def AddBLstmLayer(config_lines, name, input, cell_dim, @@ -427,25 +458,225 @@ def AddBLstmLayer(config_lines, self_repair_scale_nonlinearity = None, self_repair_scale_clipgradient = None): assert(len(lstm_delay) == 2 and lstm_delay[0] < 0 and lstm_delay[1] > 0) - output_forward = AddLstmLayer(config_lines, "{0}_forward".format(name), input, cell_dim, + num_learnable_params = 0 + prev_layer = AddLstmLayer(config_lines, "{0}_forward".format(name), input, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, lstm_delay = lstm_delay[0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) - output_backward = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, + output_forward = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + prev_layer = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, lstm_delay = lstm_delay[1], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + output_backward = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + output_descriptor = 'Append({0}, {1})'.format(output_forward['descriptor'], output_backward['descriptor']) output_dim = output_forward['dimension'] + output_backward['dimension'] - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } - + return {'output' : {'descriptor': output_descriptor, + 'dimension':output_dim}, + 'num_learnable_params' : num_learnable_params} + +def 
AddTdnnLayer(config_lines, name, input, splice_indexes, + nonlin_type, nonlin_input_dim, nonlin_output_dim, + subset_dim = 0, ng_affine_options = " bias-stddev=0 ", + self_repair_scale = 0, norm_target_rms = 1.0): + + # prepare the layer input + try: + zero_index = splice_indexes.index(0) + except ValueError: + zero_index = None + + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = input['descriptor'] + subset_output = input + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name={0}_input input-node={1} dim-offset={2} dim={3}".format(name, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : '{0}_input'.format(name), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes)): + if j == zero_index: + appended_descriptors.append(input['descriptor']) + appended_dimension += input['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + + # add the affine layer + if nonlin_type == "relu": + prev_layer = AddAffRelNormLayer(config_lines, name, + prev_layer_output, + nonlin_output_dim, + ng_affine_options = ng_affine_options, + self_repair_scale = self_repair_scale, + norm_target_rms = norm_target_rms) + prev_layer_output = prev_layer['output'] + elif nonlin_type == "pnorm": + prev_layer = AddAffPnormLayer(config_lines, name, + prev_layer_output, + nonlin_input_dim, nonlin_output_dim, + ng_affine_options = ng_affine_options, + norm_target_rms = norm_target_rms) + else: + raise Exception("Unknown nonlinearity type") + + return prev_layer + + + + +# Convenience functions + +def SpliceInput(input, splice_indexes): + appended_descriptors = [] + appended_dimension = 0 + + try: + zero_index = splice_indexes.index(0) + except ValueError: + zero_index = None + + for j in range(len(splice_indexes)): + if j == zero_index: + appended_descriptors.append(input['descriptor']) + appended_dimension += input['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(input['descriptor'], splice_indexes[j])) + appended_dimension += input['dimension'] + + return {'output' : {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension}, + 'num_learnable_params' : 0} + +# this model does not have add_final_sigmoid and objective_type options +# as this is specific to chain training and we don't have recipes +# with chain trianing + raw training +def AddFinalLayersWithXentSeperateForwardAffineRegularizer(config_lines, + input, num_targets, + nonlin_type, nonlin_input_dim, nonlin_output_dim, + use_presoftmax_prior_scale, + prior_scale_file, + include_log_softmax, + self_repair_scale, + xent_regularize, + final_layer_normalize_target, + ng_affine_options, + label_delay = None): + + num_learnable_params = 0 + num_learnable_params_xent = 0 + if nonlin_type == "relu" : + prev_layer_chain = AddAffRelNormLayer(config_lines, "Pre_final_chain", + input, nonlin_output_dim, + ng_affine_options = ng_affine_options, + self_repair_scale = self_repair_scale, + norm_target_rms = 
final_layer_normalize_target) + prev_layer_xent = AddAffRelNormLayer(config_lines, "Pre_final_xent", + input, nonlin_output_dim, + ng_affine_options = ng_affine_options, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + elif nonlin_type == "pnorm" : + prev_layer_chain = AddAffPnormLayer(config_lines, "Pre_final_chain", + input, nonlin_input_dim, nonlin_output_dim, + ng_affine_options = ng_affine_options, + norm_target_rms = final_layer_normalize_target) + + prev_layer_xent = AddAffPnormLayer(config_lines, "Pre_final_xent", + input, nonlin_input_dim, nonlin_output_dim, + ng_affine_options = ng_affine_options, + norm_target_rms = final_layer_normalize_target) + else: + raise Exception("Unknown nonlinearity type") + + prev_layer_output_chain = prev_layer_chain['output'] + prev_layer_output_xent = prev_layer_xent['output'] + + num_learnable_params += prev_layer_chain['num_learnable_params'] + num_learnable_params_xent += prev_layer_xent['num_learnable_params'] + + # we do not add the ng_affine_options here as Final layer has different defaults + num_learnable_params += AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + label_delay = label_delay) + + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + num_learnable_params_xent += AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent', + label_delay = label_delay) + + return [num_learnable_params, num_learnable_params_xent] + +def AddFinalLayerWithXentRegularizer(config_lines, input, num_targets, + use_presoftmax_prior_scale, + prior_scale_file, + include_log_softmax, + self_repair_scale, + xent_regularize, + add_final_sigmoid, + objective_type, + label_delay = None): + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. + # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. + num_learnable_params = AddFinalLayer(config_lines, input, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type, + label_delay = label_delay) + + if xent_regularize != 0.0: + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + num_learnable_params_xent = AddFinalLayer(config_lines, input, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format(0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent', + label_delay = label_delay) + + return [num_learnable_params, num_learnable_params_xent] diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..3387beb093a 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -14,8 +14,31 @@ def GetArgs(): # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", + parser = argparse.ArgumentParser(description="Writes config files and variables for LSTMs creation and training," + " it also supports adding TDNN layers before, in between and after LSTMs." + " This is done by interpreting --splice-indexes, --num-lstm-layers and --lstm-start-layer-index.\n" + " When the splicing indexes at a layer corresponding to an LSTM is not [0] a TDNN layer is added before it.\n" + " e.g.\n --splice-indexes '-2,-1,0,1,2 0 0 0' --num-lstm-layers 3 --lstm-start-layer-index 0 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " and 3 lstm layers.\n" + " --splice-index '-2,-1,0,1,2 -3,0,3 -3,0,3 0' --num-lstm-layers 3 --lstm-start-layer-index 0 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " and an LSTM layer\n" + " --splice-index '-2,-1,0,1,2 -3,0,3 -3,0,3 0 0' --num-lstm-layers 3 --lstm-start-layer-index 1 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " TDNN layer with splicing -3,0,3 \n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " LSTM layer,\n" + " and an LSTM layer\n" + " --splice-index '-2,-1,0,1,2 -3,0,3 -3,0,3 0 0 -3,0,3 -3,0,3 -3,0,3' --num-lstm-layers 3 --lstm-start-layer-index 1 \n" + " This will add input layer with splicing -2,-1,0,1,2 followed by LDA layer, \n" + " TDNN layer with splicing -3,0,3 \n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " TDNN layer with splicing -3,0,3 + LSTM layer,\n" + " and an LSTM layer\n" + , epilog="See steps/nnet3/lstm/train.sh for example.") # Only one of these arguments can be specified, and one of them has to @@ -48,12 +71,20 @@ def GetArgs(): help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", 
type=float,
+                        help="RMS target for the final layer (set to a value <1 if the final layer learns too fast).",
+                        default=1.0)
     parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction,
                         help="add the final softmax layer ", default=True, choices = ["false", "true"])

     # LSTM options
     parser.add_argument("--num-lstm-layers", type=int,
                         help="Number of LSTM layers to be stacked", default=1)
+    parser.add_argument("--lstm-start-layer-index", type=int,
+                        help="Hidden-layer index at which to start the LSTM layers (1 means right after "
+                        "the LDA layer; earlier layers are TDNN/affine layers). Must be >= 1.", default=1)
     parser.add_argument("--cell-dim", type=int,
                         help="dimension of lstm-cell")
     parser.add_argument("--recurrent-projection-dim", type=int,
@@ -123,18 +154,23 @@ def CheckArgs(args):
         raise Exception("ivector-dim has to be non-negative")

     if (args.num_lstm_layers < 1):
-        sys.exit("--num-lstm-layers has to be a positive integer")
+        raise Exception("--num-lstm-layers has to be a positive integer")
+    if (args.lstm_start_layer_index < 1):
+        raise Exception("--lstm-start-layer-index has to be a positive number.")
+    elif (args.lstm_start_layer_index > 1):
+        warnings.warn("TDNN/Affine layers are going to be stacked before the LSTM layers; "
+                      "shrinkage is not supported in this scenario.")
+
     if (args.clipping_threshold < 0):
-        sys.exit("--clipping-threshold has to be a non-negative")
+        raise Exception("--clipping-threshold has to be non-negative")

     if args.lstm_delay is None:
         args.lstm_delay = [[-1]] * args.num_lstm_layers
     else:
         try:
             args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip())
         except ValueError:
-            sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay))
+            raise Exception("--lstm-delay has an incorrectly formatted value. Provided value is '{0}'".format(args.lstm_delay))
         if len(args.lstm_delay) != args.num_lstm_layers:
-            sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers")
+            raise Exception("--lstm-delay: Number of delays provided has to match --num-lstm-layers")

     return args

@@ -146,38 +182,34 @@ def PrintConfig(file_name, config_lines):
     f.close()

 def ParseSpliceString(splice_indexes, label_delay=None):
-    ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ]
-    split1 = splice_indexes.split(" ");  # we already checked the string is nonempty.
-    if len(split1) < 1:
-        splice_indexes = "0"
-
-    left_context=0
-    right_context=0
+    splice_array = []
+    left_context = 0
+    right_context = 0

     if label_delay is not None:
         left_context = -label_delay
         right_context = label_delay

-    splice_array = []
+    split1 = splice_indexes.split();  # we already checked the string is nonempty.
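For reference, a minimal standalone sketch of the splice parsing implemented by the rewritten ParseSpliceString above (the helper name and example string below are illustrative only, and label_delay handling is omitted):

def parse_splice_string(splice_indexes):
    # e.g. splice_indexes = "-2,-1,0,1,2 -3,0,3 0"
    splice_array = []
    left_context = 0
    right_context = 0
    for token in splice_indexes.split():
        indexes = [int(x) for x in token.split(",")]
        if indexes != sorted(indexes):
            raise Exception("elements of splice-indexes must be sorted: " + splice_indexes)
        left_context += -indexes[0]    # the most negative offset widens the left context
        right_context += indexes[-1]   # the largest positive offset widens the right context
        splice_array.append(indexes)
    return {'left_context': max(0, left_context),
            'right_context': max(0, right_context),
            'splice_indexes': splice_array,
            'num_hidden_layers': len(splice_array)}

# parse_splice_string("-2,-1,0,1,2 -3,0,3 0") returns left_context = 5, right_context = 5
# and three per-layer splice lists, i.e. num_hidden_layers = 3.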
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) try: - for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].strip().split(",")) - print(indexes) - if len(indexes) < 1: - raise ValueError("invalid --splice-indexes argument, too-short element: " - + splice_indexes) - - if (i > 0) and ((len(indexes) != 1) or (indexes[0] != 0)): - raise ValueError("elements of --splice-indexes splicing is only allowed initial layer.") - - if not indexes == sorted(indexes): - raise ValueError("elements of --splice-indexes must be sorted: " - + splice_indexes) - left_context += -indexes[0] - right_context += indexes[-1] - splice_array.append(indexes) + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) except ValueError as e: - raise ValueError("invalid --splice-indexes argument " + splice_indexes + str(e)) - + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) left_context = max(0, left_context) right_context = max(0, right_context) @@ -209,18 +241,30 @@ def ParseLstmDelayString(lstm_delay): def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, - splice_indexes, lstm_delay, cell_dim, hidden_dim, + splice_indexes_string, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, - num_lstm_layers, num_hidden_layers, + num_lstm_layers, lstm_start_layer_index, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, label_delay, include_log_softmax, xent_regularize, + xent_separate_forward_affine, final_layer_normalize_target, self_repair_scale_nonlinearity, self_repair_scale_clipgradient): + num_learnable_params = 0 + num_learnable_params_xent = 0 # number of parameters in the xent branch config_lines = {'components':[], 'component-nodes':[]} - config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(config_dir, + splice_indexes_string, + label_delay, + num_lstm_layers, + lstm_start_layer_index) + + + prev_layer = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = prev_layer['output'] # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -229,95 +273,239 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, nodes.AddOutputLayer(init_config_lines, prev_layer_output) config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - - for i in range(num_lstm_layers): - if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer - prev_layer_output = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = 
self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient)
-        else: # add a uni-directional LSTM layer
-            prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1),
-                                                   prev_layer_output, cell_dim,
-                                                   recurrent_projection_dim, non_recurrent_projection_dim,
-                                                   clipping_threshold, norm_based_clipping,
-                                                   ng_per_element_scale_options, ng_affine_options,
-                                                   lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient)
-        # make the intermediate config file for layerwise discriminative
-        # training
-        nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax)
-
-
-        if xent_regularize != 0.0:
-            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
-                                include_log_softmax = True, label_delay = label_delay,
-                                name_affix = 'xent')
-
-        config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
+    prev_layer = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat')
+    prev_layer_output = prev_layer['output']
+
+    # we don't want to add a simple affine layer after the input layer as in TDNN-only configs,
+    # so we reduce the number of hidden layers and splice_indexes
+    splice_indexes = splice_indexes[1:]
+    num_hidden_layers = num_hidden_layers - 1
+    lstm_start_layer_index -= 1
+
+    num_layers_added = 0
+    # stacking the TDNN/affine layers before the LSTM layers
+    for i in range(lstm_start_layer_index):
+        num_layers_added += 1
+        # we will just support Relu non-linearities
+        if splice_indexes[i] == [0]:
+            # add a normal affine layer
+            prev_layer = nodes.AddAffRelNormLayer(config_lines, 'Affine{0}'.format(num_layers_added),
+                                                  prev_layer_output,
+                                                  hidden_dim,
+                                                  self_repair_scale = self_repair_scale_nonlinearity,
+                                                  norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+            prev_layer_output = prev_layer['output']
+            num_learnable_params += prev_layer['num_learnable_params']
+        else:
+            prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(num_layers_added),
+                                            prev_layer_output,
+                                            splice_indexes = splice_indexes[i],
+                                            nonlin_type = 'relu',
+                                            nonlin_input_dim = hidden_dim,
+                                            nonlin_output_dim = hidden_dim,
+                                            self_repair_scale = self_repair_scale_nonlinearity,
+                                            norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+            prev_layer_output = prev_layer['output']
+            num_learnable_params += prev_layer['num_learnable_params']
+
+        # a final layer is added after each new layer as we are generating
+        # configs for layer-wise discriminative training
+        num_params_final, num_params_final_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines,
+                                                      input = prev_layer_output,
+                                                      num_targets = num_targets,
+                                                      use_presoftmax_prior_scale = False,
+                                                      prior_scale_file = None,
+                                                      include_log_softmax = include_log_softmax,
+                                                      self_repair_scale = self_repair_scale_nonlinearity,
+                                                      xent_regularize = xent_regularize,
+                                                      label_delay = label_delay,
+                                                      add_final_sigmoid = False,
+                                                      objective_type='linear')
+
+        config_files['{0}/layer{1}.config'.format(config_dir, num_layers_added)] = config_lines
         config_lines = {'components':[], 'component-nodes':[]}

-    for i in range(num_lstm_layers, num_hidden_layers):
-        prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1),
-                                                     prev_layer_output, hidden_dim,
-                                                     ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity)
-        # make the
intermediate config file for layerwise discriminative - # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - include_log_softmax = True, label_delay = label_delay, - name_affix = 'xent') - config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + # stacking the LSTM layers + for i in range(lstm_start_layer_index, lstm_start_layer_index + num_lstm_layers): + num_layers_added += 1 + if splice_indexes[i] != [0]: + # there is a non-zero splice-indexes at this layer index so we pre-pend a tdnn layer + warnings.warn("Adding a TDNN layer before LSTM at layer {l}" + " as splice-indexes are {s} and not 0".format(l=i, s=','.join(map(lambda x: str(x), splice_indexes[i])))) + + prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(num_layers_added), + prev_layer_output, + splice_indexes = splice_indexes[i], + nonlin_type = 'relu', + nonlin_input_dim = prev_layer_output['dimension'], + nonlin_output_dim = prev_layer_output['dimension'], + self_repair_scale = self_repair_scale_nonlinearity, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + lstm_index = i - lstm_start_layer_index + if len(lstm_delay[lstm_index]) == 2: # add a bi-directional LSTM layer + prev_layer = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(num_layers_added), + prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[lstm_index], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = self_repair_scale_clipgradient) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + else: # add a uni-directional LSTM layer + prev_layer = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(num_layers_added), + prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[lstm_index][0], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = self_repair_scale_clipgradient) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] + + + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + num_params_final, num_params_final_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines, + input = prev_layer_output, + num_targets = num_targets, + use_presoftmax_prior_scale = False, + prior_scale_file = None, + include_log_softmax = include_log_softmax, + self_repair_scale = self_repair_scale_nonlinearity, + xent_regularize = xent_regularize, + label_delay = label_delay, + objective_type = 'linear', + add_final_sigmoid = False) + + + config_files['{0}/layer{1}.config'.format(config_dir, num_layers_added)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + # stacking TDNN/affine layers after the LSTM layers + for i in range(lstm_start_layer_index + num_lstm_layers, num_hidden_layers): + 
num_layers_added += 1
+        if xent_separate_forward_affine and i == num_hidden_layers - 1:
+            # xent_separate_forward_affine is only honored when adding the final hidden layer;
+            # this is the final layer, so assert that its splice index is [0]
+            assert(splice_indexes[i] == [0])
+            if xent_regularize == 0.0:
+                raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero")
+
+            # we use named arguments as we do not want argument offset errors
+            num_params_final, num_params_final_xent = nodes.AddFinalLayersWithXentSeperateForwardAffineRegularizer(config_lines,
+                                                          input = prev_layer_output,
+                                                          num_targets = num_targets,
+                                                          nonlin_type = 'relu',
+                                                          nonlin_input_dim = hidden_dim,
+                                                          nonlin_output_dim = hidden_dim,
+                                                          use_presoftmax_prior_scale = False,
+                                                          prior_scale_file = None,
+                                                          include_log_softmax = include_log_softmax,
+                                                          self_repair_scale = self_repair_scale_nonlinearity,
+                                                          xent_regularize = xent_regularize,
+                                                          label_delay = label_delay,
+                                                          final_layer_normalize_target = final_layer_normalize_target)
+        else:
+            # we will just support Relu non-linearities
+            if splice_indexes[i] == [0]:
+                # add a normal affine layer
+                prev_layer = nodes.AddAffRelNormLayer(config_lines, 'Affine{0}'.format(num_layers_added),
+                                                      prev_layer_output,
+                                                      hidden_dim,
+                                                      self_repair_scale = self_repair_scale_nonlinearity,
+                                                      norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                prev_layer_output = prev_layer['output']
+                num_learnable_params += prev_layer['num_learnable_params']
+            else:
+                prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(num_layers_added),
+                                                prev_layer_output,
+                                                splice_indexes = splice_indexes[i],
+                                                nonlin_type = 'relu',
+                                                nonlin_input_dim = hidden_dim,
+                                                nonlin_output_dim = hidden_dim,
+                                                self_repair_scale = self_repair_scale_nonlinearity,
+                                                norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                prev_layer_output = prev_layer['output']
+                num_learnable_params += prev_layer['num_learnable_params']
+
+            # a final layer is added after each new layer as we are generating
+            # configs for layer-wise discriminative training
+            num_params_final, num_params_final_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines,
+                                                          input = prev_layer_output,
+                                                          num_targets = num_targets,
+                                                          use_presoftmax_prior_scale = False,
+                                                          prior_scale_file = None,
+                                                          include_log_softmax = include_log_softmax,
+                                                          self_repair_scale = self_repair_scale_nonlinearity,
+                                                          xent_regularize = xent_regularize,
+                                                          label_delay = label_delay,
+                                                          add_final_sigmoid = False,
+                                                          objective_type='linear')
+
+        config_files['{0}/layer{1}.config'.format(config_dir, num_layers_added)] = config_lines
         config_lines = {'components':[], 'component-nodes':[]}

+    num_learnable_params += num_params_final
+    num_learnable_params_xent = num_params_final_xent
+
     # printing out the configs
     # init.config used to train lda-mllt train
     for key in config_files.keys():
         PrintConfig(key, config_files[key])

+    # write the files used by other scripts like steps/nnet3/get_egs.sh
+    f = open(config_dir + "/vars", "w")
+    print('model_left_context=' + str(left_context), file=f)
+    print('model_right_context=' + str(right_context), file=f)
+    print('num_hidden_layers=' + str(num_hidden_layers), file=f)
+    # print('initial_right_context=' + str(splice_array[0][-1]), file=f)
+    f.close()
+
+    print('This model has num_learnable_params={0:,} and num_learnable_params_xent={1:,}'.format(num_learnable_params, num_learnable_params_xent))

-def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay,
num_lstm_layers): + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers, lstm_start_layer_index): parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) left_context = parsed_splice_output['left_context'] right_context = parsed_splice_output['right_context'] num_hidden_layers = parsed_splice_output['num_hidden_layers'] splice_indexes = parsed_splice_output['splice_indexes'] - if (num_hidden_layers < num_lstm_layers): - raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() + if (num_hidden_layers < lstm_start_layer_index + num_lstm_layers): + raise Exception("num-lstm-layers : (number of lstm layers + lstm start layer index) " + " has to be smaller than number of layers determined from splice-indexes") + return [left_context, right_context, num_hidden_layers, splice_indexes] def Main(): args = GetArgs() - [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) MakeConfigs(config_dir = args.config_dir, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, + splice_indexes_string = args.splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = args.hidden_dim, recurrent_projection_dim = args.recurrent_projection_dim, non_recurrent_projection_dim = args.non_recurrent_projection_dim, num_lstm_layers = args.num_lstm_layers, - num_hidden_layers = num_hidden_layers, + lstm_start_layer_index = args.lstm_start_layer_index, norm_based_clipping = args.norm_based_clipping, clipping_threshold = args.clipping_threshold, ng_per_element_scale_options = args.ng_per_element_scale_options, @@ -325,6 +513,8 @@ def Main(): label_delay = args.label_delay, include_log_softmax = args.include_log_softmax, xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + final_layer_normalize_target = args.final_layer_normalize_target, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index bac260e93bc..0e6a24d5619 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -227,6 +227,7 @@ def AddConvMaxpLayer(config_lines, name, input, args): # The ivectors are processed through an affine layer parallel to the CNN layers, # then concatenated with the CNN output and passed to the deeper part of the network. 
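The num_learnable_params totals that these scripts now accumulate and print follow the (input_dim + 1) * output_dim convention used for the affine components above (the extra 1 is the bias). A rough worked example with assumed dimensions, purely for illustration:

# Assumed dimensions, not taken from any particular recipe.
feat_dim = 40
hidden_dim = 512
num_targets = 3000

# first layer sees 5 spliced input frames ("-2,-1,0,1,2")
layer1 = (5 * feat_dim + 1) * hidden_dim     # 102,912
# an interior TDNN layer splicing 3 frames of the previous layer ("-3,0,3")
layer2 = (3 * hidden_dim + 1) * hidden_dim   # 786,944
# final affine layer feeding the (log-)softmax output
final = (hidden_dim + 1) * num_targets       # 1,539,000

print(layer1 + layer2 + final)               # 2,428,856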
def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + num_learnable_params = 0 cnn_args = ParseCnnString(cnn_layer) num_cnn_layers = len(cnn_args) # We use an Idct layer here to convert MFCC to FBANK features @@ -244,19 +245,27 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, c 'vectorization': 'yzx'} for cl in range(0, num_cnn_layers): - prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + prev_layer = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + prev_layer_output = prev_layer['output'] + num_learnable_params = prev_layer['num_learnable_params'] if cnn_bottleneck_dim > 0: - prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + prev_layer = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + prev_layer_output = prev_layer['output'] + num_learnable_params = prev_layer['num_learnable_params'] if ivector_dim > 0: iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', 'dimension': ivector_dim} - iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + iv_layer = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + iv_layer_output = iv_layer['output'] + num_learnable_params += iv_layer['num_learnable_params'] + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] - return prev_layer_output + return {'output' : prev_layer_output, + 'num_learnable_params' : num_learnable_params} def PrintConfig(file_name, config_lines): f = open(file_name, 'w') @@ -337,8 +346,6 @@ def MakeConfigs(config_dir, splice_indexes_string, parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] num_hidden_layers = parsed_splice_output['num_hidden_layers'] splice_indexes = parsed_splice_output['splice_indexes'] input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim @@ -349,12 +356,20 @@ def MakeConfigs(config_dir, splice_indexes_string, prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + # start the config generation process config_lines = {'components':[], 'component-nodes':[]} - config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) - # Add the init config lines for estimating the preconditioning matrices + num_learnable_params = 0 + num_learnable_params_xent = 0 # number of parameters in xent-branch of chain models + + prev_layer = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = prev_layer['output'] + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + # Adding the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') @@ -362,19 +377,21 @@ def MakeConfigs(config_dir, splice_indexes_string, config_files[config_dir + '/init.config'] = init_config_lines if cnn_layer is not None: - prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + prev_layer = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = prev_layer['output'] + num_learnable_params += prev_layer['num_learnable_params'] if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - - left_context = 0 - right_context = 0 - # we moved the first splice layer to before the LDA.. - # so the input to the first affine layer is going to [0] index - splice_indexes[0] = [0] + prev_layer = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + prev_layer_output = prev_layer['output'] - if not nonlin_output_dim is None: + # generating the output-dims for each layer + if nonlin_type == "pnorm": + # we don't increase the output dimension for each layer + # we might support this in the future + nonlin_output_dims = [nonlin_output_dim] * num_hidden_layers + elif not nonlin_output_dim is None: nonlin_output_dims = [nonlin_output_dim] * num_hidden_layers elif nonlin_output_dim_init < nonlin_output_dim_final and num_hidden_layers == 1: raise Exception("num-hidden-layers has to be greater than 1 if relu-dim-init and relu-dim-final is different.") @@ -385,119 +402,73 @@ def MakeConfigs(config_dir, splice_indexes_string, assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error nonlin_output_dims[-1] = nonlin_output_dim_final # It ensures that the dim of the last hidden layer is exactly the same as what is specified - for i in range(0, num_hidden_layers): - # make the intermediate config file for layerwise discriminative training - - # prepare the spliced input - if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): - try: - zero_index = splice_indexes[i].index(0) - except ValueError: - zero_index = None - # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor - prev_layer_output_descriptor = prev_layer_output['descriptor'] - subset_output = prev_layer_output - if subset_dim > 0: - # if subset_dim is specified the script expects a zero in the splice indexes - assert(zero_index is not None) - subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) - subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), - 'dimension' : subset_dim} - config_lines['component-nodes'].append(subset_node_config) - appended_descriptors = [] - appended_dimension = 0 - for j in range(len(splice_indexes[i])): - if j == zero_index: - appended_descriptors.append(prev_layer_output['descriptor']) - appended_dimension += prev_layer_output['dimension'] - continue - appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], 
splice_indexes[i][j])) - appended_dimension += subset_output['dimension'] - prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), - 'dimension' : appended_dimension} - else: - # this is a normal affine node - pass + # Adding the TDNN layers + for i in range(0, num_hidden_layers): if xent_separate_forward_affine and i == num_hidden_layers - 1: + # xent_separate_forward_affine is only done when adding the final hidden layer + # this is the final layer so assert that splice index is [0] + assert(splice_indexes[i] == [0]) if xent_regularize == 0.0: raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") - if nonlin_type == "relu" : - prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_output_dim, - self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) - - prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_output_dim, - self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) - elif nonlin_type == "pnorm" : - prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, - norm_target_rms = final_layer_normalize_target) - - prev_layer_output_xent = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, - norm_target_rms = final_layer_normalize_target) - else: - raise Exception("Unknown nonlinearity type") - - nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = include_log_softmax) - - nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, - ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( - 0.5 / xent_regularize), - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = True, - name_affix = 'xent') + # we use named arguments as we do not want argument offset errors + num_params_fin, num_params_fin_xent = nodes.AddFinalLayersWithXentSeperateForwardAffineRegularizer(config_lines, + input = prev_layer_output, + num_targets = num_targets, + nonlin_type = nonlin_type, + nonlin_input_dim = nonlin_input_dim, + nonlin_output_dim = nonlin_output_dim, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + self_repair_scale = self_repair_scale, + xent_regularize = xent_regularize, + final_layer_normalize_target = final_layer_normalize_target) + + + else: - if nonlin_type == "relu": - prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_output_dims[i], - self_repair_scale = self_repair_scale, - norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) - elif nonlin_type == "pnorm": - prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_input_dim, nonlin_output_dim, - norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + if splice_indexes[i] == [0]: + # add a normal affine layer + prev_layer = nodes.AddAffineNonlinLayer(config_lines, 'Affine{0}'.format(i), + prev_layer_output, + 
nonlin_type, nonlin_input_dim, nonlin_output_dims[i],
+                                                        self_repair_scale = self_repair_scale,
+                                                        norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
             else:
-                raise Exception("Unknown nonlinearity type")
+                prev_layer = nodes.AddTdnnLayer(config_lines, 'Tdnn{0}'.format(i+1), prev_layer_output,
+                                                splice_indexes[i],
+                                                nonlin_type, nonlin_input_dim, nonlin_output_dims[i],
+                                                subset_dim = subset_dim,
+                                                self_repair_scale = self_repair_scale,
+                                                norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+            prev_layer_output = prev_layer['output']
+            num_learnable_params += prev_layer['num_learnable_params']
+
+
+            # a final layer is added after each new layer as we are generating
             # configs for layer-wise discriminative training
+            num_params_fin, num_params_fin_xent = nodes.AddFinalLayerWithXentRegularizer(config_lines,
+                                                      input = prev_layer_output,
+                                                      num_targets = num_targets,
+                                                      use_presoftmax_prior_scale = use_presoftmax_prior_scale,
+                                                      prior_scale_file = prior_scale_file,
+                                                      include_log_softmax = include_log_softmax,
+                                                      self_repair_scale = self_repair_scale,
+                                                      xent_regularize = xent_regularize,
+                                                      add_final_sigmoid = add_final_sigmoid,
+                                                      objective_type = objective_type)
+
+
-            # add_final_sigmoid adds a sigmoid as a final layer as alternative
-            # to log-softmax layer.
-            # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers
-            # This is useful when you need the final outputs to be probabilities between 0 and 1.
-            # Usually used with an objective-type such as "quadratic".
-            # Applications are k-binary classification such Ideal Ratio Mask prediction.
-            nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
-                                use_presoftmax_prior_scale = use_presoftmax_prior_scale,
-                                prior_scale_file = prior_scale_file,
-                                include_log_softmax = include_log_softmax,
-                                add_final_sigmoid = add_final_sigmoid,
-                                objective_type = objective_type)
-            if xent_regularize != 0.0:
-                nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets,
-                                    ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format(
-                                        0.5 / xent_regularize),
-                                    use_presoftmax_prior_scale = use_presoftmax_prior_scale,
-                                    prior_scale_file = prior_scale_file,
-                                    include_log_softmax = True,
-                                    name_affix = 'xent')
         config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines
         config_lines = {'components':[], 'component-nodes':[]}

-    left_context += int(parsed_splice_output['left_context'])
-    right_context += int(parsed_splice_output['right_context'])
+    num_learnable_params += num_params_fin
+    num_learnable_params_xent = num_params_fin_xent
+    left_context = int(parsed_splice_output['left_context'])
+    right_context = int(parsed_splice_output['right_context'])
     # write the files used by other scripts like steps/nnet3/get_egs.sh
     f = open(config_dir + "/vars", "w")
     print('model_left_context=' + str(left_context), file=f)
@@ -514,6 +485,8 @@ def MakeConfigs(config_dir, splice_indexes_string,
     for key in config_files.keys():
         PrintConfig(key, config_files[key])

+    print('This model has num_learnable_params={0:,} and num_learnable_params_xent={1:,}'.format(num_learnable_params, num_learnable_params_xent))
+
 def Main():
     args = GetArgs()
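As a footnote on the learning-rate-factor that both scripts pass to the 'xent' output layer, the arithmetic from the comment above can be checked directly (the value of xent_regularize here is illustrative only):

xent_regularize = 0.1
lr_factor = 0.5 / xent_regularize           # 5.0

# Gradients reaching the xent branch are scaled by xent_regularize, so the
# effective update on its final affine layer is proportional to
# xent_regularize * lr_factor = 0.5, independent of the constant chosen.
print(lr_factor, xent_regularize * lr_factor)   # 5.0 0.5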