From bbdfeafec1cefa245077f06e5080e1cb3774bd20 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 27 Sep 2016 00:51:50 -0400 Subject: [PATCH 01/71] raw_python_script: Adding raw nnet training --- egs/wsj/s5/steps/nnet3/components.py | 15 +- egs/wsj/s5/steps/nnet3/nnet3_train_lib.py | 244 +++++- .../s5/steps/nnet3/report/generate_plots.py | 18 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 661 +++++++++++++++++ egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 700 ++++++++++++++++++ src/nnet3bin/nnet3-copy.cc | 22 + 6 files changed, 1611 insertions(+), 49 deletions(-) create mode 100755 egs/wsj/s5/steps/nnet3/train_raw_dnn.py create mode 100755 egs/wsj/s5/steps/nnet3/train_raw_rnn.py diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 9b9ce4a54ad..0b85012e7d0 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -26,12 +26,16 @@ def GetSumDescriptor(inputs): return sum_descriptors # adds the input nodes and returns the descriptor -def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): +def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0, idct_mat = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] output_dim = 0 components.append('input-node name=input dim=' + str(feat_dim)) - list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + if idct_mat is not None: + prev_layer_output = AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, idct_mat) + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] output_dim += len(splice_indexes) * feat_dim if ivector_dim > 0: components.append('input-node name=ivector dim=' + str(ivector_dim)) @@ -158,6 +162,11 @@ def AddConvolutionLayer(config_lines, name, input, else: conv_init_string += " num-filters={0}".format(num_filters) + if param_stddev is not None: + conv_init_string += " param-stddev={0}".format(param_stddev) + if bias_stddev is not None: + conv_init_string += " bias-stddev={0}".format(bias_stddev) + components.append(conv_init_string) component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) @@ -448,4 +457,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..0128d01adc9 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -24,15 +24,21 @@ def SendMail(message, subject, email_id): logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) pass +def StrToBool(values): + if values == "true": + return True + elif values == "false": + return False + else: + raise ValueError + class StrToBoolAction(argparse.Action): """ A custom action to convert bools from shell format i.e., true/false to python format i.e., True/False """ def __call__(self, parser, namespace, values, option_string=None): - if values == "true": - setattr(namespace, self.dest, True) - elif values == "false": - setattr(namespace, self.dest, False) - else: + try: + setattr(namespace, self.dest, StrToBool(values)) + except ValueError: raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) class 
NullstrToNoneAction(argparse.Action): @@ -105,6 +111,49 @@ def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): return [accepted_models, max_index+1] +def GetAverageNnetModel(dir, iter, nnets_list, run_opts, use_raw_nnet = False, shrink = None): + scale = 1.0 + if shrink is not None: + scale = shrink + + new_iter = iter + 1 + if use_raw_nnet: + if shrink is not None: + out_model = "- \| nnet3-copy --scale={scale} - {dir}/{new_iter}.raw".format(dir = dir, new_iter = new_iter, scale = scale) + else: + out_model = "{dir}/{new_iter}.raw".format(dir = dir, new_iter = new_iter) + else: + out_model = "- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} {dir}/{iter}.mdl {dir}/{new_iter}.mdl".format(dir = dir, iter = iter, new_iter = new_iter, scale = scale) + + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnets_list} \ +{out_model}""".format(command = run_opts.command, + dir = dir, + iter = iter, + nnets_list = nnets_list, + out_model = out_model)) + +def GetBestNnetModel(dir, iter, best_model_index, run_opts, use_raw_nnet = False, shrink = None): + scale = 1.0 + if shrink is not None: + scale = shrink + + best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format(dir = dir, next_iter = iter + 1, best_model_index = best_model_index) + + if use_raw_nnet: + out_model = '{dir}/{next_iter}.raw'.format(dir = dir, next_iter = iter + 1) + else: + out_model = '- \| nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{next_iter}.mdl'.format(dir = dir, iter = iter, new_iter = iter + 1) + + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ +nnet3-copy --scale={scale} {best_model} \ +{out_model}""".format(command = run_opts.command, + dir = dir, iter = iter, + best_model = best_model, + out_model = out_model, scale = scale)) + def GetNumberOfLeaves(alidir): [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) parts = stdout.split() @@ -120,6 +169,7 @@ def GetNumberOfJobs(alidir): except IOError, ValueError: raise Exception('Exception while reading the number of alignment jobs') return num_jobs + def GetIvectorDim(ivector_dir = None): if ivector_dir is None: return 0 @@ -132,6 +182,11 @@ def GetFeatDim(feat_dir): feat_dim = int(stdout_val) return feat_dim +def GetFeatDimFromScp(feat_scp): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{feat_scp} -".format(feat_scp = feat_scp)) + feat_dim = int(stdout_val) + return feat_dim + def ReadKaldiMatrix(matrix_file): try: lines = map(lambda x: x.split(), open(matrix_file).readlines()) @@ -205,6 +260,28 @@ def ParseModelConfigVarsFile(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) +def ParseModelConfigGenericVarsFile(var_file): + variables = {} + try: + var_file_handle = open(var_file, 'r') + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1].strip() + if field_name in ['model_left_context', 'left_context']: + variables['model_left_context'] = int(field_value) + elif field_name in ['model_right_context', 'right_context']: + variables['model_right_context'] = int(field_value) + elif field_name == 'num_hidden_layers': + variables['num_hidden_layers'] = int(field_value) + else: + variables[field_name] = field_value + return variables + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) def 
GenerateEgs(data, alidir, egs_dir, left_context, right_context, @@ -242,6 +319,53 @@ def GenerateEgs(data, alidir, egs_dir, egs_dir = egs_dir, egs_opts = egs_opts if egs_opts is not None else '' )) +def GenerateEgsFromTargets(data, targets_scp, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + target_type = 'dense', num_targets = -1, + samples_per_iter = 20000, frames_per_eg = 20, srand = 0, + egs_opts = None, cmvn_opts = None, transform_dir = None): + if target_type == 'dense': + num_targets = GetFeatDimFromScp(targets_scp) + else: + if num_targets == -1: + raise Exception("--num-targets is required if target-type is dense") + + RunKaldiCommand(""" +steps/nnet3/get_egs_targets.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + --target-type {target_type} \ + --num-targets {num_targets} \ + {data} {targets_scp} {egs_dir} + """.format(command = run_opts.egs_command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, srand = srand, + num_targets = num_targets, + data = data, + targets_scp = targets_scp, target_type = target_type, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) @@ -351,7 +475,7 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, WriteKaldiMatrix(output_file, [scaled_counts]) ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) -def PrepareInitialAcousticModel(dir, alidir, run_opts): +def PrepareInitialAcousticModel(dir, alidir, run_opts, use_raw_nnet = False): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" @@ -361,13 +485,14 @@ def PrepareInitialAcousticModel(dir, alidir, run_opts): nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, dir = dir)) - # Convert to .mdl, train the transitions, set the priors. - RunKaldiCommand(""" -{command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ - nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl - """.format(command = run_opts.command, - dir = dir, alidir = alidir)) + if not use_raw_nnet: + # Convert to .mdl, train the transitions, set the priors. 
+ RunKaldiCommand(""" + {command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) def VerifyIterations(num_iters, num_epochs, num_hidden_layers, num_archives, max_models_combine, add_layers_period, @@ -478,13 +603,16 @@ def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): +def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, use_raw_nnet = False): if iter == 0: return True try: - output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + if use_raw_nnet: + output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) + else: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) output = output.strip().split("\n") # eg. # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] @@ -506,41 +634,53 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False): +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False, use_raw_nnet = False, compute_accuracy = True): - model = '{0}/{1}.mdl'.format(dir, iter) + if use_raw_nnet: + model = "{dir}/{iter}.raw".format(dir = dir, iter = iter) + else: + model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter) + + compute_prob_opts = "" + if compute_accuracy: + compute_prob_opts = "--compute-accuracy" RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + nnet3-compute-prob {compute_prob_opts} "{model}" \ "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + compute_prob_opts = compute_prob_opts, egs_dir = egs_dir), wait = wait) RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + nnet3-compute-prob {compute_prob_opts} "{model}" \ "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + compute_prob_opts = compute_prob_opts, egs_dir = egs_dir), wait = wait) +def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, use_raw_nnet = False): + if use_raw_nnet: + prev_model = '{0}/{1}.raw'.format(dir, iter - 1) + model = '{0}/{1}.raw'.format(dir, iter) + else: + prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl 
- |".format(dir, iter - 1) + model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter) -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False): - - prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) - model = '{0}/{1}.mdl'.format(dir, iter) RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ -nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ -nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ +nnet3-info {model} '&&' \ +nnet3-show-progress --use-gpu=no {prev_model} {model} \ "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" """.format(command = run_opts.command, dir = dir, @@ -551,7 +691,8 @@ def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False): egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, chunk_width = None, + use_raw_nnet = False): # Now do combination. In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -559,10 +700,16 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, raw_model_strings = [] print num_iters_combine for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - model_file = '{0}/{1}.mdl'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + if use_raw_nnet: + model_file = '{0}/{1}.raw'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append(model_file) + else: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) if chunk_width is not None: # this is an RNN model @@ -570,17 +717,22 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + if use_raw_nnet: + out_model = '{dir}/final.raw'.format(dir = dir) + else: + out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ -"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" - """.format(command = run_opts.command, + {out_model} + """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, - num_iters = num_iters, + out_model = out_model, egs_dir = egs_dir)) # Compute the probability of the final, combined model with @@ -589,7 +741,7 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, run_opts, use_raw_nnet = False): # Note: this just uses CPUs, using a smallish subset of data. 
""" Computes the average posterior of the network""" import glob @@ -601,15 +753,20 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + if use_raw_nnet: + model = "{dir}/final.raw".format(dir = dir) + else: + model = "nnet3-am-copy --raw=true {dir}/combined.mdl -|".format(dir = dir) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + {model} ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, - dir = dir, + dir = dir, model = model, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, @@ -643,25 +800,32 @@ def RemoveEgs(egs_dir): def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, preserve_model_interval = 100, - remove_egs = True): + remove_egs = True, + use_raw_nnet = False): try: if remove_egs: RemoveEgs(egs_dir) for iter in range(num_iters): RemoveModel(nnet_dir, iter, num_iters, 1, - preserve_model_interval) + preserve_model_interval, + use_raw_nnet = use_raw_nnet) except (IOError, OSError) as err: logger.warning("Error while cleaning up the nnet directory") raise err def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, - preserve_model_interval = 100): + preserve_model_interval = 100, + use_raw_nnet = False): if iter % preserve_model_interval == 0: return if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : return - file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if use_raw_nnet: + file_name = '{0}/{1}.raw'.format(nnet_dir, iter) + else: + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if os.path.isfile(file_name): os.remove(file_name) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..5f671f1137a 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -48,6 +48,7 @@ def GetArgs(): parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables") parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Iteration from which plotting will start") + parser.add_argument("--is-linear-objf", type=str, default = True, action = train_lib.StrToBoolAction, help="Nnet trained with linear objective as against with quadratic objective") parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") parser.add_argument("output_dir", help="experiment directory, e.g. 
exp/nnet3/tdnn/report") @@ -422,7 +423,7 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, if latex_report is not None: latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) -def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False, is_linear_objf = True): try: os.makedirs(output_dir) except OSError as e: @@ -439,11 +440,15 @@ def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is logger.info("Generating log-probability plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) else: - logger.info("Generating accuracy plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + if is_linear_objf: + logger.info("Generating accuracy plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - logger.info("Generating log-likelihood plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + logger.info("Generating log-likelihood plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating MSE plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'objective', file_basename = 'objective', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) logger.info("Generating non-linearity stats plots") GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) @@ -465,7 +470,8 @@ def Main(): GeneratePlots(args.exp_dir, args.output_dir, comparison_dir = args.comparison_dir, start_iter = args.start_iter, - is_chain = args.is_chain) + is_chain = args.is_chain, + is_linear_objf = args.is_linear_objf) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py new file mode 100755 index 00000000000..84667eeee45 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. 
+ + +# this script is based on steps/nnet3/tdnn/train_raw_nnet.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting DNN trainer (train_raw_dnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward DNN raw acoustic model (without transition model) + using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. 
volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + "iteration. If 0 or a large value the randomization is" + "complete, but this will consume memory and cause spikes" + "in disk I/O. Smaller is easier on disk and memory but" + "less random. It's not a huge deal though, as samples" + "are anyway randomized right at the start." + "(the point of this is to get data in different" + "minibatches on different iterations, since in the" + "preconditioning method, 2 samples in the same minibatch" + "can affect each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per minibatch," + "measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.minibatch-size", type=int, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves 
+ be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + parser.add_argument("--nj", type=int, default=4, + help="Number of parallel jobs") + + parser.add_argument("--configs-dir", type=str, + help="Use a different configs dir than dir/configs") + parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, + default = True, choices = ["true", "false"], + help="Train neural network using dense targets") + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--targets-scp", type=str, required = True, + help="Target for training neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be positive") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be positive") + + + if args.configs_dir is not None: + RunKaldiCommand("cp -rT {0} {1}".format(config_dir, + '{0}/configs'.format(args.dir))) + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs" + " directory which is the output of make_configs.py script") + + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
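    # As a rough worked example of that indexing (hypothetical numbers, not
    # from any recipe): with num_archives=3, frames_per_eg=8 and
    # num_archives_processed=10, job 2 gets
    #   k             = 10 + 2 - 1   = 11
    #   archive_index = (11 % 3) + 1 = 3
    #   frame         = (11 / 3) % 8 = 3
    # i.e. that job reads egs.3.ark and trains on frame offset 3 of each
    # example, exactly as computed in the loop below.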
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + frame = frame, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + minibatch_size = minibatch_size), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + compute_accuracy, + run_opts, use_raw_nnet = True): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, use_raw_nnet = True, compute_accuracy = compute_accuracy) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts, use_raw_nnet = True) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + GetAverageNnetModel(dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + use_raw_nnet = True) + else: + # choose the best model from different jobs + GetBestNnetModel(dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + use_raw_nnet = True) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + SplitData(args.feat_dir, args.nj) + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = ParseModelConfigGenericVarsFile(var_file) + + # Set some variables. + + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + num_targets = int(variables['num_targets']) + add_lda = StrToBool(variables['add_lda']) + include_log_softmax = StrToBool(variables['include_log_softmax']) + objective_type = variables['objective_type'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if args.use_dense_targets: + if GetFeatDimFromScp(targets_scp) != num_targets: + raise Exception("Mismatch between num-targets provided to " + "script vs configs") + + if (args.stage <= -5): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + + if args.use_dense_targets: + target_type = "dense" + compute_accuracy = False + else: + target_type = "sparse" + compute_accuracy = True + + if (args.stage <= -4) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgsFromTargets(args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.frames_per_eg == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (add_lda and args.stage <= -3): + logger.info('Computing the preconditioning matrix for input features') + + 
ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, None, run_opts, use_raw_nnet = True) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + num_archives_expanded = num_archives * args.frames_per_eg + num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + + logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + + TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + compute_accuracy = compute_accuracy, + use_raw_nnet = True, + run_opts = run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval, use_raw_nnet = True) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, use_raw_nnet = True) + + if include_log_softmax and args.stage <= num_iters + 1: + logger.info("Getting average posterior for purpose of using as priors to convert 
posteriors into likelihoods.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, use_raw_nnet = True) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs, + use_raw_nnet = True) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + + os.system("steps/info/nnet3_dir_info.pl " + args.dir) + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py new file mode 100755 index 00000000000..f524366c8ac --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -0,0 +1,700 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_raw_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains an RNN acoustic model using the cross-entropy objective. + RNNs include LSTMs, BLSTMs and GRUs. + RNN acoustic model training differs from feed-forward DNN training + in the following ways + 1. RNN acoustic models train on output chunks rather than individual + outputs + 2. The training includes additional stage of shrinkage, where + the parameters of the model are scaled when the derivative averages + at the non-linearities are below a threshold. + 3. 
RNNs can also be trained with state preservation training + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 20, + help="""Number of output labels in the sequence + used to train an LSTM. + Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. 
It's not a huge deal though, as samples + are anyway randomized right at the start. + (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. 
It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', + default = 256, + help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") + + + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + parser.add_argument("--nj", type=int, default=4, + help="Number of parallel jobs") + + parser.add_argument("--configs-dir", type=str, + help="Use a different configs dir than dir/configs") + parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, + default = True, choices = ["true", "false"], + help="Train neural network using dense targets") + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--targets-scp", type=str, required = True, + help="Target for training neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be positive") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be positive") + + + if args.configs_dir is not None: + RunKaldiCommand("cp -rT {0} {1}".format(config_dir, + '{0}/configs'.format(args.dir))) + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + +def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
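    # As an illustrative sketch of the archive-index arithmetic below (all
    # numbers here are assumed example values, not taken from any recipe):
    #   num_archives_processed = 10, num_archives = 4
    #   job = 1  ->  k = 10  ->  archive_index = (10 % 4) + 1 = 3
    #   job = 2  ->  k = 11  ->  archive_index = (11 % 4) + 1 = 4
    #   job = 3  ->  k = 12  ->  archive_index = (12 % 4) + 1 = 1
    # so the parallel jobs of one iteration read consecutive archives, wrapping
    # around modulo num_archives as training progresses.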
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + cv_minibatch_size, compute_accuracy, + run_opts, use_raw_nnet = True): + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = True, compute_accuracy = compute_accuracy) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = True) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + cache_read_opt, run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + GetAverageNnetModel(dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + use_raw_nnet = True, + shrink = shrinkage_value) + + else: + # choose the best model from different jobs + GetBestNnetModel(dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + use_raw_nnet = True, + shrink = shrinkage_value) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + SplitData(args.feat_dir, args.nj) + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = ParseModelConfigGenericVarsFile(var_file) + + # Set some variables. + + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + num_targets = int(variables['num_targets']) + add_lda = StrToBool(variables['add_lda']) + include_log_softmax = StrToBool(variables['include_log_softmax']) + objective_type = variables['objective_type'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if args.use_dense_targets: + if GetFeatDimFromScp(args.targets_scp) != num_targets: + raise Exception("Mismatch between num-targets provided to " + "script vs configs") + + if (args.stage <= -4): + logger.info("Initializing a basic network") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + + if args.use_dense_targets: + target_type = "dense" + compute_accuracy = False + else: + target_type = "sparse" + compute_accuracy = True + + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgsFromTargets(args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (add_lda and args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + 
ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, None, run_opts, use_raw_nnet = True) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + num_archives_to_process = args.num_epochs * num_archives + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + if args.num_bptt_steps is None: + num_bptt_steps = args.chunk_width + else: + num_bptt_steps = args.num_bptt_steps + + min_deriv_time = args.chunk_width - num_bptt_steps + + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, use_raw_nnet = True) else 1 + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = args.cv_minibatch_size, + compute_accuracy = compute_accuracy, + run_opts = run_opts, + use_raw_nnet = True) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval, use_raw_nnet = True) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + 
num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.raw") + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, + chunk_width = args.chunk_width, use_raw_nnet = True) + + if include_log_softmax and args.stage <= num_iters + 1: + logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, use_raw_nnet = True) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs, + use_raw_nnet = True) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + + os.system("steps/info/nnet3_dir_info.pl " + args.dir) + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/src/nnet3bin/nnet3-copy.cc b/src/nnet3bin/nnet3-copy.cc index 8d171cfa121..2b9f664ef9e 100644 --- a/src/nnet3bin/nnet3-copy.cc +++ b/src/nnet3bin/nnet3-copy.cc @@ -42,12 +42,19 @@ int main(int argc, char *argv[]) { bool binary_write = true; BaseFloat learning_rate = -1; + std::string learning_rates_csl; + BaseFloat scale = 1.0; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); po.Register("learning-rate", &learning_rate, "If supplied, all the learning rates of updatable components" "are set to this value."); + po.Register("learning-rates", &learning_rates_csl, + "If supplied, set the actual learning rates of the " + "updatable components to these set of values"); + po.Register("scale", &scale, "The parameter matrices are scaled" + " by the specified value."); po.Read(argc, argv); @@ -64,6 +71,21 @@ int main(int argc, char *argv[]) { if (learning_rate >= 0) SetLearningRate(learning_rate, &nnet); + + if (scale != 1.0) + ScaleNnet(scale, &nnet); + + if (!learning_rates_csl.empty()) { + std::vector learning_rates; + SplitStringToFloats(learning_rates_csl, ":,", true, &learning_rates); + + Vector temp(learning_rates.size()); + for (size_t i = 0; i < learning_rates.size(); i++) { + temp(i) = learning_rates[i]; + } + SetLearningRates(temp, &nnet); + } + WriteKaldiObject(nnet, raw_nnet_wxfilename, binary_write); KALDI_LOG << "Copied raw neural net from " << raw_nnet_rxfilename From 4c060c379bf90f94c3f03cc59ba693c8fbfc59f2 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 27 Sep 2016 01:27:15 -0400 Subject: [PATCH 02/71] raw_python_script: Raw LSTM config --- .../s5/steps/nnet3/lstm/make_raw_configs.py | 367 ++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100755 
egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py new file mode 100755 index 00000000000..63e5c5dfc0e --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python + +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +import imp + +nodes = imp.load_source('nodes', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for LSTMs creation and training", + epilog="See steps/nnet3/lstm/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # General neural network options + parser.add_argument("--splice-indexes", type=str, + help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3'", required = True, default="0") + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add lda matrix", + choices=['true', 'false'], default = True) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", + choices=['true', 'false'], default = False) + parser.add_argument("--objective-type", type=str, default="linear", + choices = ["linear", "quadratic", "xent"], + help = "the type of objective; i.e. 
quadratic or linear or cross-entropy") + + # LSTM options + parser.add_argument("--num-lstm-layers", type=int, + help="Number of LSTM layers to be stacked", default=1) + parser.add_argument("--cell-dim", type=int, + help="dimension of lstm-cell") + parser.add_argument("--recurrent-projection-dim", type=int, + help="dimension of recurrent projection") + parser.add_argument("--non-recurrent-projection-dim", type=int, + help="dimension of non-recurrent projection") + parser.add_argument("--hidden-dim", type=int, + help="dimension of fully-connected layers") + + # Natural gradient options + parser.add_argument("--ng-per-element-scale-options", type=str, + help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") + parser.add_argument("--ng-affine-options", type=str, + help="options to be supplied to NaturalGradientAffineComponent", default="") + + # Gradient clipper options + parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--self-repair-scale-nonlinearity", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=0.00001) + parser.add_argument("--self-repair-scale-clipgradient", type=float, + help="A non-zero value activates the self-repair mechanism in the ClipGradient component of the LSTM", default=1.0) + + # Delay options + parser.add_argument("--label-delay", type=int, default=None, + help="option to delay the labels to make the lstm robust") + + parser.add_argument("--lstm-delay", type=str, default=None, + help="option to have different delays in recurrence for each lstm") + + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank") + + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
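    # A hedged usage sketch (the paths and dimensions below are assumed
    # example values, not taken from any recipe); either member of each
    # mutually-exclusive group above may be supplied:
    #   steps/nnet3/lstm/make_raw_configs.py --feat-dim=40 --num-targets=3000 \
    #     --splice-indexes="-2,-1,0,1,2 0" --num-lstm-layers=2 \
    #     --cell-dim=512 --recurrent-projection-dim=128 \
    #     --non-recurrent-projection-dim=128 exp/lstm_raw/configs
    # If --feat-dir / --ali-dir / --ivector-dir are given instead, the
    # corresponding dimensions are derived from those directories below.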
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.num_lstm_layers < 1): + sys.exit("--num-lstm-layers has to be a positive integer") + if (args.clipping_threshold < 0): + sys.exit("--clipping-threshold has to be a non-negative") + if args.lstm_delay is None: + args.lstm_delay = [[-1]] * args.num_lstm_layers + else: + try: + args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) + except ValueError: + sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) + if len(args.lstm_delay) != args.num_lstm_layers: + sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") + + return args + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])+"\n") + f.close() + +def ParseSpliceString(splice_indexes, label_delay=None): + ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] + split1 = splice_indexes.split(" "); # we already checked the string is nonempty. + if len(split1) < 1: + splice_indexes = "0" + + left_context=0 + right_context=0 + if label_delay is not None: + left_context = -label_delay + right_context = label_delay + + splice_array = [] + try: + for i in range(len(split1)): + indexes = map(lambda x: int(x), split1[i].strip().split(",")) + print(indexes) + if len(indexes) < 1: + raise ValueError("invalid --splice-indexes argument, too-short element: " + + splice_indexes) + + if (i > 0) and ((len(indexes) != 1) or (indexes[0] != 0)): + raise ValueError("elements of --splice-indexes splicing is only allowed initial layer.") + + if not indexes == sorted(indexes): + raise ValueError("elements of --splice-indexes must be sorted: " + + splice_indexes) + left_context += -indexes[0] + right_context += indexes[-1] + splice_array.append(indexes) + except ValueError as e: + raise ValueError("invalid --splice-indexes argument " + splice_indexes + str(e)) + + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +def ParseLstmDelayString(lstm_delay): + ## Work out lstm_delay e.g. 
"-1 [-1,1] -2" -> list([ [-1], [-1, 1], [-2] ]) + split1 = lstm_delay.split(" "); + lstm_delay_array = [] + try: + for i in range(len(split1)): + indexes = map(lambda x: int(x), split1[i].strip().lstrip('[').rstrip(']').strip().split(",")) + if len(indexes) < 1: + raise ValueError("invalid --lstm-delay argument, too-short element: " + + lstm_delay) + elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: + raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. There should be a negative delay for the forward, and a postive delay for the backward.') + if len(indexes) == 2 and indexes[0] > 0: # always a negative delay followed by a postive delay + indexes[0], indexes[1] = indexes[1], indexes[0] + lstm_delay_array.append(indexes) + except ValueError as e: + raise ValueError("invalid --lstm-delay argument " + lstm_delay + str(e)) + + return lstm_delay_array + + +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, + add_idct, cepstral_lifter, + splice_indexes, lstm_delay, cell_dim, hidden_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + num_lstm_layers, num_hidden_layers, + norm_based_clipping, clipping_threshold, + ng_per_element_scale_options, ng_affine_options, + label_delay, include_log_softmax, add_final_sigmoid, + objective_type, xent_regularize, + self_repair_scale_nonlinearity, self_repair_scale_clipgradient): + + config_lines = {'components':[], 'component-nodes':[]} + + if add_idct: + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if add_idct else None) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) + config_files[config_dir + '/init.config'] = init_config_lines + + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + + for i in range(num_lstm_layers): + if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer + prev_layer_output = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(i+1), + prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + else: # add a uni-directional LSTM layer + prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), + prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + # make the intermediate config file for layerwise discriminative + # training + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = 
label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) + + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, label_delay = label_delay, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + for i in range(num_lstm_layers, num_hidden_layers): + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), + prev_layer_output, hidden_dim, + ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) + # make the intermediate config file for layerwise discriminative + # training + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, label_delay = label_delay, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + + + + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers): + parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + if (num_hidden_layers < num_lstm_layers): + raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + + return [left_context, right_context, num_hidden_layers, splice_indexes] + + +def Main(): + args = GetArgs() + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + # print('initial_right_context=' + str(splice_array[0][-1]), file=f) + print('num_targets=' + str(args.num_targets), file=f) + print('objective_type=' + str(args.objective_type), file=f) + print('add_lda=' + ("true" if args.add_lda else "false"), file=f) + print('include_log_softmax=' + ("true" if args.include_log_softmax else "false"), file=f) + f.close() + + MakeConfigs(config_dir = args.config_dir, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + add_lda = args.add_lda, + add_idct = args.add_idct, cepstral_lifter = args.cepstral_lifter, + splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, + cell_dim = args.cell_dim, + hidden_dim = args.hidden_dim, + recurrent_projection_dim = args.recurrent_projection_dim, + non_recurrent_projection_dim = args.non_recurrent_projection_dim, + num_lstm_layers = args.num_lstm_layers, + num_hidden_layers = num_hidden_layers, 
+ norm_based_clipping = args.norm_based_clipping, + clipping_threshold = args.clipping_threshold, + ng_per_element_scale_options = args.ng_per_element_scale_options, + ng_affine_options = args.ng_affine_options, + label_delay = args.label_delay, + include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + objective_type = args.objective_type, + xent_regularize = args.xent_regularize, + self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) + +if __name__ == "__main__": + Main() From 185e031968da327cc8a3c687d8193aa4e79418b2 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 28 Sep 2016 22:30:12 -0400 Subject: [PATCH 03/71] raw-signal-v2: Adding steps/nnet3/tdnn/make_raw_configs.py --- .../s5/steps/nnet3/tdnn/make_raw_configs.py | 492 ++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100755 egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py new file mode 100755 index 00000000000..179143bc916 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # CNN options + parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", + help="CNN parameters at each CNN layer, e.g. 
--filt-x-dim=3 --filt-y-dim=8 " + "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 " + "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, " + "when CNN layers are used, no LDA will be added", default = None) + parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim", + help="Output dimension of the linear layer at the CNN output " + "for dimension reduction, e.g. 256." + "The default zero means this layer is not needed.", default=0) + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' " + "If CNN layers are used the first set of splice indexes will be used as input " + "to the first CNN layer and later splice indexes will be interpreted as indexes " + "for the TDNNs.") + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "If --cnn.layer is specified this option will be forced to \"false\".", + default=True, choices = ["false", "true"]) + + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a final sigmoid layer as alternate to log-softmax-layer. " + "Can only be used if include-log-softmax is false. " + "This is useful in cases where you want the output to be " + "like probabilities between 0 and 1. Typically the nnet " + "is trained with an objective such as quadratic", + default=False, choices = ["false", "true"]) + + parser.add_argument("--objective-type", type=str, + help = "the type of objective; i.e. 
quadratic or linear", + default="linear", choices = ["linear", "quadratic", "xent"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + + hidden_dim_group = parser.add_mutually_exclusive_group(required = True) + hidden_dim_group.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + hidden_dim_group.add_argument("--relu-dims", type=str, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale-nonlinearity", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = True) + + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", default = False) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
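    # A hedged usage sketch for this TDNN variant (paths and dimensions are
    # assumed example values, not taken from any recipe):
    #   steps/nnet3/tdnn/make_raw_configs.py --feat-dim=40 --num-targets=3000 \
    #     --splice-indexes="-2,-1,0,1,2 -1,2 -3,3 0" --relu-dim=850 \
    #     --objective-type=linear exp/tdnn_raw/configs
    # As with the LSTM script, --feat-dir / --ali-dir / --ivector-dir may be
    # given instead, in which case the dimensions are derived from those
    # directories in the checks below.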
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + + args.nonlin_output_dims = args.relu_dims + args.nonlin_output_dim = args.relu_dim + + if args.add_final_sigmoid and args.include_log_softmax: + raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.") + + if args.xent_separate_forward_affine and args.add_final_sigmoid: + raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true") + + if args.add_lda and args.cnn_layer is not None: + args.add_lda = False + warnings.warn("--add-lda is set to false as CNN layers are used.") + + return args + +def AddConvMaxpLayer(config_lines, name, input, args): + if '3d-dim' not in input: + raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.") + + input = nodes.AddConvolutionLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.filt_x_dim, args.filt_y_dim, + args.filt_x_step, args.filt_y_step, + args.num_filters, input['vectorization']) + + if args.pool_x_size > 1 or args.pool_y_size > 1 or args.pool_z_size > 1: + input = nodes.AddMaxpoolingLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.pool_x_size, args.pool_y_size, args.pool_z_size, + args.pool_x_step, args.pool_y_step, args.pool_z_step) + + return input + +# The ivectors are processed through an affine layer parallel to the CNN layers, +# then concatenated with the CNN output and passed to the deeper part of the network. 
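# As an illustrative sketch of the resulting dimensions (assumed example
# values): with feat_dim = 40 and splice_indexes = [-2, -1, 0, 1, 2], the
# spliced CNN input built below has dimension 5 * 40 = 200 and '3d-dim'
# [5, 40, 1]; if ivector_dim = 100, the iVector is passed through its own
# affine layer and appended to the CNN (or bottleneck) output, so the
# returned descriptor's dimension is the CNN output dimension plus 100.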
+def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + cnn_args = ParseCnnString(cnn_layer) + num_cnn_layers = len(cnn_args) + # We use an Idct layer here to convert MFCC to FBANK features + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') + + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + splice_descriptor = "Append({0})".format(", ".join(list)) + cnn_input_dim = len(splice_indexes) * feat_dim + prev_layer_output = {'descriptor': splice_descriptor, + 'dimension': cnn_input_dim, + '3d-dim': [len(splice_indexes), feat_dim, 1], + 'vectorization': 'yzx'} + + for cl in range(0, num_cnn_layers): + prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + + if cnn_bottleneck_dim > 0: + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + + if ivector_dim > 0: + iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', + 'dimension': ivector_dim} + iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) + prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] + + return prev_layer_output + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseCnnString(cnn_param_string_list): + cnn_parser = argparse.ArgumentParser(description="cnn argument parser") + + cnn_parser.add_argument("--filt-x-dim", required=True, type=int) + cnn_parser.add_argument("--filt-y-dim", required=True, type=int) + cnn_parser.add_argument("--filt-x-step", type=int, default = 1) + cnn_parser.add_argument("--filt-y-step", type=int, default = 1) + cnn_parser.add_argument("--num-filters", required=True, type=int) + cnn_parser.add_argument("--pool-x-size", type=int, default = 1) + cnn_parser.add_argument("--pool-y-size", type=int, default = 1) + cnn_parser.add_argument("--pool-z-size", type=int, default = 1) + cnn_parser.add_argument("--pool-x-step", type=int, default = 1) + cnn_parser.add_argument("--pool-y-step", type=int, default = 1) + cnn_parser.add_argument("--pool-z-step", type=int, default = 1) + + cnn_args = [] + for cl in range(0, len(cnn_param_string_list)): + cnn_args.append(cnn_parser.parse_args(shlex.split(cnn_param_string_list[cl]))) + + return cnn_args + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(); # we already checked the string is nonempty. 
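    # A worked example with assumed values: splice_indexes = "-2,-1,0,1,2 -1,2 -3,3"
    # gives splice_array = [[-2,-1,0,1,2], [-1,2], [-3,3]],
    # left_context = 2 + 1 + 3 = 6, right_context = 2 + 2 + 3 = 7,
    # and num_hidden_layers = 3.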
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. +def MakeConfigs(config_dir, splice_indexes_string, + cnn_layer, cnn_bottleneck_dim, cepstral_lifter, + feat_dim, ivector_dim, num_targets, add_lda, + nonlin_output_dim, nonlin_output_dims, subset_dim, + use_presoftmax_prior_scale, + final_layer_normalize_target, + include_log_softmax, + add_final_sigmoid, + xent_regularize, + xent_separate_forward_affine, + self_repair_scale, + objective_type): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if nonlin_output_dims is None: + nonlin_output_dims = [ nonlin_output_dim for x in range(0, num_hidden_layers)] + else: + nonlin_output_dims = [ int(x) for x in nonlin_output_dims.split() ] + + assert len(nonlin_output_dims) == num_hidden_layers + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. 
Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + if add_idct and cnn_layer is None: + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if add_idct and cnn_layer is None else None) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + if cnn_layer is not None: + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + feat_dim, splice_indexes[0], ivector_dim) + + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. + # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dims[i], + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = 
use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dims[i], + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dims[i], + self_repair_scale = self_repair_scale, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. + # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(num_targets), file=f) + print('add_lda=' + ('true' if add_lda else 'false'), file=f) + print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) + print('objective_type=' + objective_type, file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + add_lda = args.add_lda, add_idct = args.add_idct, + cnn_layer = args.cnn_layer, + cnn_bottleneck_dim = args.cnn_bottleneck_dim, + cepstral_lifter = args.cepstral_lifter, + nonlin_output_dim = args.nonlin_output_dim, + 
nonlin_output_dims = args.nonlin_output_dims, + subset_dim = args.subset_dim, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale_nonlinearity, + objective_type = args.objective_type) + +if __name__ == "__main__": + Main() + From 851eb24bea6147f423e519d1b386b75c0a0797a5 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 29 Sep 2016 00:55:07 -0400 Subject: [PATCH 04/71] raw_python_script: Made raw and AM nnets training and configs similar --- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 65 +++- .../s5/steps/nnet3/lstm/make_raw_configs.py | 367 ------------------ egs/wsj/s5/steps/nnet3/nnet3_train_lib.py | 22 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 114 +++--- 5 files changed, 111 insertions(+), 459 deletions(-) delete mode 100755 egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..b0c9f2b591c 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -50,6 +50,15 @@ def GetArgs(): default=0.0) parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add lda matrix", + choices=['true', 'false'], default = True) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", + choices=['true', 'false'], default = False) + parser.add_argument("--objective-type", type=str, default="linear", + choices = ["linear", "quadratic", "xent"], + help = "the type of objective; i.e. quadratic or linear or cross-entropy per dim") # LSTM options parser.add_argument("--num-lstm-layers", type=int, @@ -86,6 +95,14 @@ def GetArgs(): parser.add_argument("--lstm-delay", type=str, default=None, help="option to have different delays in recurrence for each lstm") + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 
22.0", default=0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", + default = False, choices = ["true", "false"]) + parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -115,6 +132,9 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + if not args.num_targets > 0: print(args.num_targets) raise Exception("num_targets has to be positive") @@ -208,28 +228,36 @@ def ParseLstmDelayString(lstm_delay): return lstm_delay_array -def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, + add_idct, cepstral_lifter, splice_indexes, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, - label_delay, include_log_softmax, xent_regularize, + label_delay, include_log_softmax, add_final_sigmoid, + objective_type, xent_regularize, self_repair_scale_nonlinearity, self_repair_scale_clipgradient): config_lines = {'components':[], 'component-nodes':[]} + if add_idct: + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if add_idct else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputLayer(init_config_lines, prev_layer_output) + nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer @@ -248,7 +276,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: @@ -265,7 +293,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) # make the intermediate config 
file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, @@ -293,14 +321,6 @@ def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layer if (num_hidden_layers < num_lstm_layers): raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() - return [left_context, right_context, num_hidden_layers, splice_indexes] @@ -308,9 +328,22 @@ def Main(): args = GetArgs() [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(args.num_targets), file=f) + print('objective_type=' + str(args.objective_type), file=f) + print('add_lda=' + ("true" if args.add_lda else "false"), file=f) + print('include_log_softmax=' + ("true" if args.include_log_softmax else "false"), file=f) + f.close() + MakeConfigs(config_dir = args.config_dir, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, + add_lda = args.add_lda, + add_idct = args.add_idct, cepstral_lifter = args.cepstral_lifter, splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = args.hidden_dim, @@ -324,6 +357,8 @@ def Main(): ng_affine_options = args.ng_affine_options, label_delay = args.label_delay, include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + objective_type = args.objective_type, xent_regularize = args.xent_regularize, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py deleted file mode 100755 index 63e5c5dfc0e..00000000000 --- a/egs/wsj/s5/steps/nnet3/lstm/make_raw_configs.py +++ /dev/null @@ -1,367 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function -import os -import argparse -import sys -import warnings -import copy -import imp - -nodes = imp.load_source('nodes', 'steps/nnet3/components.py') -nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') -chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') - -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = 
argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", - epilog="See steps/nnet3/lstm/train.sh for example.") - - # Only one of these arguments can be specified, and one of them has to - # be compulsarily specified - feat_group = parser.add_mutually_exclusive_group(required = True) - feat_group.add_argument("--feat-dim", type=int, - help="Raw feature dimension, e.g. 13") - feat_group.add_argument("--feat-dir", type=str, - help="Feature directory, from which we derive the feat-dim") - - # only one of these arguments can be specified - ivector_group = parser.add_mutually_exclusive_group(required = False) - ivector_group.add_argument("--ivector-dim", type=int, - help="iVector dimension, e.g. 100", default=0) - ivector_group.add_argument("--ivector-dir", type=str, - help="iVector dir, which will be used to derive the ivector-dim ", default=None) - - num_target_group = parser.add_mutually_exclusive_group(required = True) - num_target_group.add_argument("--num-targets", type=int, - help="number of network targets (e.g. num-pdf-ids/num-leaves)") - num_target_group.add_argument("--ali-dir", type=str, - help="alignment directory, from which we derive the num-targets") - num_target_group.add_argument("--tree-dir", type=str, - help="directory with final.mdl, from which we derive the num-targets") - - # General neural network options - parser.add_argument("--splice-indexes", type=str, - help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3'", required = True, default="0") - parser.add_argument("--xent-regularize", type=float, - help="For chain models, if nonzero, add a separate output for cross-entropy " - "regularization (with learning-rate-factor equal to the inverse of this)", - default=0.0) - parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add the final softmax layer ", default=True, choices = ["false", "true"]) - parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add lda matrix", - choices=['true', 'false'], default = True) - parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", - choices=['true', 'false'], default = False) - parser.add_argument("--objective-type", type=str, default="linear", - choices = ["linear", "quadratic", "xent"], - help = "the type of objective; i.e. 
quadratic or linear or cross-entropy") - - # LSTM options - parser.add_argument("--num-lstm-layers", type=int, - help="Number of LSTM layers to be stacked", default=1) - parser.add_argument("--cell-dim", type=int, - help="dimension of lstm-cell") - parser.add_argument("--recurrent-projection-dim", type=int, - help="dimension of recurrent projection") - parser.add_argument("--non-recurrent-projection-dim", type=int, - help="dimension of non-recurrent projection") - parser.add_argument("--hidden-dim", type=int, - help="dimension of fully-connected layers") - - # Natural gradient options - parser.add_argument("--ng-per-element-scale-options", type=str, - help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") - parser.add_argument("--ng-affine-options", type=str, - help="options to be supplied to NaturalGradientAffineComponent", default="") - - # Gradient clipper options - parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, - help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) - parser.add_argument("--clipping-threshold", type=float, - help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) - parser.add_argument("--self-repair-scale-nonlinearity", type=float, - help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=0.00001) - parser.add_argument("--self-repair-scale-clipgradient", type=float, - help="A non-zero value activates the self-repair mechanism in the ClipGradient component of the LSTM", default=1.0) - - # Delay options - parser.add_argument("--label-delay", type=int, default=None, - help="option to delay the labels to make the lstm robust") - - parser.add_argument("--lstm-delay", type=str, default=None, - help="option to have different delays in recurrence for each lstm") - - parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 22.0", default=0) - parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, - help="Add an IDCT after input to convert MFCC to Fbank") - - parser.add_argument("config_dir", - help="Directory to write config files and variables") - - print(' '.join(sys.argv)) - - args = parser.parse_args() - args = CheckArgs(args) - - return args - -def CheckArgs(args): - if not os.path.exists(args.config_dir): - os.makedirs(args.config_dir) - - ## Check arguments. 
- if args.feat_dir is not None: - args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) - - if args.ali_dir is not None: - args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) - elif args.tree_dir is not None: - args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) - - if args.ivector_dir is not None: - args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) - - if not args.feat_dim > 0: - raise Exception("feat-dim has to be postive") - - if args.add_lda and args.add_idct: - raise Exception("add-idct can be true only if add-lda is false") - - if not args.num_targets > 0: - print(args.num_targets) - raise Exception("num_targets has to be positive") - - if not args.ivector_dim >= 0: - raise Exception("ivector-dim has to be non-negative") - - if (args.num_lstm_layers < 1): - sys.exit("--num-lstm-layers has to be a positive integer") - if (args.clipping_threshold < 0): - sys.exit("--clipping-threshold has to be a non-negative") - if args.lstm_delay is None: - args.lstm_delay = [[-1]] * args.num_lstm_layers - else: - try: - args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) - except ValueError: - sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) - if len(args.lstm_delay) != args.num_lstm_layers: - sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") - - return args - -def PrintConfig(file_name, config_lines): - f = open(file_name, 'w') - f.write("\n".join(config_lines['components'])+"\n") - f.write("\n#Component nodes\n") - f.write("\n".join(config_lines['component-nodes'])+"\n") - f.close() - -def ParseSpliceString(splice_indexes, label_delay=None): - ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] - split1 = splice_indexes.split(" "); # we already checked the string is nonempty. - if len(split1) < 1: - splice_indexes = "0" - - left_context=0 - right_context=0 - if label_delay is not None: - left_context = -label_delay - right_context = label_delay - - splice_array = [] - try: - for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].strip().split(",")) - print(indexes) - if len(indexes) < 1: - raise ValueError("invalid --splice-indexes argument, too-short element: " - + splice_indexes) - - if (i > 0) and ((len(indexes) != 1) or (indexes[0] != 0)): - raise ValueError("elements of --splice-indexes splicing is only allowed initial layer.") - - if not indexes == sorted(indexes): - raise ValueError("elements of --splice-indexes must be sorted: " - + splice_indexes) - left_context += -indexes[0] - right_context += indexes[-1] - splice_array.append(indexes) - except ValueError as e: - raise ValueError("invalid --splice-indexes argument " + splice_indexes + str(e)) - - left_context = max(0, left_context) - right_context = max(0, right_context) - - return {'left_context':left_context, - 'right_context':right_context, - 'splice_indexes':splice_array, - 'num_hidden_layers':len(splice_array) - } - -def ParseLstmDelayString(lstm_delay): - ## Work out lstm_delay e.g. 
"-1 [-1,1] -2" -> list([ [-1], [-1, 1], [-2] ]) - split1 = lstm_delay.split(" "); - lstm_delay_array = [] - try: - for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].strip().lstrip('[').rstrip(']').strip().split(",")) - if len(indexes) < 1: - raise ValueError("invalid --lstm-delay argument, too-short element: " - + lstm_delay) - elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: - raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. There should be a negative delay for the forward, and a postive delay for the backward.') - if len(indexes) == 2 and indexes[0] > 0: # always a negative delay followed by a postive delay - indexes[0], indexes[1] = indexes[1], indexes[0] - lstm_delay_array.append(indexes) - except ValueError as e: - raise ValueError("invalid --lstm-delay argument " + lstm_delay + str(e)) - - return lstm_delay_array - - -def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, - add_idct, cepstral_lifter, - splice_indexes, lstm_delay, cell_dim, hidden_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - num_lstm_layers, num_hidden_layers, - norm_based_clipping, clipping_threshold, - ng_per_element_scale_options, ng_affine_options, - label_delay, include_log_softmax, add_final_sigmoid, - objective_type, xent_regularize, - self_repair_scale_nonlinearity, self_repair_scale_clipgradient): - - config_lines = {'components':[], 'component-nodes':[]} - - if add_idct: - nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") - - config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], - ivector_dim, - idct_mat = config_dir.strip() + "/idct.mat" if add_idct else None) - - # Add the init config lines for estimating the preconditioning matrices - init_config_lines = copy.deepcopy(config_lines) - init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') - init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) - config_files[config_dir + '/init.config'] = init_config_lines - - if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') - - for i in range(num_lstm_layers): - if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer - prev_layer_output = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) - else: # add a uni-directional LSTM layer - prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) - # make the intermediate config file for layerwise discriminative - # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = 
label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) - - - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - include_log_softmax = True, label_delay = label_delay, - name_affix = 'xent') - - config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines - config_lines = {'components':[], 'component-nodes':[]} - - for i in range(num_lstm_layers, num_hidden_layers): - prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), - prev_layer_output, hidden_dim, - ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) - # make the intermediate config file for layerwise discriminative - # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) - - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - include_log_softmax = True, label_delay = label_delay, - name_affix = 'xent') - - config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines - config_lines = {'components':[], 'component-nodes':[]} - - # printing out the configs - # init.config used to train lda-mllt train - for key in config_files.keys(): - PrintConfig(key, config_files[key]) - - - - -def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers): - parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] - num_hidden_layers = parsed_splice_output['num_hidden_layers'] - splice_indexes = parsed_splice_output['splice_indexes'] - - if (num_hidden_layers < num_lstm_layers): - raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - - return [left_context, right_context, num_hidden_layers, splice_indexes] - - -def Main(): - args = GetArgs() - [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) - - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(args.config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - print('num_targets=' + str(args.num_targets), file=f) - print('objective_type=' + str(args.objective_type), file=f) - print('add_lda=' + ("true" if args.add_lda else "false"), file=f) - print('include_log_softmax=' + ("true" if args.include_log_softmax else "false"), file=f) - f.close() - - MakeConfigs(config_dir = args.config_dir, - feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, - num_targets = args.num_targets, - add_lda = args.add_lda, - add_idct = args.add_idct, cepstral_lifter = args.cepstral_lifter, - splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, - cell_dim = args.cell_dim, - hidden_dim = args.hidden_dim, - recurrent_projection_dim = args.recurrent_projection_dim, - non_recurrent_projection_dim = args.non_recurrent_projection_dim, - num_lstm_layers = args.num_lstm_layers, - num_hidden_layers = num_hidden_layers, 
- norm_based_clipping = args.norm_based_clipping, - clipping_threshold = args.clipping_threshold, - ng_per_element_scale_options = args.ng_per_element_scale_options, - ng_affine_options = args.ng_affine_options, - label_delay = args.label_delay, - include_log_softmax = args.include_log_softmax, - add_final_sigmoid = args.add_final_sigmoid, - objective_type = args.objective_type, - xent_regularize = args.xent_regularize, - self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) - -if __name__ == "__main__": - Main() diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index 0128d01adc9..903ffde10c6 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -475,25 +475,27 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, WriteKaldiMatrix(output_file, [scaled_counts]) ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) -def PrepareInitialAcousticModel(dir, alidir, run_opts, use_raw_nnet = False): +def PrepareInitialAcousticModel(dir, alidir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" + PrepareInitialNetwork(dir, run_opts): + + # Convert to .mdl, train the transitions, set the priors. + RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) + +def PrepareInitialNetwork(dir, run_opts): RunKaldiCommand(""" {command} {dir}/log/add_first_layer.log \ nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, dir = dir)) - if not use_raw_nnet: - # Convert to .mdl, train the transitions, set the priors. - RunKaldiCommand(""" - {command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ - nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl - """.format(command = run_opts.command, - dir = dir, alidir = alidir)) - def VerifyIterations(num_iters, num_epochs, num_hidden_layers, num_archives, max_models_combine, add_layers_period, num_jobs_final): diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index f524366c8ac..125b8219e73 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -573,7 +573,7 @@ def Train(args, run_opts): if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - PrepareInitialAcousticModel(args.dir, None, run_opts, use_raw_nnet = True) + PrepareInitialNetwork(args.dir, run_opts) # set num_iters so that as close as possible, we process the data $num_epochs diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 8c1e88cdc08..e4dd59d3416 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -2,6 +2,7 @@ # Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. @@ -207,6 +208,9 @@ def GetArgs(): e.g. 
queue.pl for launching on SGE cluster run.pl for launching on local machine """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, choices = ["true", "false"], help="Use GPU for training", default=True) @@ -232,6 +236,8 @@ def GetArgs(): type=int, default=0.1, help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + parser.add_argument("--configs-dir", type=str, + help="Use a different configs dir than dir/configs") parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, @@ -260,6 +266,11 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be positive") + + if args.configs_dir is not None: + RunKaldiCommand("cp -rT {0} {1}".format(config_dir, + '{0}/configs'.format(args.dir))) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") @@ -305,32 +316,11 @@ def ProcessArgs(args): run_opts.realign_num_jobs = args.realign_num_jobs run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -class StrToBoolAction(argparse.Action): - """ A custom action to convert bools from shell format i.e., true/false - to python format i.e., True/False """ - def __call__(self, parser, namespace, values, option_string=None): - if values == "true": - setattr(namespace, self.dest, True) - elif values == "false": - setattr(namespace, self.dest, False) - else: - raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) - -class NullstrToNoneAction(argparse.Action): - """ A custom action to convert empty strings passed by shell - to None in python. This is necessary as shell scripts print null strings - when a variable is not specified. We could use the more apt None - in python. """ - def __call__(self, parser, namespace, values, option_string=None): - if values.strip() == "": - setattr(namespace, self.dest, None) - else: - setattr(namespace, self.dest, values) - # a class to store run options class RunOpts: @@ -366,9 +356,9 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi cache_write_opt = "" if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. - cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
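A note on the computation cache handled here: only the first parallel job writes a cache of compiled nnet computations for the next iteration, and the cache is only read back on iterations where the network structure has not just changed. A minimal sketch of that logic, assuming the same directory layout; the helper name cache_opts is made up for illustration and is not part of the patch:

    def cache_opts(dir, iter, job, just_added_layer):
        # read the cache written by the previous iteration, unless a layer
        # was just added (the cached computations would no longer match)
        read_opt = ""
        if iter > 0 and not just_added_layer:
            read_opt = "--read-cache={0}/cache.{1}".format(dir, iter)
        # only job 1 writes the cache used by the next iteration
        write_opt = ""
        if job == 1:
            write_opt = "--write-cache={0}/cache.{1}".format(dir, iter + 1)
        return read_opt, write_opt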
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) process_handle = RunKaldiCommand(""" {command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ @@ -429,7 +419,6 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) if iter > 0: @@ -453,14 +442,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_num_chunk_per_minibatch = num_chunk_per_minibatch else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 try: os.remove("{0}/.error".format(dir)) @@ -476,29 +465,21 @@ def TrainOneIteration(dir, iter, srand, egs_dir, [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. - RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) + GetAverageNnetModel(dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + shrink = shrinkage_value) else: # choose the best model from different jobs - RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) + GetBestNnetModel(dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + shrink = shrinkage_value) try: for i in range(1, num_jobs + 1): @@ -538,7 +519,21 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + variables = ParseModelConfigGenericVarsFile(var_file) + + # Set some variables. 
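For reference, the inline nnet3-average / nnet3-am-copy pipelines above are replaced by calls into the shared library functions, and the decision between averaging and picking a single job is unchanged. A compact sketch of that decision, assuming the library functions shown elsewhere in this patch; the wrapper name combine_models is illustrative only:

    def combine_models(dir, iter, num_jobs, do_average, shrinkage_value, run_opts):
        # keep the jobs whose objective did not lag too far behind the best one,
        # then either average them (the usual case) or keep only the best job's
        # model (iteration 0, or right after a new layer was added)
        models_to_average, best_model = GetSuccessfulModels(
            num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
        if do_average:
            nnets_list = " ".join("{0}/{1}.{2}.raw".format(dir, iter + 1, n)
                                  for n in models_to_average)
            GetAverageNnetModel(dir=dir, iter=iter, nnets_list=nnets_list,
                                run_opts=run_opts, shrink=shrinkage_value)
        else:
            GetBestNnetModel(dir=dir, iter=iter, best_model_index=best_model,
                             run_opts=run_opts, shrink=shrinkage_value)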
+ + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -552,8 +547,6 @@ def Train(args, run_opts): """.format(command = run_opts.command, dir = args.dir)) - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: @@ -673,7 +666,7 @@ def Train(args, run_opts): right_context = right_context, min_deriv_time = min_deriv_time, momentum = args.momentum, - max_param_change= args.max_param_change, + max_param_change = args.max_param_change, shuffle_buffer_size = args.shuffle_buffer_size, cv_minibatch_size = args.cv_minibatch_size, run_opts = run_opts) @@ -730,8 +723,7 @@ def Train(args, run_opts): report_handle.write(report) report_handle.close() - os.system("steps/info/nnet3_dir_info.sh " + args.dir) - + os.system("steps/info/nnet3_dir_info.pl " + args.dir) def Main(): [args, run_opts] = GetArgs() @@ -744,15 +736,5 @@ def Main(): traceback.print_exc() raise e -def SendMail(message, subject, email_id): - try: - subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( - message = message, - subject = subject, - email = email_id), shell=True) - except Exception as e: - logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) - pass - if __name__ == "__main__": Main() From 23aa55c3c0e5a1692da8920d2aaed2d1ae85fe6c Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 29 Sep 2016 01:17:28 -0400 Subject: [PATCH 05/71] raw_python_script: tdnn make_configs.py with support for raw nnet3 --- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 28 +++++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 918def50608..d249e7152c2 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -54,10 +54,6 @@ def GetArgs(): help="Output dimension of the linear layer at the CNN output " "for dimension reduction, e.g. 256." "The default zero means this layer is not needed.", default=0) - parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 22.0", default=22.0) # General neural network options parser.add_argument("--splice-indexes", type=str, required = True, @@ -84,7 +80,7 @@ def GetArgs(): parser.add_argument("--objective-type", type=str, help = "the type of objective; i.e. 
quadratic or linear", - default="linear", choices = ["linear", "quadratic"]) + default="linear", choices = ["linear", "quadratic", "xent"]) parser.add_argument("--xent-regularize", type=float, help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", @@ -116,6 +112,13 @@ def GetArgs(): parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) + + parser.add_argument(["--cepstral-lifter","--cnn.cepstral-lifter"], type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", default = False) parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -145,6 +148,9 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + if not args.num_targets > 0: print(args.num_targets) raise Exception("num_targets has to be positive") @@ -318,7 +324,7 @@ def ParseSpliceString(splice_indexes): # The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. def MakeConfigs(config_dir, splice_indexes_string, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, - feat_dim, ivector_dim, num_targets, add_lda, + feat_dim, ivector_dim, num_targets, add_lda, add_idct, nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim, nonlin_output_dim_init, nonlin_output_dim_final, use_presoftmax_prior_scale, @@ -346,8 +352,14 @@ def MakeConfigs(config_dir, splice_indexes_string, config_lines = {'components':[], 'component-nodes':[]} + if add_idct and cnn_layer is None: + # If CNN layer is not None, IDCT will be add inside AddCnnLayers method + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if add_idct and cnn_layer is None else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -516,7 +528,7 @@ def Main(): splice_indexes_string = args.splice_indexes, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - add_lda = args.add_lda, + add_lda = args.add_lda, add_idct = args.add_idct, cnn_layer = args.cnn_layer, cnn_bottleneck_dim = args.cnn_bottleneck_dim, cepstral_lifter = args.cepstral_lifter, From d074e56d32addb7ab25a4c76c597e7eae777933e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 29 Sep 2016 02:40:41 -0400 Subject: [PATCH 06/71] raw_python_script: Refactoring DNN training --- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 194 ++++++++++++++++ egs/wsj/s5/steps/nnet3/train_dnn.py | 270 ++++------------------- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 236 +++----------------- 3 files changed, 265 insertions(+), 435 deletions(-) create mode 100644 
egs/wsj/s5/steps/nnet3/libs/train_lib.py diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py new file mode 100644 index 00000000000..7e658b950eb --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -0,0 +1,194 @@ +import subprocess +import logging +import math + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
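The k / archive_index / frame arithmetic above decides which egs archive each parallel job reads and which frame offset it trains on. A small self-contained illustration, assuming the same variable meanings; the function name job_schedule is made up here, and // stands in for the Python 2 integer division used in the script:

    def job_schedule(num_archives_processed, num_archives, frames_per_eg, num_jobs):
        schedule = []
        for job in range(1, num_jobs + 1):
            k = num_archives_processed + job - 1          # zero-based global index
            archive_index = (k % num_archives) + 1        # 1-based archive to read
            frame = (k // num_archives) % frames_per_eg   # frame offset within each eg
            schedule.append((job, archive_index, frame))
        return schedule

    # e.g. job_schedule(0, 4, 3, 2) -> [(1, 1, 0), (2, 2, 0)]: successive jobs and
    # iterations sweep through the archives first and only then advance the frame.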
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + frame = frame, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + minibatch_size = minibatch_size), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts, + compute_accuracy = True, + use_raw_nnet = False): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, use_raw_nnet = use_raw_nnet, compute_accuracy = compute_accuracy) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts, use_raw_nnet = use_raw_nnet) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. 
+ cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if use_raw_nnet: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) + else: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if use_raw_nnet: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + else: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + GetAverageNnetModel(dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + use_raw_nnet = use_raw_nnet) + else: + # choose the best model from different jobs + GetBestNnetModel(dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + use_raw_nnet = use_raw_nnet) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if use_raw_nnet: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + else: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index ff178643f38..a625f696287 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -2,10 +2,11 @@ # Copyright 2016 Vijayaditya Peddinti. 
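To summarize the layer-wise schedule implemented by the refactored TrainOneIteration above: a layer from configs/layer{N}.config is spliced in every add_layers_period iterations until all hidden layers are present, and on exactly those iterations (and on iteration 0) the single best job output is kept instead of an average. A sketch of the schedule, assuming the same variable meanings; the helper name layer_to_add is illustrative, and // replaces the Python 2 integer /:

    def layer_to_add(iter, num_hidden_layers, add_layers_period):
        # returns N such that configs/layer{N}.config is added on this
        # iteration, or None if no layer is added
        if (iter > 0 and iter % add_layers_period == 0
                and iter <= (num_hidden_layers - 1) * add_layers_period):
            return 1 + iter // add_layers_period
        return None

    # e.g. with add_layers_period=2 and num_hidden_layers=4, layer2.config,
    # layer3.config and layer4.config are added on iterations 2, 4 and 6.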
+# 2016 Vimal Manohar # Apache 2.0. -# this script is based on steps/nnet3/lstm/train.sh +# this script is based on steps/nnet3/tdnn/train.sh import subprocess @@ -18,6 +19,7 @@ from nnet3_train_lib import * nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -115,23 +117,6 @@ def GetArgs(): default=-0.25, help="") - # Realignment parameters - parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', - default=None, action=NullstrToNoneAction, - help="""Command to be used with steps/nnet3/align.sh during realignment""") - parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', - default=30, - help="Number of jobs to use for realignment") - parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', - default=None, action=NullstrToNoneAction, - help="""A space seperated string of realignment - times. Values must be between 0 and 1 - e.g. '0.1 0.2 0.3' """) - - parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', - default=True, action=StrToBoolAction, - choices = ["true", "false"], - help="If true, gpu is used with steps/nnet3/align.sh") # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', @@ -170,6 +155,9 @@ def GetArgs(): e.g. queue.pl for launching on SGE cluster run.pl for launching on local machine """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, choices = ["true", "false"], help="Use GPU for training", default=True) @@ -195,6 +183,8 @@ def GetArgs(): type=int, default=0.1, help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + parser.add_argument("--configs-dir", type=str, + help="Use a different configs dir than dir/configs") parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, @@ -217,12 +207,17 @@ def ProcessArgs(args): if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") + if args.configs_dir is not None: + RunKaldiCommand("cp -rT {0} {1}".format(config_dir, + '{0}/configs'.format(args.dir))) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs" " directory which is the output of make_configs.py script") if args.transform_dir is None: args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu run_opts = RunOpts() if args.use_gpu: @@ -248,20 +243,8 @@ def ProcessArgs(args): run_opts.prior_gpu_opt = "--use-gpu=no" run_opts.prior_queue_opt = "" - if args.realign_use_gpu is True: - run_opts.realign_use_gpu = True - run_opts.realign_queue_opt = "--gpu 1" - else: - run_opts.realign_use_gpu = False - run_opts.realign_queue_opt = "" - - if args.realign_command is None: - run_opts.realign_command = args.command - else: - run_opts.realign_command = args.realign_command - run_opts.realign_num_jobs = args.realign_num_jobs - run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] @@ -275,169 +258,6 @@ def __init__(self): self.prior_gpu_opt = None self.prior_queue_opt = None self.parallel_train_opts = None - self.realign_use_gpu = None - -# this is the main method which differs between RNN and DNN training -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, minibatch_size, - run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
- frame = (k / num_archives) % frames_per_eg - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" \ - "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - frame = frame, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - minibatch_size = minibatch_size), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size, - frames_per_eg, num_hidden_layers, add_layers_period, - left_context, right_context, - momentum, max_param_change, shuffle_buffer_size, - run_opts): - - - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts) - - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. 
- raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - - if do_average: - cur_minibatch_size = minibatch_size - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_minibatch_size = minibatch_size // 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) # args is a Namespace with the required parameters def Train(args, run_opts): @@ -461,7 +281,17 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + variables = ParseModelConfigGenericVarsFile(var_file) + + # Set some variables. + + try: + left_context = variables['model_left_context'] + right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -545,15 +375,6 @@ def Train(args, run_opts): num_archives_to_process, args.initial_effective_lrate, args.final_effective_lrate) - realign_iters = [] - if args.realign_times is not None: - realign_iters = GetRealignIters(args.realign_times, - num_iters, - args.num_jobs_initial, - args.num_jobs_final) - print(realign_iters) - # egs_dir will be updated if there is realignment - cur_egs_dir=egs_dir logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -563,29 +384,28 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if iter in realign_iters: - logger.info("Re-aligning the data at iteration {0}".format(iter)) - prev_egs_dir=cur_egs_dir - cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) - new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) - Realign(args.dir, iter, args.feat_dir, args.lang, - prev_egs_dir, cur_egs_dir, - args.prior_subset_size, num_archives, run_opts, - transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) - if args.cleanup and args.egs_dir is None: - RemoveEgs(prev_egs_dir) model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - args.minibatch_size, args.frames_per_eg, - num_hidden_layers, args.add_layers_period, - left_context, right_context, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, run_opts) + train_lib.TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -624,14 +444,14 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, cur_egs_dir, + CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 
84667eeee45..3432e5df9df 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -19,6 +19,7 @@ from nnet3_train_lib import * nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -27,13 +28,13 @@ formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') handler.setFormatter(formatter) logger.addHandler(handler) -logger.info('Starting DNN trainer (train_raw_dnn.py)') +logger.info('Starting raw DNN trainer (train_raw_dnn.py)') def GetArgs(): # we add compulsary arguments as named arguments for readability parser = argparse.ArgumentParser(description=""" - Trains a feed forward DNN raw acoustic model (without transition model) + Trains a feed forward raw DNN (without transition model) using the cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs. """, @@ -52,14 +53,6 @@ def GetArgs(): parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") - parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default = 0, - help="""Number of left steps used in the estimation of LSTM - state before prediction of the first label""") - parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', - default = 0, - help="""Number of right steps used in the estimation of BLSTM - state before prediction of the first label""") parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', default = None, action = NullstrToNoneAction, help="""String to provide options directly to steps/nnet3/get_egs.sh script""") @@ -124,7 +117,7 @@ def GetArgs(): # Parameters for the optimization - parser.add_argument("--trainer.optimization.minibatch-size", type=int, dest='minibatch_size', + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default = 512, help="Size of the minibatch used to compute the gradient") parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', @@ -215,13 +208,6 @@ def ProcessArgs(args): if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") - if args.chunk_left_context < 0: - raise Exception("--egs.chunk-left-context should be positive") - - if args.chunk_right_context < 0: - raise Exception("--egs.chunk-right-context should be positive") - - if args.configs_dir is not None: RunKaldiCommand("cp -rT {0} {1}".format(config_dir, '{0}/configs'.format(args.dir))) @@ -271,176 +257,6 @@ def __init__(self): self.prior_queue_opt = None self.parallel_train_opts = None -# this is the main method which differs between RNN and DNN training -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, minibatch_size, - cache_read_opt, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. 
- # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - frame = (k / num_archives) % frames_per_eg - - cache_write_opt = "" - if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. - cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" \ - "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - frame = frame, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - minibatch_size = minibatch_size), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size, - frames_per_eg, num_hidden_layers, add_layers_period, - left_context, right_context, - momentum, max_param_change, shuffle_buffer_size, - compute_accuracy, - run_opts, use_raw_nnet = True): - - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, use_raw_nnet = True, compute_accuracy = compute_accuracy) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, use_raw_nnet = True) - - # an option for writing cache (storing pairs of nnet-computations - # and computation-requests) during training. - cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) - - if do_average: - cur_minibatch_size = minibatch_size - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_minibatch_size = minibatch_size / 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - GetAverageNnetModel(dir = dir, iter = iter, - nnets_list = " ".join(nnets_list), - run_opts = run_opts, - use_raw_nnet = True) - else: - # choose the best model from different jobs - GetBestNnetModel(dir = dir, iter = iter, - best_model_index = best_model, - run_opts = run_opts, - use_raw_nnet = True) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.raw".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) - if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -542,8 +358,8 @@ def Train(args, run_opts): if (args.stage <= -1): - logger.info("Preparing the initial acoustic model.") - PrepareInitialAcousticModel(args.dir, None, run_opts, use_raw_nnet = True) + logger.info("Preparing the initial network.") + PrepareInitialNetwork(args.dir, run_opts) # set num_iters so that as close as possible, we process the data $num_epochs @@ -577,26 +393,26 @@ def Train(args, run_opts): logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - minibatch_size = args.minibatch_size, - frames_per_eg = args.frames_per_eg, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - compute_accuracy = compute_accuracy, - use_raw_nnet = True, - run_opts = run_opts) + train_lib.TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts, + compute_accuracy = compute_accuracy, + use_raw_nnet = True) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, From 14db04643df7b13ad8d981a1d6cac0b113ca9be1 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 30 Sep 2016 11:03:53 -0400 Subject: [PATCH 07/71] raw_python_script: Minor bug fixes --- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 5 ++++- egs/wsj/s5/steps/nnet3/nnet3_train_lib.py | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index 7e658b950eb..2cf25d650ec 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -1,6 +1,10 @@ import subprocess import logging import math +import imp + +nnet3_train_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py') +from nnet3_train_lib import * logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -10,7 +14,6 @@ handler.setFormatter(formatter) logger.addHandler(handler) - # this is the main method which differs between RNN and DNN training def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, diff --git 
a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py
index 903ffde10c6..a01f39bafff 100644
--- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py
+++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py
@@ -460,9 +460,12 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts,
     import glob
     for file in glob.glob('{0}/pdf_counts.*'.format(dir)):
         os.remove(file)
-
-    smooth=0.01
     pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0]
+
+    smooth = 0.01
+    WritePresoftmaxPriorScaleVector(dir, pdf_counts, smooth = smooth)
+
+def WritePresoftmaxPriorScaleVector(dir, pdf_counts, smooth = 0.01):
     total = sum(pdf_counts)
     average_count = total/len(pdf_counts)
     scales = []
@@ -694,7 +697,7 @@ def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, use_r
 
 def CombineModels(dir, num_iters, num_iters_combine, egs_dir,
                   run_opts, chunk_width = None,
-                  use_raw_nnet = False):
+                  use_raw_nnet = False, compute_accuracy = True):
     # Now do combination.  In the nnet3 setup, the logic
     # for doing averaging of subsets of the models in the case where
     # there are too many models to reliably esetimate interpolation
@@ -737,10 +740,13 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir,
                out_model = out_model,
                egs_dir = egs_dir))
 
-    # Compute the probability of the final, combined model with
-    # the same subset we used for the previous compute_probs, as the
-    # different subsets will lead to different probs.
-    ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False)
+    # Compute the probability of the final, combined model with
+    # the same subset we used for the previous compute_probs, as the
+    # different subsets will lead to different probs.
+    if use_raw_nnet:
+        ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, wait = False, use_raw_nnet = True, compute_accuracy = compute_accuracy)
+    else:
+        ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False)
 
 def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, prior_subset_size,
                             run_opts, use_raw_nnet = False):

From 0782aab54e88812a74cdc27845745ecf05cd4fd7 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Sat, 1 Oct 2016 18:20:26 -0400
Subject: [PATCH 08/71] raw_python_script: Refactoring RNN and DNN scripts

---
 egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 348 +++++++++++++
 egs/wsj/s5/steps/nnet3/libs/train_lib.py     | 166 +++++-
 egs/wsj/s5/steps/nnet3/lstm/make_configs.py  |   6 +-
 egs/wsj/s5/steps/nnet3/nnet3_train_lib.py    |  16 +-
 egs/wsj/s5/steps/nnet3/tdnn/make_configs.py  |   4 +-
 .../s5/steps/nnet3/tdnn/make_raw_configs.py  | 492 ------------------
 egs/wsj/s5/steps/nnet3/train_dnn.py          | 178 +------
 egs/wsj/s5/steps/nnet3/train_raw_dnn.py      | 187 +------
 egs/wsj/s5/steps/nnet3/train_raw_rnn.py      | 373 ++-----------
 egs/wsj/s5/steps/nnet3/train_rnn.py          | 378 +-------------
 10 files changed, 603 insertions(+), 1545 deletions(-)
 create mode 100644 egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py
 delete mode 100755 egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py

diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py
new file mode 100644
index 00000000000..7a21b28c87a
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python
+
+
+# Copyright 2016    Vijayaditya Peddinti.
+#                   2016    Vimal Manohar
+# Apache 2.0.
+ +import logging +import imp + +imp.load_source('nnet3_train_lib', 'steps/nnet3/nnet3_train_lib.py') +import nnet3_train_lib +from nnet3_train_lib import * + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def AddCommonTrainArgs(parser): + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter MOD preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + cv_minibatch_size, run_opts, + compute_accuracy = True, use_raw_nnet = False): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = use_raw_nnet, compute_accuracy = compute_accuracy) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = use_raw_nnet) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. 
+ cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if use_raw_nnet: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if use_raw_nnet: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + else: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + cache_read_opt, run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + GetAverageNnetModel(dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + use_raw_nnet = use_raw_nnet, + shrink = shrinkage_value) + + else: + # choose the best model from different jobs + GetBestNnetModel(dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + use_raw_nnet = use_raw_nnet, + shrink = shrinkage_value) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if use_raw_nnet: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + else: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index 2cf25d650ec..27042b4d1b5 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -1,9 +1,16 @@ -import subprocess +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + import logging import math import imp -nnet3_train_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py') +imp.load_source('nnet3_train_lib', 'steps/nnet3/nnet3_train_lib.py') +import nnet3_train_lib from nnet3_train_lib import * logger = logging.getLogger(__name__) @@ -14,6 +21,152 @@ handler.setFormatter(formatter) logger.addHandler(handler) +def AddCommonTrainArgs(parser): + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. 
If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. + (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter MOD preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + # this is the main method which differs between RNN and DNN training def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, @@ -85,8 +238,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context, right_context, momentum, max_param_change, shuffle_buffer_size, run_opts, - compute_accuracy = True, - use_raw_nnet = False): + compute_accuracy = True, use_raw_nnet = False): # Set off jobs doing some diagnostics, in the background. 
@@ -121,9 +273,9 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cur_num_hidden_layers = 1 + iter / add_layers_period config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) if use_raw_nnet: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) else: - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) else: do_average = True if iter == 0: @@ -193,5 +345,3 @@ def TrainOneIteration(dir, iter, srand, egs_dir, raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): os.remove("{0}/cache.{1}".format(dir, iter)) - - diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index b0c9f2b591c..f564a21beca 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -57,8 +57,8 @@ def GetArgs(): help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", choices=['true', 'false'], default = False) parser.add_argument("--objective-type", type=str, default="linear", - choices = ["linear", "quadratic", "xent"], - help = "the type of objective; i.e. quadratic or linear or cross-entropy per dim") + choices = ["linear", "quadratic"], + help = "the type of objective; i.e. quadratic or linear") # LSTM options parser.add_argument("--num-lstm-layers", type=int, @@ -98,7 +98,7 @@ def GetArgs(): parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", help="The factor used for determining the liftering vector in the production of MFCC. " "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 22.0", default=0) + "e.g. 
22.0", default=22.0) parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, help="Add an IDCT after input to convert MFCC to Fbank", default = False, choices = ["true", "false"]) diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a01f39bafff..01a76f30214 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -440,7 +440,7 @@ def ForceSymlink(file1, file2): os.symlink(file1, file2) def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, - presoftmax_prior_scale_power = None): + presoftmax_prior_scale_power = -0.25): # getting the raw pdf count RunKaldiCommand(""" @@ -461,11 +461,13 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, for file in glob.glob('{0}/pdf_counts.*'.format(dir)): os.remove(file) pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + scaled_counts = SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = presoftmax_prior_scale_power, smooth = 0.01) - smooth = 0.01 - WritePresoftmaxPriorScaleVector(dir, pdf_counts, smooth = smooth) + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) -def WritePresoftmaxPriorScaleVector(dir, pdf_counts, smooth = 0.01) +def SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = -0.25, smooth = 0.01): total = sum(pdf_counts) average_count = total/len(pdf_counts) scales = [] @@ -473,17 +475,15 @@ def WritePresoftmaxPriorScaleVector(dir, pdf_counts, smooth = 0.01) scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + return scaled_counts - output_file = "{0}/presoftmax_prior_scale.vec".format(dir) - WriteKaldiMatrix(output_file, [scaled_counts]) - ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) def PrepareInitialAcousticModel(dir, alidir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - PrepareInitialNetwork(dir, run_opts): + PrepareInitialNetwork(dir, run_opts) # Convert to .mdl, train the transitions, set the priors. RunKaldiCommand(""" diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index d249e7152c2..ed9a34e2a67 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -80,7 +80,7 @@ def GetArgs(): parser.add_argument("--objective-type", type=str, help = "the type of objective; i.e. 
quadratic or linear", - default="linear", choices = ["linear", "quadratic", "xent"]) + default="linear", choices = ["linear", "quadratic"]) parser.add_argument("--xent-regularize", type=float, help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", @@ -359,7 +359,7 @@ def MakeConfigs(config_dir, splice_indexes_string, config_files={} prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim, - idct_mat = config_dir.strip() + "/idct.mat" if add_idct and cnn_layer is None else None) + idct_mat = config_dir.strip() + "/idct.mat" if (add_idct and cnn_layer is None) else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py deleted file mode 100755 index 179143bc916..00000000000 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_raw_configs.py +++ /dev/null @@ -1,492 +0,0 @@ -#!/usr/bin/env python - -# we're using python 3.x style print but want it to work in python 2.x, -from __future__ import print_function -import os -import argparse -import shlex -import sys -import warnings -import copy -import imp -import ast - -nodes = imp.load_source('', 'steps/nnet3/components.py') -nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') -chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') - -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for TDNNs creation and training", - epilog="See steps/nnet3/tdnn/train.sh for example.") - - # Only one of these arguments can be specified, and one of them has to - # be compulsarily specified - feat_group = parser.add_mutually_exclusive_group(required = True) - feat_group.add_argument("--feat-dim", type=int, - help="Raw feature dimension, e.g. 13") - feat_group.add_argument("--feat-dir", type=str, - help="Feature directory, from which we derive the feat-dim") - - # only one of these arguments can be specified - ivector_group = parser.add_mutually_exclusive_group(required = False) - ivector_group.add_argument("--ivector-dim", type=int, - help="iVector dimension, e.g. 100", default=0) - ivector_group.add_argument("--ivector-dir", type=str, - help="iVector dir, which will be used to derive the ivector-dim ", default=None) - - num_target_group = parser.add_mutually_exclusive_group(required = True) - num_target_group.add_argument("--num-targets", type=int, - help="number of network targets (e.g. num-pdf-ids/num-leaves)") - num_target_group.add_argument("--ali-dir", type=str, - help="alignment directory, from which we derive the num-targets") - num_target_group.add_argument("--tree-dir", type=str, - help="directory with final.mdl, from which we derive the num-targets") - - # CNN options - parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", - help="CNN parameters at each CNN layer, e.g. 
--filt-x-dim=3 --filt-y-dim=8 " - "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 " - "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, " - "when CNN layers are used, no LDA will be added", default = None) - parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim", - help="Output dimension of the linear layer at the CNN output " - "for dimension reduction, e.g. 256." - "The default zero means this layer is not needed.", default=0) - - # General neural network options - parser.add_argument("--splice-indexes", type=str, required = True, - help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' " - "If CNN layers are used the first set of splice indexes will be used as input " - "to the first CNN layer and later splice indexes will be interpreted as indexes " - "for the TDNNs.") - parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, - help="If \"true\" an LDA matrix computed from the input features " - "(spliced according to the first set of splice-indexes) will be used as " - "the first Affine layer. This affine layer's parameters are fixed during training. " - "If --cnn.layer is specified this option will be forced to \"false\".", - default=True, choices = ["false", "true"]) - - parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add the final softmax layer ", default=True, choices = ["false", "true"]) - parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add a final sigmoid layer as alternate to log-softmax-layer. " - "Can only be used if include-log-softmax is false. " - "This is useful in cases where you want the output to be " - "like probabilities between 0 and 1. Typically the nnet " - "is trained with an objective such as quadratic", - default=False, choices = ["false", "true"]) - - parser.add_argument("--objective-type", type=str, - help = "the type of objective; i.e. 
quadratic or linear", - default="linear", choices = ["linear", "quadratic", "xent"]) - parser.add_argument("--xent-regularize", type=float, - help="For chain models, if nonzero, add a separate output for cross-entropy " - "regularization (with learning-rate-factor equal to the inverse of this)", - default=0.0) - parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, - help="if using --xent-regularize, gives it separate last-but-one weight matrix", - default=False, choices = ["false", "true"]) - parser.add_argument("--final-layer-normalize-target", type=float, - help="RMS target for final layer (set to <1 if final layer learns too fast", - default=1.0) - parser.add_argument("--subset-dim", type=int, default=0, - help="dimension of the subset of units to be sent to the central frame") - - hidden_dim_group = parser.add_mutually_exclusive_group(required = True) - hidden_dim_group.add_argument("--relu-dim", type=int, - help="dimension of ReLU nonlinearities") - hidden_dim_group.add_argument("--relu-dims", type=str, - help="dimension of ReLU nonlinearities") - - parser.add_argument("--self-repair-scale-nonlinearity", type=float, - help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) - - - parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, - help="if true, a presoftmax-prior-scale is added", - choices=['true', 'false'], default = True) - - parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 22.0", default=22.0) - parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, - help="Add an IDCT after input to convert MFCC to Fbank", default = False) - parser.add_argument("config_dir", - help="Directory to write config files and variables") - - print(' '.join(sys.argv)) - - args = parser.parse_args() - args = CheckArgs(args) - - return args - -def CheckArgs(args): - if not os.path.exists(args.config_dir): - os.makedirs(args.config_dir) - - ## Check arguments. 
- if args.feat_dir is not None: - args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) - - if args.ali_dir is not None: - args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) - elif args.tree_dir is not None: - args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) - - if args.ivector_dir is not None: - args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) - - if not args.feat_dim > 0: - raise Exception("feat-dim has to be postive") - - if args.add_lda and args.add_idct: - raise Exception("add-idct can be true only if add-lda is false") - - if not args.num_targets > 0: - print(args.num_targets) - raise Exception("num_targets has to be positive") - - if not args.ivector_dim >= 0: - raise Exception("ivector-dim has to be non-negative") - - if (args.subset_dim < 0): - raise Exception("--subset-dim has to be non-negative") - - args.nonlin_output_dims = args.relu_dims - args.nonlin_output_dim = args.relu_dim - - if args.add_final_sigmoid and args.include_log_softmax: - raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.") - - if args.xent_separate_forward_affine and args.add_final_sigmoid: - raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true") - - if args.add_lda and args.cnn_layer is not None: - args.add_lda = False - warnings.warn("--add-lda is set to false as CNN layers are used.") - - return args - -def AddConvMaxpLayer(config_lines, name, input, args): - if '3d-dim' not in input: - raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.") - - input = nodes.AddConvolutionLayer(config_lines, name, input, - input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], - args.filt_x_dim, args.filt_y_dim, - args.filt_x_step, args.filt_y_step, - args.num_filters, input['vectorization']) - - if args.pool_x_size > 1 or args.pool_y_size > 1 or args.pool_z_size > 1: - input = nodes.AddMaxpoolingLayer(config_lines, name, input, - input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], - args.pool_x_size, args.pool_y_size, args.pool_z_size, - args.pool_x_step, args.pool_y_step, args.pool_z_step) - - return input - -# The ivectors are processed through an affine layer parallel to the CNN layers, -# then concatenated with the CNN output and passed to the deeper part of the network. 
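# Illustrative sketch (dimensions and node names below are assumed, not taken from this patch):
# with a CNN stack whose output has dimension 512 and ivector_dim = 100, the function below first
# pins the ivector to frame t=0 via 'ReplaceIndex(ivector, t, 0)', passes it through its own affine
# layer, and then returns a descriptor of the form 'Append(<cnn-output>, <ivector-affine-output>)'
# with dimension 512 + 100 = 612, which feeds the TDNN layers that follow.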
-def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): - cnn_args = ParseCnnString(cnn_layer) - num_cnn_layers = len(cnn_args) - # We use an Idct layer here to convert MFCC to FBANK features - nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") - prev_layer_output = {'descriptor': "input", - 'dimension': feat_dim} - prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') - - list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] - splice_descriptor = "Append({0})".format(", ".join(list)) - cnn_input_dim = len(splice_indexes) * feat_dim - prev_layer_output = {'descriptor': splice_descriptor, - 'dimension': cnn_input_dim, - '3d-dim': [len(splice_indexes), feat_dim, 1], - 'vectorization': 'yzx'} - - for cl in range(0, num_cnn_layers): - prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) - - if cnn_bottleneck_dim > 0: - prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") - - if ivector_dim > 0: - iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', - 'dimension': ivector_dim} - iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") - prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) - prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] - - return prev_layer_output - -def PrintConfig(file_name, config_lines): - f = open(file_name, 'w') - f.write("\n".join(config_lines['components'])+"\n") - f.write("\n#Component nodes\n") - f.write("\n".join(config_lines['component-nodes'])) - f.close() - -def ParseCnnString(cnn_param_string_list): - cnn_parser = argparse.ArgumentParser(description="cnn argument parser") - - cnn_parser.add_argument("--filt-x-dim", required=True, type=int) - cnn_parser.add_argument("--filt-y-dim", required=True, type=int) - cnn_parser.add_argument("--filt-x-step", type=int, default = 1) - cnn_parser.add_argument("--filt-y-step", type=int, default = 1) - cnn_parser.add_argument("--num-filters", required=True, type=int) - cnn_parser.add_argument("--pool-x-size", type=int, default = 1) - cnn_parser.add_argument("--pool-y-size", type=int, default = 1) - cnn_parser.add_argument("--pool-z-size", type=int, default = 1) - cnn_parser.add_argument("--pool-x-step", type=int, default = 1) - cnn_parser.add_argument("--pool-y-step", type=int, default = 1) - cnn_parser.add_argument("--pool-z-step", type=int, default = 1) - - cnn_args = [] - for cl in range(0, len(cnn_param_string_list)): - cnn_args.append(cnn_parser.parse_args(shlex.split(cnn_param_string_list[cl]))) - - return cnn_args - -def ParseSpliceString(splice_indexes): - splice_array = [] - left_context = 0 - right_context = 0 - split1 = splice_indexes.split(); # we already checked the string is nonempty. 
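#   Worked example (input string assumed for illustration): for
#     splice_indexes = "-2,-1,0,1,2 -1,2 -3,3 0"
#   the loop below yields
#     splice_indexes    = [[-2, -1, 0, 1, 2], [-1, 2], [-3, 3], [0]]
#     left_context      = 2 + 1 + 3 + 0 = 6   (sum of -first_offset over the groups)
#     right_context     = 2 + 2 + 3 + 0 = 7   (sum of last_offset over the groups)
#     num_hidden_layers = 4                   (one hidden layer per space-separated group)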
- if len(split1) < 1: - raise Exception("invalid splice-indexes argument, too short: " - + splice_indexes) - try: - for string in split1: - split2 = string.split(",") - if len(split2) < 1: - raise Exception("invalid splice-indexes argument, too-short element: " - + splice_indexes) - int_list = [] - for int_str in split2: - int_list.append(int(int_str)) - if not int_list == sorted(int_list): - raise Exception("elements of splice-indexes must be sorted: " - + splice_indexes) - left_context += -int_list[0] - right_context += int_list[-1] - splice_array.append(int_list) - except ValueError as e: - raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) - left_context = max(0, left_context) - right_context = max(0, right_context) - - return {'left_context':left_context, - 'right_context':right_context, - 'splice_indexes':splice_array, - 'num_hidden_layers':len(splice_array) - } - -# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. -def MakeConfigs(config_dir, splice_indexes_string, - cnn_layer, cnn_bottleneck_dim, cepstral_lifter, - feat_dim, ivector_dim, num_targets, add_lda, - nonlin_output_dim, nonlin_output_dims, subset_dim, - use_presoftmax_prior_scale, - final_layer_normalize_target, - include_log_softmax, - add_final_sigmoid, - xent_regularize, - xent_separate_forward_affine, - self_repair_scale, - objective_type): - - parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) - - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] - num_hidden_layers = parsed_splice_output['num_hidden_layers'] - splice_indexes = parsed_splice_output['splice_indexes'] - input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim - - if nonlin_output_dims is None: - nonlin_output_dims = [ nonlin_output_dim for x in range(0, num_hidden_layers)] - else: - nonlin_output_dims = [ int(x) for x in nonlin_output_dims.split() ] - - assert len(nonlin_output_dims) == num_hidden_layers - - if xent_separate_forward_affine: - if splice_indexes[-1] != [0]: - raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. 
Please use a splice-indexes with just 0 as the final splicing config.") - - prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) - - config_lines = {'components':[], 'component-nodes':[]} - - if add_idct and cnn_layer is None: - nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") - - config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], - ivector_dim, - idct_mat = config_dir.strip() + "/idct.mat" if add_idct and cnn_layer is None else None) - - # Add the init config lines for estimating the preconditioning matrices - init_config_lines = copy.deepcopy(config_lines) - init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') - init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputLayer(init_config_lines, prev_layer_output) - config_files[config_dir + '/init.config'] = init_config_lines - - if cnn_layer is not None: - prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, - feat_dim, splice_indexes[0], ivector_dim) - - if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - - left_context = 0 - right_context = 0 - # we moved the first splice layer to before the LDA.. - # so the input to the first affine layer is going to [0] index - splice_indexes[0] = [0] - - for i in range(0, num_hidden_layers): - # make the intermediate config file for layerwise discriminative training - - # prepare the spliced input - if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): - try: - zero_index = splice_indexes[i].index(0) - except ValueError: - zero_index = None - # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor - prev_layer_output_descriptor = prev_layer_output['descriptor'] - subset_output = prev_layer_output - if subset_dim > 0: - # if subset_dim is specified the script expects a zero in the splice indexes - assert(zero_index is not None) - subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) - subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), - 'dimension' : subset_dim} - config_lines['component-nodes'].append(subset_node_config) - appended_descriptors = [] - appended_dimension = 0 - for j in range(len(splice_indexes[i])): - if j == zero_index: - appended_descriptors.append(prev_layer_output['descriptor']) - appended_dimension += prev_layer_output['dimension'] - continue - appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) - appended_dimension += subset_output['dimension'] - prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), - 'dimension' : appended_dimension} - else: - # this is a normal affine node - pass - - if xent_separate_forward_affine and i == num_hidden_layers - 1: - if xent_regularize == 0.0: - raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") - - prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_output_dims[i], - self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) - - - nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, - use_presoftmax_prior_scale = 
use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = include_log_softmax) - - - prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_output_dims[i], - self_repair_scale = self_repair_scale, - norm_target_rms = final_layer_normalize_target) - - nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, - ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( - 0.5 / xent_regularize), - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = True, - name_affix = 'xent') - else: - prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_output_dims[i], - self_repair_scale = self_repair_scale, - norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) - - # a final layer is added after each new layer as we are generating - # configs for layer-wise discriminative training - - # add_final_sigmoid adds a sigmoid as a final layer as alternative - # to log-softmax layer. - # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers - # This is useful when you need the final outputs to be probabilities between 0 and 1. - # Usually used with an objective-type such as "quadratic". - # Applications are k-binary classification such Ideal Ratio Mask prediction. - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = include_log_softmax, - add_final_sigmoid = add_final_sigmoid, - objective_type = objective_type) - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( - 0.5 / xent_regularize), - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = True, - name_affix = 'xent') - - config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines - config_lines = {'components':[], 'component-nodes':[]} - - left_context += int(parsed_splice_output['left_context']) - right_context += int(parsed_splice_output['right_context']) - - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - print('num_targets=' + str(num_targets), file=f) - print('add_lda=' + ('true' if add_lda else 'false'), file=f) - print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) - print('objective_type=' + objective_type, file=f) - f.close() - - # printing out the configs - # init.config used to train lda-mllt train - for key in config_files.keys(): - PrintConfig(key, config_files[key]) - -def Main(): - args = GetArgs() - - MakeConfigs(config_dir = args.config_dir, - splice_indexes_string = args.splice_indexes, - feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, - num_targets = args.num_targets, - add_lda = args.add_lda, add_idct = args.add_idct, - cnn_layer = args.cnn_layer, - cnn_bottleneck_dim = args.cnn_bottleneck_dim, - cepstral_lifter = args.cepstral_lifter, - nonlin_output_dim = args.nonlin_output_dim, - 
nonlin_output_dims = args.nonlin_output_dims, - subset_dim = args.subset_dim, - use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, - final_layer_normalize_target = args.final_layer_normalize_target, - include_log_softmax = args.include_log_softmax, - add_final_sigmoid = args.add_final_sigmoid, - xent_regularize = args.xent_regularize, - xent_separate_forward_affine = args.xent_separate_forward_affine, - self_repair_scale = args.self_repair_scale_nonlinearity, - objective_type = args.objective_type) - -if __name__ == "__main__": - Main() - diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index a625f696287..ac158cc19dc 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -18,8 +18,8 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -37,158 +37,28 @@ def GetArgs(): Trains a feed forward DNN acoustic model using the cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs. """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') + + train_lib.AddCommonTrainArgs(parser) + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. 
volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help="Controls randomization of the samples on each" - "iteration. If 0 or a large value the randomization is" - "complete, but this will consume memory and cause spikes" - "in disk I/O. Smaller is easier on disk and memory but" - "less random. It's not a huge deal though, as samples" - "are anyway randomized right at the start." - "(the point of this is to get data in different" - "minibatches on different iterations, since in the" - "preconditioning method, 2 samples in the same minibatch" - "can affect each others' gradients.") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="The maximum change in parameters allowed per minibatch," - "measured in Frobenius norm over the entire model") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', - default=-0.25, - help="") - - # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default = 512, help="Size of the minibatch used to compute the gradient") - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - 
default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. - Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") + + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") - - parser.add_argument("--configs-dir", type=str, - help="Use a different configs dir than dir/configs") parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, - help="Languade directory") + help="Language directory") parser.add_argument("--ali-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -207,10 +77,6 @@ def ProcessArgs(args): if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") - if args.configs_dir is not None: - RunKaldiCommand("cp -rT {0} {1}".format(config_dir, - '{0}/configs'.format(args.dir))) - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs" " directory which is the output of make_configs.py script") @@ -219,7 +85,7 @@ def ProcessArgs(args): args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -249,16 +115,6 @@ def ProcessArgs(args): return [args, run_opts] -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -418,7 +274,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -451,7 +307,7 @@ def Train(args, run_opts): # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -466,7 +322,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 3432e5df9df..7c6c852ede4 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -18,8 +18,8 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -38,153 +38,23 @@ 
def GetArgs(): using the cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs. """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') + + train_lib.AddCommonTrainArgs(parser) + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help="Controls randomization of the samples on each" - "iteration. If 0 or a large value the randomization is" - "complete, but this will consume memory and cause spikes" - "in disk I/O. Smaller is easier on disk and memory but" - "less random. It's not a huge deal though, as samples" - "are anyway randomized right at the start." 
- "(the point of this is to get data in different" - "minibatches on different iterations, since in the" - "preconditioning method, 2 samples in the same minibatch" - "can affect each others' gradients.") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="The maximum change in parameters allowed per minibatch," - "measured in Frobenius norm over the entire model") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - - # Parameters for the optimization + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default = 512, help="Size of the minibatch used to compute the gradient") - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. - Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") + # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. 
queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--configs-dir", type=str, - help="Use a different configs dir than dir/configs") parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, default = True, choices = ["true", "false"], help="Train neural network using dense targets") @@ -208,16 +78,12 @@ def ProcessArgs(args): if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") - if args.configs_dir is not None: - RunKaldiCommand("cp -rT {0} {1}".format(config_dir, - '{0}/configs'.format(args.dir))) - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs" - " directory which is the output of make_configs.py script") + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -247,21 +113,12 @@ def ProcessArgs(args): return [args, run_opts] -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Set some variables. 
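    # Example (values assumed for illustration) of the {dir}/configs/vars file whose entries are
    # read a few lines below; it is produced by the make_configs.py config-generation step and
    # typically contains one key=value pair per line, e.g.:
    #
    #   model_left_context=6
    #   model_right_context=7
    #   num_hidden_layers=4
    #   num_targets=3000
    #   add_lda=true
    #   include_log_softmax=true
    #   objective_type=linear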
feat_dim = GetFeatDim(args.feat_dir) ivector_dim = GetIvectorDim(args.online_ivector_dir) @@ -276,8 +133,8 @@ def Train(args, run_opts): # Set some variables. try: - model_left_context = variables['model_left_context'] - model_right_context = variables['model_right_context'] + left_context = variables['model_left_context'] + right_context = variables['model_right_context'] num_hidden_layers = variables['num_hidden_layers'] num_targets = int(variables['num_targets']) add_lda = StrToBool(variables['add_lda']) @@ -286,10 +143,6 @@ def Train(args, run_opts): except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( str(e), '{0}/configs'.format(args.dir))) - - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context - # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -315,7 +168,7 @@ def Train(args, run_opts): compute_accuracy = False else: target_type = "sparse" - compute_accuracy = True + compute_accuracy = True if objective_type == "linear" else False if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") @@ -425,13 +278,13 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, use_raw_nnet = True) + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, use_raw_nnet = True, compute_accuracy = compute_accuracy) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") @@ -454,7 +307,7 @@ def Train(args, run_opts): # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -469,7 +322,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 125b8219e73..92334ad3295 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -1,14 +1,11 @@ #!/usr/bin/env python - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. 
- # this script is based on steps/nnet3/lstm/train.sh - import subprocess import argparse import sys @@ -18,7 +15,8 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -44,16 +42,10 @@ def GetArgs(): at the non-linearities are below a threshold. 3. RNNs can also be trained with state preservation training """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") + train_lib.AddCommonTrainArgs(parser) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -70,58 +62,6 @@ def GetArgs(): default = 0, help="""Number of right steps used in the estimation of BLSTM state before prediction of the first label""") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. 
If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. It's not a huge deal though, as samples - are anyway randomized right at the start. - (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -129,32 +69,8 @@ def GetArgs(): for chunk_width=20, this value (20k) is equivalent to the 400k number that we use as a default in regular DNN training.""") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default = 0.5, help="""Momentum used in update computation. @@ -181,48 +97,9 @@ def GetArgs(): help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. 
queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--configs-dir", type=str, - help="Use a different configs dir than dir/configs") parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, default = True, choices = ["true", "false"], help="Train neural network using dense targets") @@ -252,17 +129,12 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be positive") - - if args.configs_dir is not None: - RunKaldiCommand("cp -rT {0} {1}".format(config_dir, - '{0}/configs'.format(args.dir))) - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -292,189 +164,12 @@ def ProcessArgs(args): return [args, run_opts] - -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - cache_read_opt, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. 
- # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - - cache_write_opt = "" - if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. - cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ - "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, - momentum, max_param_change, shuffle_buffer_size, - cv_minibatch_size, compute_accuracy, - run_opts, use_raw_nnet = True): - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = True, compute_accuracy = compute_accuracy) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = True) - - # an option for writing cache (storing pairs of nnet-computations - # and computation-requests) during training. - cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - GetAverageNnetModel(dir = dir, iter = iter, - nnets_list = " ".join(nnets_list), - run_opts = run_opts, - use_raw_nnet = True, - shrink = shrinkage_value) - - else: - # choose the best model from different jobs - GetBestNnetModel(dir = dir, iter = iter, - best_model_index = best_model, - run_opts = run_opts, - use_raw_nnet = True, - shrink = shrinkage_value) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.raw".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) - if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Set some variables. feat_dim = GetFeatDim(args.feat_dir) ivector_dim = GetIvectorDim(args.online_ivector_dir) @@ -528,7 +223,7 @@ def Train(args, run_opts): compute_accuracy = False else: target_type = "sparse" - compute_accuracy = True + compute_accuracy = True if objective_type == "linear" else False if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") @@ -613,28 +308,28 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, use_raw_nnet = True) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - compute_accuracy = compute_accuracy, - run_opts = run_opts, - use_raw_nnet = True) + train_lib.TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = args.cv_minibatch_size, + run_opts = run_opts, + compute_accuracy = compute_accuracy, + use_raw_nnet = True) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -647,14 +342,14 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - chunk_width = args.chunk_width, use_raw_nnet = True) + chunk_width = 
args.chunk_width, use_raw_nnet = True, compute_accuracy = compute_accuracy) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") @@ -677,7 +372,7 @@ def Train(args, run_opts): # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -692,7 +387,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index e4dd59d3416..826c5c084f6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -18,7 +18,8 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -44,16 +45,10 @@ def GetArgs(): at the non-linearities are below a threshold. 3. RNNs can also be trained with state preservation training """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") + train_lib.AddCommonTrainArgs(parser) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -70,58 +65,6 @@ def GetArgs(): default = 0, help="""Number of right steps used in the estimation of BLSTM state before prediction of the first label""") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. 
" - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. It's not a huge deal though, as samples - are anyway randomized right at the start. - (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -129,49 +72,8 @@ def GetArgs(): for chunk_width=20, this value (20k) is equivalent to the 400k number that we use as a default in regular DNN training.""") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - # Realignment parameters - parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', - default=None, action=NullstrToNoneAction, - help="""Command to be used with steps/nnet3/align.sh during realignment""") - parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', - default=30, - help="Number of jobs to use for realignment") - parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', - default=None, action=NullstrToNoneAction, - help="""A space seperated string of realignment - times. Values must be between 0 and 1 - e.g. 
'0.1 0.2 0.3' """) - - parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', - default=True, action=StrToBoolAction, - choices = ["true", "false"], - help="If true, gpu is used with steps/nnet3/align.sh") # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default = 0.5, help="""Momentum used in update computation. @@ -198,50 +100,10 @@ def GetArgs(): help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. 
""") - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") - - parser.add_argument("--configs-dir", type=str, - help="Use a different configs dir than dir/configs") parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, - help="Languade directory") + help="Language directory") parser.add_argument("--ali-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -266,19 +128,15 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be positive") - - if args.configs_dir is not None: - RunKaldiCommand("cp -rT {0} {1}".format(config_dir, - '{0}/configs'.format(args.dir))) - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") if args.transform_dir is None: args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -302,201 +160,12 @@ def ProcessArgs(args): run_opts.prior_gpu_opt = "--use-gpu=no" run_opts.prior_queue_opt = "" - if args.realign_use_gpu is True: - run_opts.realign_use_gpu = True - run_opts.realign_queue_opt = "--gpu 1" - else: - run_opts.realign_use_gpu = False - run_opts.realign_queue_opt = "" - - if args.realign_command is None: - run_opts.realign_command = args.command - else: - run_opts.realign_command = args.realign_command - run_opts.realign_num_jobs = args.realign_num_jobs - run_opts.command = args.command run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] - -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - self.realign_use_gpu = None - - -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - cache_read_opt, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
- - cache_write_opt = "" - if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. - cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ - "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, - momentum, max_param_change, shuffle_buffer_size, - cv_minibatch_size, run_opts): - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) - - # an option for writing cache (storing pairs of nnet-computations - # and computation-requests) during training. - cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - do_average = False # if we've just mixed up, don't do averaging but take the - # best. 
- cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - GetAverageNnetModel(dir = dir, iter = iter, - nnets_list = " ".join(nnets_list), - run_opts = run_opts, - shrink = shrinkage_value) - - else: - # choose the best model from different jobs - GetBestNnetModel(dir = dir, iter = iter, - best_model_index = best_model, - run_opts = run_opts, - shrink = shrinkage_value) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) - if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -547,7 +216,6 @@ def Train(args, run_opts): """.format(command = run_opts.command, dir = args.dir)) - default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") @@ -609,15 +277,6 @@ def Train(args, run_opts): num_archives_to_process, args.initial_effective_lrate, args.final_effective_lrate) - realign_iters = [] - if args.realign_times is not None: - realign_iters = GetRealignIters(args.realign_times, - num_iters, - args.num_jobs_initial, - args.num_jobs_final) - print(realign_iters) - # egs_dir will be updated if there is realignment - cur_egs_dir=egs_dir if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -635,22 +294,11 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if iter in realign_iters: - logger.info("Re-aligning the data at iteration {0}".format(iter)) - prev_egs_dir=cur_egs_dir - cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) - new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) - Realign(args.dir, iter, args.feat_dir, args.lang, - prev_egs_dir, cur_egs_dir, - args.prior_subset_size, num_archives, run_opts, - transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) - if args.cleanup and args.egs_dir is None: - RemoveEgs(prev_egs_dir) model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(dir = args.dir, + train_lib.TrainOneIteration(dir = args.dir, iter = iter, srand = args.srand, egs_dir = egs_dir, @@ -683,7 +331,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -710,14 +358,14 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, cur_egs_dir, + CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -732,7 +380,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e From 
5b17a4ca5c899fdf589ab08294ce43672e446219 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 5 Oct 2016 22:42:24 -0400 Subject: [PATCH 09/71] raw_python_script: Addressed comments and made changes --- egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 9 +- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 9 +- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 13 +- egs/wsj/s5/steps/nnet3/nnet3_train_lib.py | 130 +++++++++++------- .../s5/steps/nnet3/report/generate_plots.py | 27 ++-- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 8 ++ egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 39 +++--- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 44 +++--- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- 10 files changed, 165 insertions(+), 118 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py index 919af5f6aac..93b5041e87e 100644 --- a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -1,10 +1,13 @@ #!/usr/bin/env python - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. +# This is a module with methods which will be used by scripts for training of +# recurrent neural network acoustic model and raw model (i.e., generic neural +# network without transition model) with frame-level objectives. + import logging import imp @@ -266,8 +269,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cache_read_opt = "" if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - do_average = False # if we've just mixed up, don't do averaging but take the - # best. + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. cur_num_hidden_layers = 1 + iter / add_layers_period config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) if use_raw_nnet: diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index b39f9818a95..59b7aa28460 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -1,10 +1,13 @@ #!/usr/bin/env python - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. +# This is a module with methods which will be used by scripts for training of +# deep neural network acoustic model and raw model (i.e., generic neural +# network without transition model) with frame-level objectives. + import logging import math import imp @@ -268,8 +271,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cache_read_opt = "" if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - do_average = False # if we've just mixed up, don't do averaging but take the - # best. + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. 
cur_num_hidden_layers = 1 + iter / add_layers_period config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) if use_raw_nnet: diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index f564a21beca..996d64eef2e 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -51,8 +51,12 @@ def GetArgs(): parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add lda matrix", - choices=['true', 'false'], default = True) + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "This variable needs to be set to \"false\" when using dense-targets " + "or when --add-idct is set to \"true\".", + default=True, choices = ["false", "true"]) parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", choices=['true', 'false'], default = False) @@ -95,6 +99,8 @@ def GetArgs(): parser.add_argument("--lstm-delay", type=str, default=None, help="option to have different delays in recurrence for each lstm") + # Options to convert input MFCC into Fbank features. This is useful when a + # LDA layer is not added (such as when using dense targets) parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", help="The factor used for determining the liftering vector in the production of MFCC. " "User has to ensure that it matches the lifter used in MFCC generation, " @@ -256,6 +262,9 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) config_files[config_dir + '/init.config'] = init_config_lines + # add_lda needs to be set "false" when using dense targets, + # or if the task is not a simple classification task + # (e.g. 
regression, multi-task) if add_lda: prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index 01a76f30214..043667e7226 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -107,23 +107,31 @@ def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): accepted_models.append(i+1) if len(accepted_models) != num_models: - logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern)) + logger.warn("""Only {0}/{1} of the models have been accepted +for averaging, based on log files {2}.""".format(len(accepted_models), + num_models, log_file_pattern)) return [accepted_models, max_index+1] -def GetAverageNnetModel(dir, iter, nnets_list, run_opts, use_raw_nnet = False, shrink = None): +def GetAverageNnetModel(dir, iter, nnets_list, run_opts, + get_raw_nnet_from_am = True, shrink = None): scale = 1.0 if shrink is not None: scale = shrink new_iter = iter + 1 - if use_raw_nnet: + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ +{dir}/{iter}.mdl {dir}/{new_iter}.mdl""".format(dir = dir, iter = iter, + new_iter = new_iter, + scale = scale) + else: if shrink is not None: - out_model = "- \| nnet3-copy --scale={scale} - {dir}/{new_iter}.raw".format(dir = dir, new_iter = new_iter, scale = scale) + out_model = """- \| nnet3-copy --scale={scale} \ +- {dir}/{new_iter}.raw""".format(dir = dir, new_iter = new_iter, scale = scale) else: - out_model = "{dir}/{new_iter}.raw".format(dir = dir, new_iter = new_iter) - else: - out_model = "- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} {dir}/{iter}.mdl {dir}/{new_iter}.mdl".format(dir = dir, iter = iter, new_iter = new_iter, scale = scale) + out_model = "{dir}/{new_iter}.raw".format(dir = dir, + new_iter = new_iter) RunKaldiCommand(""" {command} {dir}/log/average.{iter}.log \ @@ -134,17 +142,24 @@ def GetAverageNnetModel(dir, iter, nnets_list, run_opts, use_raw_nnet = False, s nnets_list = nnets_list, out_model = out_model)) -def GetBestNnetModel(dir, iter, best_model_index, run_opts, use_raw_nnet = False, shrink = None): +def GetBestNnetModel(dir, iter, best_model_index, run_opts, + get_raw_nnet_from_am = True, shrink = None): scale = 1.0 if shrink is not None: scale = shrink - best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format(dir = dir, next_iter = iter + 1, best_model_index = best_model_index) + best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format( + dir = dir, + next_iter = iter + 1, + best_model_index = best_model_index) - if use_raw_nnet: - out_model = '{dir}/{next_iter}.raw'.format(dir = dir, next_iter = iter + 1) + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ +{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir = dir, iter = iter, + new_iter = iter + 1) else: - out_model = '- \| nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{next_iter}.mdl'.format(dir = dir, iter = iter, new_iter = iter + 1) + out_model = '{dir}/{next_iter}.raw'.format(dir = dir, + next_iter = iter + 1) RunKaldiCommand(""" {command} {dir}/log/select.{iter}.log \ @@ -260,7 +275,7 @@ def ParseModelConfigVarsFile(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) -def ParseModelConfigGenericVarsFile(var_file): +def 
ParseGenericConfigVarsFile(var_file): variables = {} try: var_file_handle = open(var_file, 'r') @@ -319,7 +334,13 @@ def GenerateEgs(data, alidir, egs_dir, egs_dir = egs_dir, egs_opts = egs_opts if egs_opts is not None else '' )) -def GenerateEgsFromTargets(data, targets_scp, egs_dir, +# This method generates egs directly from an scp file of targets, instead of +# getting them from the alignments (as with the method GenerateEgs). +# The targets are in matrix format for target_type="dense" and in posterior +# format for target_type="sparse". +# If using sparse targets, num_targets must be explicity specified. +# If using dense targets, num_targets is computed by reading the feature matrix dimension. +def GenerateEgsUsingTargets(data, targets_scp, egs_dir, left_context, right_context, valid_left_context, valid_right_context, run_opts, stage = 0, @@ -608,16 +629,17 @@ def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, use_raw_nnet = False): +def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, + get_raw_nnet_from_am = True): if iter == 0: return True try: - if use_raw_nnet: - output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) - else: + if get_raw_nnet_from_am: output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) + else: + output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) output = output.strip().split("\n") # eg. 
# component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] @@ -639,16 +661,16 @@ def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, use_raw return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False, use_raw_nnet = False, compute_accuracy = True): +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, + wait = False, get_raw_nnet_from_am = True, + compute_accuracy = True): - if use_raw_nnet: - model = "{dir}/{iter}.raw".format(dir = dir, iter = iter) - else: + if get_raw_nnet_from_am: model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter) + else: + model = "{dir}/{iter}.raw".format(dir = dir, iter = iter) - compute_prob_opts = "" - if compute_accuracy: - compute_prob_opts = "--compute-accuracy" + compute_prob_opts = "--compute-accuracy" if compute_accuracy else ""; RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ @@ -674,13 +696,14 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait compute_prob_opts = compute_prob_opts, egs_dir = egs_dir), wait = wait) -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, use_raw_nnet = False): - if use_raw_nnet: - prev_model = '{0}/{1}.raw'.format(dir, iter - 1) - model = '{0}/{1}.raw'.format(dir, iter) - else: +def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, + get_raw_nnet_from_am = True): + if get_raw_nnet_from_am: prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter - 1) model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter) + else: + prev_model = '{0}/{1}.raw'.format(dir, iter - 1) + model = '{0}/{1}.raw'.format(dir, iter) RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ @@ -697,7 +720,7 @@ def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, use_r def CombineModels(dir, num_iters, num_iters_combine, egs_dir, run_opts, chunk_width = None, - use_raw_nnet = False, compute_accuracy = True): + get_raw_nnet_from_am = True, compute_accuracy = True): # Now do combination. 
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -705,16 +728,16 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, raw_model_strings = [] print num_iters_combine for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - if use_raw_nnet: - model_file = '{0}/{1}.raw'.format(dir, iter) + if get_raw_nnet_from_am: + model_file = '{0}/{1}.mdl'.format(dir, iter) if not os.path.exists(model_file): raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append(model_file) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) else: - model_file = '{0}/{1}.mdl'.format(dir, iter) + model_file = '{0}/{1}.raw'.format(dir, iter) if not os.path.exists(model_file): raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + raw_model_strings.append(model_file) if chunk_width is not None: # this is an RNN model @@ -722,10 +745,10 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 - if use_raw_nnet: - out_model = '{dir}/final.raw'.format(dir = dir) - else: + if get_raw_nnet_from_am: out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) + else: + out_model = '{dir}/final.raw'.format(dir = dir) RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ @@ -743,13 +766,16 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - if use_raw_nnet: - ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, wait = False, use_raw_nnet = True, compute_accuracy = compute_accuracy) - else: + if get_raw_nnet_from_am: ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + else: + ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, + wait = False, get_raw_nnet_from_am = False, + compute_accuracy = compute_accuracy) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts, use_raw_nnet = False): + prior_subset_size, run_opts, + get_raw_nnet_from_am = True): # Note: this just uses CPUs, using a smallish subset of data. 
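(Editor's aside, not part of the patch: the average posterior computed by ComputeAveragePosterior is later used, as the training scripts log, "as priors to convert posteriors into likelihoods". A minimal NumPy sketch of that conversion is given below, assuming a matrix of per-frame posteriors and the averaged prior vector produced here; the function name and flooring value are illustrative only.)

import numpy as np

def posteriors_to_pseudo_likelihoods(posteriors, priors, floor=1e-20):
    # posteriors: (num_frames, num_targets) network outputs, rows summing to 1.
    # priors:     (num_targets,) average posterior over the prior subset.
    # Subtracting the log prior turns p(target | frame) into something
    # proportional to p(frame | target), which is what decoding needs.
    return np.log(np.maximum(posteriors, floor)) - np.log(np.maximum(priors, floor))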
""" Computes the average posterior of the network""" import glob @@ -761,10 +787,10 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' - if use_raw_nnet: - model = "{dir}/final.raw".format(dir = dir) - else: + if get_raw_nnet_from_am: model = "nnet3-am-copy --raw=true {dir}/combined.mdl -|".format(dir = dir) + else: + model = "{dir}/final.raw".format(dir = dir) RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ @@ -809,7 +835,7 @@ def RemoveEgs(egs_dir): def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, preserve_model_interval = 100, remove_egs = True, - use_raw_nnet = False): + get_raw_nnet_from_am = True): try: if remove_egs: RemoveEgs(egs_dir) @@ -817,22 +843,22 @@ def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, for iter in range(num_iters): RemoveModel(nnet_dir, iter, num_iters, 1, preserve_model_interval, - use_raw_nnet = use_raw_nnet) + get_raw_nnet_from_am = get_raw_nnet_from_am) except (IOError, OSError) as err: logger.warning("Error while cleaning up the nnet directory") raise err def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, preserve_model_interval = 100, - use_raw_nnet = False): + get_raw_nnet_from_am = True): if iter % preserve_model_interval == 0: return if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : return - if use_raw_nnet: - file_name = '{0}/{1}.raw'.format(nnet_dir, iter) - else: + if get_raw_nnet_from_am: file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + else: + file_name = '{0}/{1}.raw'.format(nnet_dir, iter) if os.path.isfile(file_name): os.remove(file_name) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index 5f671f1137a..f1c489f4ca0 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -47,8 +47,7 @@ def GetArgs(): """) parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables") parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) - parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Iteration from which plotting will start") - parser.add_argument("--is-linear-objf", type=str, default = True, action = train_lib.StrToBoolAction, help="Nnet trained with linear objective as against with quadratic objective") + parser.add_argument("--objective-type", type=str, default="linear", choices=["linear","quadratic","chain"], help="Objective function used during training -- determines which plots are to be plotted."); parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") parser.add_argument("output_dir", help="experiment directory, e.g. 
exp/nnet3/tdnn/report") @@ -423,7 +422,7 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, if latex_report is not None: latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) -def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False, is_linear_objf = True): +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, objective_type = "linear"): try: os.makedirs(output_dir) except OSError as e: @@ -436,19 +435,18 @@ def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is else: latex_report = None - if is_chain: + if objective_type == "chain": logger.info("Generating log-probability plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - else: - if is_linear_objf: - logger.info("Generating accuracy plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + elif objective_type == "linear": + logger.info("Generating accuracy plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - logger.info("Generating log-likelihood plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - else: - logger.info("Generating MSE plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'objective', file_basename = 'objective', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + logger.info("Generating log-likelihood plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating " + objective_type + " objective plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'objective', file_basename = 'objective', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) logger.info("Generating non-linearity stats plots") GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) @@ -470,8 +468,7 @@ def Main(): GeneratePlots(args.exp_dir, args.output_dir, comparison_dir = args.comparison_dir, start_iter = args.start_iter, - is_chain = args.is_chain, - is_linear_objf = args.is_linear_objf) + objective_type = args.objective_type) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index beb02893f33..cb5b8149526 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -65,6 +65,8 @@ def GetArgs(): help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. 
" + "This variable needs to be set to \"false\" when using dense-targets " + "or when --add-idct is set to \"true\".", "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) @@ -113,10 +115,13 @@ def GetArgs(): help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) + # Options to convert input MFCC into Fbank features. This is useful when a + # LDA layer is not added (such as when using dense targets) parser.add_argument(["--cepstral-lifter","--cnn.cepstral-lifter"], type=float, dest = "cepstral_lifter", help="The factor used for determining the liftering vector in the production of MFCC. " "User has to ensure that it matches the lifter used in MFCC generation, " "e.g. 22.0", default=22.0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, help="Add an IDCT after input to convert MFCC to Fbank", default = False) parser.add_argument("config_dir", @@ -377,6 +382,9 @@ def MakeConfigs(config_dir, splice_indexes_string, prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) + # add_lda needs to be set "false" when using dense targets, + # or if the task is not a simple classification task + # (e.g. regression, multi-task) if add_lda: prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index ac158cc19dc..4ad91d354f8 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -137,7 +137,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseModelConfigGenericVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 7c6c852ede4..fc0b2f6200d 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -128,7 +128,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseModelConfigGenericVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) # Set some variables. 
@@ -173,19 +173,19 @@ def Train(args, run_opts): if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") - GenerateEgsFromTargets(args.feat_dir, args.targets_scp, default_egs_dir, - left_context, right_context, - left_context, right_context, run_opts, - frames_per_eg = args.frames_per_eg, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage, - target_type = target_type, - num_targets = num_targets) + GenerateEgsUsingTargets(args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) if args.egs_dir is None: egs_dir = default_egs_dir @@ -265,11 +265,11 @@ def Train(args, run_opts): shuffle_buffer_size = args.shuffle_buffer_size, run_opts = run_opts, compute_accuracy = compute_accuracy, - use_raw_nnet = True) + get_raw_nnet_from_am = False) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, use_raw_nnet = True) + args.preserve_model_interval, get_raw_nnet_from_am = False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -284,12 +284,13 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, use_raw_nnet = True, compute_accuracy = compute_accuracy) + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, + get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, use_raw_nnet = True) + num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am = False) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -302,7 +303,7 @@ def Train(args, run_opts): CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs, - use_raw_nnet = True) + get_raw_nnet_from_am = False) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 92334ad3295..25f1e40ca1a 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -31,7 +31,7 @@ def GetArgs(): # we add compulsary arguments as named arguments for readability parser = argparse.ArgumentParser(description=""" - Trains an RNN acoustic model using the cross-entropy objective. + Trains an RNN neural network using the cross-entropy objective. RNNs include LSTMs, BLSTMs and GRUs. 
RNN acoustic model training differs from feed-forward DNN training in the following ways @@ -179,7 +179,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseModelConfigGenericVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) # Set some variables. @@ -228,20 +228,20 @@ def Train(args, run_opts): if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") - GenerateEgsFromTargets(args.feat_dir, args.targets_scp, default_egs_dir, - left_context, right_context, - args.chunk_width + left_context, - args.chunk_width + right_context, run_opts, - frames_per_eg = args.chunk_width, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage, - target_type = target_type, - num_targets = num_targets) + GenerateEgsUsingTargets(args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) if args.egs_dir is None: egs_dir = default_egs_dir @@ -305,7 +305,7 @@ def Train(args, run_opts): if args.stage <= iter: model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) - shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, use_raw_nnet = True) else 1 + shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, get_raw_nnet_from_am = False) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) train_lib.TrainOneIteration(dir = args.dir, @@ -329,11 +329,11 @@ def Train(args, run_opts): cv_minibatch_size = args.cv_minibatch_size, run_opts = run_opts, compute_accuracy = compute_accuracy, - use_raw_nnet = True) + get_raw_nnet_from_am = False) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, use_raw_nnet = True) + args.preserve_model_interval, get_raw_nnet_from_am = False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -349,12 +349,12 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - chunk_width = args.chunk_width, use_raw_nnet = True, compute_accuracy = compute_accuracy) + chunk_width = args.chunk_width, get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, use_raw_nnet = True) + num_archives, args.prior_subset_size, run_opts, 
get_raw_nnet_from_am = False) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -367,7 +367,7 @@ def Train(args, run_opts): CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs, - use_raw_nnet = True) + get_raw_nnet_from_am = False) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 826c5c084f6..104817e3df6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -188,7 +188,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseModelConfigGenericVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) # Set some variables. From f73183f5ee3be656b2b6a31d1450f0232b5ecdc4 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 5 Oct 2016 23:17:09 -0400 Subject: [PATCH 10/71] raw_python_script: Missed variable renames --- egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 28 ++++++++++---------- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 28 ++++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py index 93b5041e87e..d169dce99a7 100644 --- a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -239,7 +239,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context, right_context, min_deriv_time, momentum, max_param_change, shuffle_buffer_size, cv_minibatch_size, run_opts, - compute_accuracy = True, use_raw_nnet = False): + compute_accuracy = True, get_raw_nnet_from_am = True): # Set off jobs doing some diagnostics, in the background. @@ -259,10 +259,10 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = use_raw_nnet, compute_accuracy = compute_accuracy) + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am, compute_accuracy = compute_accuracy) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, use_raw_nnet = use_raw_nnet) + ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. @@ -273,20 +273,20 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # averaging but take the best. 
cur_num_hidden_layers = 1 + iter / add_layers_period config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - if use_raw_nnet: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: + if get_raw_nnet_from_am: raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) else: do_average = True if iter == 0: do_average = False # on iteration 0, pick the best, don't average. else: cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - if use_raw_nnet: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) - else: + if get_raw_nnet_from_am: raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) if do_average: cur_num_chunk_per_minibatch = num_chunk_per_minibatch @@ -319,7 +319,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, GetAverageNnetModel(dir = dir, iter = iter, nnets_list = " ".join(nnets_list), run_opts = run_opts, - use_raw_nnet = use_raw_nnet, + get_raw_nnet_from_am = get_raw_nnet_from_am, shrink = shrinkage_value) else: @@ -327,7 +327,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, GetBestNnetModel(dir = dir, iter = iter, best_model_index = best_model, run_opts = run_opts, - use_raw_nnet = use_raw_nnet, + get_raw_nnet_from_am = get_raw_nnet_from_am, shrink = shrinkage_value) try: @@ -336,10 +336,10 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: raise Exception("Error while trying to delete the raw models") - if use_raw_nnet: - new_model = "{0}/{1}.raw".format(dir, iter + 1) - else: + if get_raw_nnet_from_am: new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) if not os.path.isfile(new_model): raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index 59b7aa28460..8d92fec0ca8 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -241,7 +241,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context, right_context, momentum, max_param_change, shuffle_buffer_size, run_opts, - compute_accuracy = True, use_raw_nnet = False): + compute_accuracy = True, get_raw_nnet_from_am = True): # Set off jobs doing some diagnostics, in the background. 
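The branches flipped in the hunks above encode the only real difference between the "raw" and acoustic-model code paths: with get_raw_nnet_from_am the raw nnet is extracted from {iter}.mdl via nnet3-am-copy --raw=true, otherwise {iter}.raw is read directly with nnet3-copy. A small helper capturing that choice (illustrative only; the scripts keep these strings inline):

    def GetRawModelString(dir, iter, learning_rate, get_raw_nnet_from_am=True):
        # Returns a piped rxfilename that yields this iteration's raw nnet3
        # model with the learning rate already set.
        if get_raw_nnet_from_am:
            return ("nnet3-am-copy --raw=true --learning-rate={lr} "
                    "{dir}/{iter}.mdl - |".format(lr=learning_rate, dir=dir, iter=iter))
        return ("nnet3-copy --learning-rate={lr} "
                "{dir}/{iter}.raw - |".format(lr=learning_rate, dir=dir, iter=iter))

The same convention shows up at the end of the iteration: the combined model is written to {iter+1}.mdl in the acoustic-model case and to {iter+1}.raw otherwise.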
@@ -261,10 +261,10 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, use_raw_nnet = use_raw_nnet, compute_accuracy = compute_accuracy) + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, get_raw_nnet_from_am = get_raw_nnet_from_am, compute_accuracy = compute_accuracy) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, use_raw_nnet = use_raw_nnet) + ComputeProgress(dir, iter, egs_dir, run_opts, get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. @@ -275,20 +275,20 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # averaging but take the best. cur_num_hidden_layers = 1 + iter / add_layers_period config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - if use_raw_nnet: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: + if get_raw_nnet_from_am: raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) else: do_average = True if iter == 0: do_average = False # on iteration 0, pick the best, don't average. else: cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - if use_raw_nnet: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) - else: + if get_raw_nnet_from_am: raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) if do_average: cur_minibatch_size = minibatch_size @@ -323,13 +323,13 @@ def TrainOneIteration(dir, iter, srand, egs_dir, GetAverageNnetModel(dir = dir, iter = iter, nnets_list = " ".join(nnets_list), run_opts = run_opts, - use_raw_nnet = use_raw_nnet) + get_raw_nnet_from_am = get_raw_nnet_from_am) else: # choose the best model from different jobs GetBestNnetModel(dir = dir, iter = iter, best_model_index = best_model, run_opts = run_opts, - use_raw_nnet = use_raw_nnet) + get_raw_nnet_from_am = get_raw_nnet_from_am) try: for i in range(1, num_jobs + 1): @@ -337,10 +337,10 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: raise Exception("Error while trying to delete the raw models") - if use_raw_nnet: - new_model = "{0}/{1}.raw".format(dir, iter + 1) - else: + if get_raw_nnet_from_am: new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) if not os.path.isfile(new_model): raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) From 167d90998bf5220d60b255e199e59d5011a8d183 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sat, 8 Oct 2016 18:37:56 -0400 Subject: [PATCH 11/71] raw_python_script: Changing module imports --- egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 184 +++---------------- egs/wsj/s5/steps/nnet3/libs/train_lib.py 
| 35 ++-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 49 ++--- egs/wsj/s5/steps/nnet3/train_rnn.py | 44 ++--- 4 files changed, 91 insertions(+), 221 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py index d169dce99a7..4dd3f59e295 100644 --- a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -11,9 +11,7 @@ import logging import imp -imp.load_source('nnet3_train_lib', 'steps/nnet3/nnet3_train_lib.py') -import nnet3_train_lib -from nnet3_train_lib import * +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -23,151 +21,6 @@ handler.setFormatter(formatter) logger.addHandler(handler) -def AddCommonTrainArgs(parser): - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. 
It's not a huge deal though, as samples - are anyway randomized right at the start. - (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. - Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") - # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. 
queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") - -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None # this is the main method which differs between RNN and DNN training def TrainNewModels(dir, iter, srand, num_jobs, @@ -197,7 +50,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, # computation-requests) during training. cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - process_handle = RunKaldiCommand(""" + process_handle = nnet3_train_lib.RunKaldiCommand(""" {command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ @@ -259,10 +112,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am, compute_accuracy = compute_accuracy) + nnet3_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am, + compute_accuracy = compute_accuracy) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) + nnet3_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. 
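The wildcard import removed at the top of this library is replaced everywhere by an explicitly named module object, so each call site now says which library a helper comes from. A minimal sketch of the pattern (it assumes the script is run from an egs/*/s5 directory, which is why the relative path resolves; imp is the Python 2 loader these scripts already use):

    import imp
    import argparse

    # Load the shared helpers by path and bind them to an explicit name;
    # helpers are then reached as nnet3_train_lib.<name> rather than via
    # whatever a wildcard import happened to pull in.
    nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py')

    parser = argparse.ArgumentParser()
    parser.add_argument("--use-gpu", type=str,
                        action=nnet3_train_lib.StrToBoolAction,
                        choices=["true", "false"], default=True,
                        help="Use GPU for training")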
@@ -309,26 +167,28 @@ def TrainOneIteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, cur_num_chunk_per_minibatch, cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. - GetAverageNnetModel(dir = dir, iter = iter, - nnets_list = " ".join(nnets_list), - run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am, - shrink = shrinkage_value) + nnet3_train_lib.GetAverageNnetModel( + dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) else: # choose the best model from different jobs - GetBestNnetModel(dir = dir, iter = iter, - best_model_index = best_model, - run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am, - shrink = shrinkage_value) + nnet3_train_lib.GetBestNnetModel( + dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index 8d92fec0ca8..ce8293577b4 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -12,9 +12,7 @@ import math import imp -imp.load_source('nnet3_train_lib', 'steps/nnet3/nnet3_train_lib.py') -import nnet3_train_lib -from nnet3_train_lib import * +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -199,7 +197,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, # computation-requests) during training. cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - process_handle = RunKaldiCommand(""" + process_handle = nnet3_train_lib.RunKaldiCommand(""" {command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ @@ -261,10 +259,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, get_raw_nnet_from_am = get_raw_nnet_from_am, compute_accuracy = compute_accuracy) + nnet3_train_lib.ComputeTrainCvProbabilities( + dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + compute_accuracy = compute_accuracy) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, get_raw_nnet_from_am = get_raw_nnet_from_am) + nnet3_train_lib.ComputeProgress( + dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. 
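To keep the control flow of the hunk above in view: each parallel job writes {dir}/{iter+1}.{job}.raw, GetSuccessfulModels picks out from the per-job training logs which of those are worth keeping (and which single one did best), and the result is either parameter-averaged or, on iteration 0 and on iterations where a layer was just added, replaced by the best single job. Condensed into one hypothetical helper (the wrapper name is made up; the calls mirror the ones above):

    import imp
    nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py')

    def SelectOrAverageModels(dir, iter, num_jobs, do_average, shrinkage_value,
                              get_raw_nnet_from_am, run_opts):
        # Find the jobs whose training runs succeeded and look acceptable.
        [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(
            num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter))
        nnets_list = ["{0}/{1}.{2}.raw".format(dir, iter + 1, n)
                      for n in models_to_average]
        if do_average:
            # Parameter-average the surviving jobs' models.
            nnet3_train_lib.GetAverageNnetModel(
                dir=dir, iter=iter, nnets_list=" ".join(nnets_list),
                run_opts=run_opts,
                get_raw_nnet_from_am=get_raw_nnet_from_am,
                shrink=shrinkage_value)
        else:
            # Keep the best single job instead (iteration 0, or when a new
            # layer was just added).
            nnet3_train_lib.GetBestNnetModel(
                dir=dir, iter=iter, best_model_index=best_model,
                run_opts=run_opts,
                get_raw_nnet_from_am=get_raw_nnet_from_am,
                shrink=shrinkage_value)

In the RNN library the shrink value comes from DoShrinkage, which is meant to scale the parameters down when the SigmoidComponent statistics cross the configured threshold; the plain DNN library calls the same two helpers without a shrink argument.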
@@ -313,23 +316,25 @@ def TrainOneIteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, cur_minibatch_size, cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. - GetAverageNnetModel(dir = dir, iter = iter, - nnets_list = " ".join(nnets_list), - run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am) + nnet3_train_lib.GetAverageNnetModel( + dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) else: # choose the best model from different jobs - GetBestNnetModel(dir = dir, iter = iter, - best_model_index = best_model, - run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am) + nnet3_train_lib.GetBestNnetModel( + dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 25f1e40ca1a..5842e63474e 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -16,7 +16,8 @@ from nnet3_train_lib import * nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') +rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -308,28 +309,30 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, get_raw_nnet_from_am = False) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - train_lib.TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts, - compute_accuracy = compute_accuracy, - get_raw_nnet_from_am = False) + rnn_train_lib.TrainOneIteration( + dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, 
+ num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = args.cv_minibatch_size, + run_opts = run_opts, + compute_accuracy = compute_accuracy, + get_raw_nnet_from_am = False) + if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 104817e3df6..a5679800db6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -19,7 +19,8 @@ from nnet3_train_lib import * nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') +rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -298,26 +299,27 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - train_lib.TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts) + rnn_train_lib.TrainOneIteration( + dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = args.cv_minibatch_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions From 8d86b6b271657448f2c2da1e88e064acf4f5d4df Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 18 Oct 2016 15:26:56 -0400 Subject: [PATCH 12/71] raw_python_script: Making changes based on comments --- egs/wsj/s5/steps/nnet3/components.py | 15 +---- egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 8 ++- 
egs/wsj/s5/steps/nnet3/libs/train_lib.py | 8 ++- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 48 ++++++-------- egs/wsj/s5/steps/nnet3/nnet3_train_lib.py | 10 +-- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 66 +++++++++++--------- 6 files changed, 71 insertions(+), 84 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 0b85012e7d0..9b9ce4a54ad 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -26,16 +26,12 @@ def GetSumDescriptor(inputs): return sum_descriptors # adds the input nodes and returns the descriptor -def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0, idct_mat = None): +def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): components = config_lines['components'] component_nodes = config_lines['component-nodes'] output_dim = 0 components.append('input-node name=input dim=' + str(feat_dim)) - prev_layer_output = {'descriptor': "input", - 'dimension': feat_dim} - if idct_mat is not None: - prev_layer_output = AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, idct_mat) - list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] output_dim += len(splice_indexes) * feat_dim if ivector_dim > 0: components.append('input-node name=ivector dim=' + str(ivector_dim)) @@ -162,11 +158,6 @@ def AddConvolutionLayer(config_lines, name, input, else: conv_init_string += " num-filters={0}".format(num_filters) - if param_stddev is not None: - conv_init_string += " param-stddev={0}".format(param_stddev) - if bias_stddev is not None: - conv_init_string += " bias-stddev={0}".format(bias_stddev) - components.append(conv_init_string) component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) @@ -457,4 +448,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py index 4dd3f59e295..cc885e4bc12 100644 --- a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -92,7 +92,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context, right_context, min_deriv_time, momentum, max_param_change, shuffle_buffer_size, cv_minibatch_size, run_opts, - compute_accuracy = True, get_raw_nnet_from_am = True): + get_raw_nnet_from_am = True): # Set off jobs doing some diagnostics, in the background. 
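The "diagnostics in the background" mentioned in the context line above are the train/validation objective computations and the progress report; this patch removes the compute_accuracy plumbing from them, but the calls keep the same shape. Roughly (a sketch; the wrapper function is made up, the two library calls are the ones used in these hunks):

    import imp
    nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py')

    def LaunchIterationDiagnostics(dir, iter, egs_dir, cv_minibatch_size,
                                   run_opts, get_raw_nnet_from_am=False):
        # Both helpers return quickly; the jobs run in the background and
        # write to logs such as {dir}/log/compute_prob_valid.{iter}.log,
        # which is where the accuracy / log-likelihood / objective curves
        # plotted by report/generate_plots.py come from.
        nnet3_train_lib.ComputeTrainCvProbabilities(
            dir, iter, egs_dir, run_opts,
            mb_size=cv_minibatch_size,
            get_raw_nnet_from_am=get_raw_nnet_from_am)
        if iter > 0:
            # Progress diagnostics only make sense once there is a previous
            # iteration to compare against.
            nnet3_train_lib.ComputeProgress(
                dir, iter, egs_dir, run_opts,
                mb_size=cv_minibatch_size,
                get_raw_nnet_from_am=get_raw_nnet_from_am)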
@@ -112,12 +112,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() + # Sets off some background jobs to compute train and + # validation set objectives nnet3_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, - get_raw_nnet_from_am = get_raw_nnet_from_am, - compute_accuracy = compute_accuracy) + get_raw_nnet_from_am = get_raw_nnet_from_am) if iter > 0: + # Runs in the background nnet3_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index ce8293577b4..4dc98acfee5 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -239,7 +239,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context, right_context, momentum, max_param_change, shuffle_buffer_size, run_opts, - compute_accuracy = True, get_raw_nnet_from_am = True): + get_raw_nnet_from_am = True): # Set off jobs doing some diagnostics, in the background. @@ -259,12 +259,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() + # Sets off some background jobs to compute train and + # validation set objectives nnet3_train_lib.ComputeTrainCvProbabilities( dir, iter, egs_dir, run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am, - compute_accuracy = compute_accuracy) + get_raw_nnet_from_am = get_raw_nnet_from_am) if iter > 0: + # Runs in the background nnet3_train_lib.ComputeProgress( dir, iter, egs_dir, run_opts, get_raw_nnet_from_am = get_raw_nnet_from_am) diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 996d64eef2e..260def1d03f 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -54,8 +54,7 @@ def GetArgs(): help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. " - "This variable needs to be set to \"false\" when using dense-targets " - "or when --add-idct is set to \"true\".", + "This variable needs to be set to \"false\" when using dense-targets.", default=True, choices = ["false", "true"]) parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", @@ -99,16 +98,6 @@ def GetArgs(): parser.add_argument("--lstm-delay", type=str, default=None, help="option to have different delays in recurrence for each lstm") - # Options to convert input MFCC into Fbank features. This is useful when a - # LDA layer is not added (such as when using dense targets) - parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 
22.0", default=22.0) - parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, - help="Add an IDCT after input to convert MFCC to Fbank", - default = False, choices = ["true", "false"]) - parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -138,9 +127,6 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") - if args.add_lda and args.add_idct: - raise Exception("add-idct can be true only if add-lda is false") - if not args.num_targets > 0: print(args.num_targets) raise Exception("num_targets has to be positive") @@ -224,7 +210,9 @@ def ParseLstmDelayString(lstm_delay): raise ValueError("invalid --lstm-delay argument, too-short element: " + lstm_delay) elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: - raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. There should be a negative delay for the forward, and a postive delay for the backward.') + raise ValueError('Warning: ' + str(indexes) + + ' is not a standard BLSTM mode. ' + + 'There should be a negative delay for the forward, and a postive delay for the backward.') if len(indexes) == 2 and indexes[0] > 0: # always a negative delay followed by a postive delay indexes[0], indexes[1] = indexes[1], indexes[0] lstm_delay_array.append(indexes) @@ -235,7 +223,6 @@ def ParseLstmDelayString(lstm_delay): def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, - add_idct, cepstral_lifter, splice_indexes, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, @@ -247,13 +234,9 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, config_lines = {'components':[], 'component-nodes':[]} - if add_idct: - nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") - config_files={} prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], - ivector_dim, - idct_mat = config_dir.strip() + "/idct.mat" if add_idct else None) + ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -275,17 +258,23 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + lstm_delay = lstm_delay[i], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = self_repair_scale_clipgradient) else: # add a uni-directional LSTM layer prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) + lstm_delay = lstm_delay[i][0], + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, + self_repair_scale_clipgradient = self_repair_scale_clipgradient) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, 
num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, + label_delay = label_delay, include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: @@ -302,7 +291,9 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, + label_delay = label_delay, include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, @@ -335,7 +326,9 @@ def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layer def Main(): args = GetArgs() - [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + [left_context, right_context, + num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, + args.label_delay, args.num_lstm_layers) # write the files used by other scripts like steps/nnet3/get_egs.sh f = open(args.config_dir + "/vars", "w") @@ -352,7 +345,6 @@ def Main(): feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, add_lda = args.add_lda, - add_idct = args.add_idct, cepstral_lifter = args.cepstral_lifter, splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = args.hidden_dim, diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index 043667e7226..30dcd0688ab 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -662,16 +662,13 @@ def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, return False def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, - wait = False, get_raw_nnet_from_am = True, - compute_accuracy = True): + wait = False, get_raw_nnet_from_am = True): if get_raw_nnet_from_am: model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter) else: model = "{dir}/{iter}.raw".format(dir = dir, iter = iter) - compute_prob_opts = "--compute-accuracy" if compute_accuracy else ""; - RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob {compute_prob_opts} "{model}" \ @@ -720,7 +717,7 @@ def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, def CombineModels(dir, num_iters, num_iters_combine, egs_dir, run_opts, chunk_width = None, - get_raw_nnet_from_am = True, compute_accuracy = True): + get_raw_nnet_from_am = True): # Now do combination. 
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -770,8 +767,7 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) else: ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, - wait = False, get_raw_nnet_from_am = False, - compute_accuracy = compute_accuracy) + wait = False, get_raw_nnet_from_am = False) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, prior_subset_size, run_opts, diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index cb5b8149526..71a73d65067 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -54,6 +54,10 @@ def GetArgs(): help="Output dimension of the linear layer at the CNN output " "for dimension reduction, e.g. 256." "The default zero means this layer is not needed.", default=0) + parser.add_argument("--cnn.cepstral-lifter", type=float, + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) # General neural network options parser.add_argument("--splice-indexes", type=str, required = True, @@ -65,8 +69,7 @@ def GetArgs(): help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. " - "This variable needs to be set to \"false\" when using dense-targets " - "or when --add-idct is set to \"true\".", + "This variable needs to be set to \"false\" when using dense-targets. " "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) @@ -103,12 +106,17 @@ def GetArgs(): relu_dim_group.add_argument("--relu-dim", type=int, help="dimension of all ReLU nonlinearity layers") relu_dim_group.add_argument("--relu-dim-final", type=int, - help="dimension of the last ReLU nonlinearity layer. Dimensions increase geometrically from the first through the last ReLU layer.", default=None) + help="dimension of the last ReLU nonlinearity layer. " + "Dimensions increase geometrically from the first through the last ReLU layer.", + default=None) parser.add_argument("--relu-dim-init", type=int, - help="dimension of the first ReLU nonlinearity layer. Dimensions increase geometrically from the first through the last ReLU layer.", default=None) + help="dimension of the first ReLU nonlinearity layer. " + "Dimensions increase geometrically from the first through the last ReLU layer.", + default=None) parser.add_argument("--self-repair-scale-nonlinearity", type=float, - help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + help="A non-zero value activates the self-repair mechanism in the " + "sigmoid and tanh non-linearities of the LSTM", default=None) parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, @@ -117,13 +125,7 @@ def GetArgs(): # Options to convert input MFCC into Fbank features. 
This is useful when a # LDA layer is not added (such as when using dense targets) - parser.add_argument(["--cepstral-lifter","--cnn.cepstral-lifter"], type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 22.0", default=22.0) - parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, - help="Add an IDCT after input to convert MFCC to Fbank", default = False) parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -153,9 +155,6 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") - if args.add_lda and args.add_idct: - raise Exception("add-idct can be true only if add-lda is false") - if not args.num_targets > 0: print(args.num_targets) raise Exception("num_targets has to be positive") @@ -237,7 +236,8 @@ def AddConvMaxpLayer(config_lines, name, input, args): # The ivectors are processed through an affine layer parallel to the CNN layers, # then concatenated with the CNN output and passed to the deeper part of the network. -def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): +def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, + config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): cnn_args = ParseCnnString(cnn_layer) num_cnn_layers = len(cnn_args) # We use an Idct layer here to convert MFCC to FBANK features @@ -246,7 +246,8 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, c 'dimension': feat_dim} prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') - list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) + if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] splice_descriptor = "Append({0})".format(", ".join(list)) cnn_input_dim = len(splice_indexes) * feat_dim prev_layer_output = {'descriptor': splice_descriptor, @@ -258,13 +259,15 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, c prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) if cnn_bottleneck_dim > 0: - prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", + prev_layer_output, cnn_bottleneck_dim, "") if ivector_dim > 0: iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', 'dimension': ivector_dim} iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") - prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], + iv_layer_output['descriptor']) prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] return prev_layer_output @@ -334,7 +337,7 @@ def ParseSpliceString(splice_indexes): # The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. 
def MakeConfigs(config_dir, splice_indexes_string, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, - feat_dim, ivector_dim, num_targets, add_lda, add_idct, + feat_dim, ivector_dim, num_targets, add_lda, nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim, nonlin_output_dim_init, nonlin_output_dim_final, use_presoftmax_prior_scale, @@ -356,20 +359,17 @@ def MakeConfigs(config_dir, splice_indexes_string, if xent_separate_forward_affine: if splice_indexes[-1] != [0]: - raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + raise Exception("--xent-separate-forward-affine option is supported only if the " + + "last-hidden layer has no splicing before it. " + + "Please use a splice-indexes with just 0 as the final splicing config.") prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) config_lines = {'components':[], 'component-nodes':[]} - if add_idct and cnn_layer is None: - # If CNN layer is not None, IDCT will be add inside AddCnnLayers method - nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") - config_files={} prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], - ivector_dim, - idct_mat = config_dir.strip() + "/idct.mat" if (add_idct and cnn_layer is None) else None) + ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -379,14 +379,16 @@ def MakeConfigs(config_dir, splice_indexes_string, config_files[config_dir + '/init.config'] = init_config_lines if cnn_layer is not None: - prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, + cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) # add_lda needs to be set "false" when using dense targets, # or if the task is not a simple classification task # (e.g. regression, multi-task) if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", + prev_layer_output, config_dir + '/lda.mat') left_context = 0 right_context = 0 @@ -400,9 +402,11 @@ def MakeConfigs(config_dir, splice_indexes_string, raise Exception("num-hidden-layers has to be greater than 1 if relu-dim-init and relu-dim-final is different.") else: # computes relu-dim for each hidden layer. 
They increase geometrically across layers - factor = pow(float(nonlin_output_dim_final) / nonlin_output_dim_init, 1.0 / (num_hidden_layers - 1)) if num_hidden_layers > 1 else 1 + factor = pow(float(nonlin_output_dim_final) / nonlin_output_dim_init, + 1.0 / (num_hidden_layers - 1)) if num_hidden_layers > 1 else 1 nonlin_output_dims = [int(round(nonlin_output_dim_init * pow(factor, i))) for i in range(0, num_hidden_layers)] - assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error + assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and + nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error nonlin_output_dims[-1] = nonlin_output_dim_final # It ensures that the dim of the last hidden layer is exactly the same as what is specified for i in range(0, num_hidden_layers): @@ -541,7 +545,7 @@ def Main(): splice_indexes_string = args.splice_indexes, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - add_lda = args.add_lda, add_idct = args.add_idct, + add_lda = args.add_lda, cnn_layer = args.cnn_layer, cnn_bottleneck_dim = args.cnn_bottleneck_dim, cepstral_lifter = args.cepstral_lifter, From 26b50938283829340a7cdbf8f31a641c3eb338cb Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 24 Oct 2016 18:39:53 -0400 Subject: [PATCH 13/71] raw_python_script: Initial moving of libraries --- egs/wsj/s5/steps/nnet3/chain/train.py | 85 ++--- .../chain_train_lib.py} | 36 +- .../common_train_lib.py} | 271 +-------------- egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 14 +- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 323 +++++++++++++++--- egs/wsj/s5/steps/nnet3/train_dnn.py | 104 +++--- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 117 ++++--- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 123 ++++--- egs/wsj/s5/steps/nnet3/train_rnn.py | 109 +++--- 9 files changed, 609 insertions(+), 573 deletions(-) rename egs/wsj/s5/steps/nnet3/{chain/nnet3_chain_lib.py => libs/chain_train_lib.py} (91%) rename egs/wsj/s5/steps/nnet3/{nnet3_train_lib.py => libs/common_train_lib.py} (70%) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cd9ebf4c7a3..7d2ca029c05 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -18,8 +18,8 @@ import shutil import math -train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') -chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +common_train_lib = imp.load_source('ctl', 'steps/nnet3/libs/common_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/libs/chain_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') logger = logging.getLogger(__name__) @@ -41,10 +41,10 @@ def GetArgs(): # feat options parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = train_lib.NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="directory with the ivectors extracted in an online fashion.") parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = train_lib.NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="A string specifying '--norm-means' and '--norm-vars' values") # egs extraction options @@ -65,21 +65,21 @@ def GetArgs(): " in the estimation of bidirectional RNN state before" " prediction of the 
first label.") parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = train_lib.NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="String to provide options directly to steps/nnet3/get_egs.sh script") parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = train_lib.NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="Directory with egs. If specified this directory " "will be used rather than extracting egs") parser.add_argument("--egs.stage", type=int, dest='egs_stage', default = -6, help="Stage at which get_egs.sh should be restarted") parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = train_lib.NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="String to provide options directly to steps/nnet3/get_egs.sh script") # chain options parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', - default = None, action = train_lib.NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="options to be be passed to chain-est-phone-lm") parser.add_argument("--chain.l2-regularize", type=float, dest='l2_regularize', default = 0.0, @@ -99,7 +99,7 @@ def GetArgs(): parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', default = 0.00001, help="") parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', - default=True, action=train_lib.StrToBoolAction, + default=True, action=common_train_lib.StrToBoolAction, choices = ["true", "false"], help="") parser.add_argument("--chain.truncate-deriv-weights", type=float, dest='truncate_deriv_weights', @@ -134,12 +134,6 @@ def GetArgs(): parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', default = 10, help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', default = 20, help="The maximum number of models used in the final" @@ -230,18 +224,18 @@ def GetArgs(): help="Specifies the stage of the experiment to execution from") parser.add_argument("--exit-stage", type=int, default=None, help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = train_lib.NullstrToNoneAction, dest="command", + parser.add_argument("--cmd", type=str, action = common_train_lib.NullstrToNoneAction, dest="command", help="Specifies the script to launch jobs." " e.g. 
queue.pl for launching on SGE cluster run.pl" " for launching on local machine", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = train_lib.StrToBoolAction, + parser.add_argument("--use-gpu", type=str, action = common_train_lib.StrToBoolAction, choices = ["true", "false"], help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = train_lib.StrToBoolAction, + parser.add_argument("--cleanup", type=str, action = common_train_lib.StrToBoolAction, choices = ["true", "false"], help="Clean up models after training", default=True) parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = train_lib.StrToBoolAction, + default = True, action = common_train_lib.StrToBoolAction, choices = ["true", "false"], help="If true, remove egs after experiment") parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", @@ -249,7 +243,7 @@ def GetArgs(): help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = train_lib.NullstrToNoneAction, + type=str, default=None, action = common_train_lib.NullstrToNoneAction, help="Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local expertise to setup. ") parser.add_argument("--reporting.interval", dest = "reporting_interval", type=int, default=0.1, @@ -291,9 +285,9 @@ def ProcessArgs(args): if args.transform_dir is None: args.transform_dir = args.lat_dir # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: - if not train_lib.CheckIfCudaCompiled(): + if not common_train_lib.CheckIfCudaCompiled(): logger.warning(""" You are running with one thread but you have not compiled for CUDA. You may be running a setup optimized for GPUs. If you have @@ -315,15 +309,6 @@ def ProcessArgs(args): return [args, run_opts] -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.parallel_train_opts = None - - def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, apply_deriv_weights, @@ -357,7 +342,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi else: cur_cache_io_opts = cache_io_opts - process_handle = train_lib.RunKaldiCommand(""" + process_handle = common_train_lib.RunKaldiCommand(""" {command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} \ --apply-deriv-weights={app_deriv_wts} \ @@ -470,14 +455,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, frame_subsampling_factor, truncate_deriv_weights, cache_io_opts, run_opts) - [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. 
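For concreteness, an illustrative example of what the model-selection step just above produces before the averaging command that follows (all values are made up):

    # Suppose 4 parallel jobs ran at iteration 7 and job 3 did not produce a
    # usable model, so GetSuccessfulModels dropped it and flagged job 2 as best.
    dir = 'exp/chain/tdnn'
    iter, models_to_average, best_model = 7, [1, 2, 4], 2
    nnets_list = ["{0}/{1}.{2}.raw".format(dir, iter + 1, n) for n in models_to_average]
    # nnets_list == ['exp/chain/tdnn/8.1.raw', 'exp/chain/tdnn/8.2.raw', 'exp/chain/tdnn/8.4.raw']
    # do_average == True  -> nnet3-average merges these into the iteration-8 model
    # do_average == False -> only 8.2.raw, the best single job, is kept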
- train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/average.{iter}.log \ nnet3-average {nnet_list} - \| \ nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl @@ -490,7 +475,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, else: # choose the best model from different jobs - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/select.{iter}.log \ nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl """.format(command = run_opts.command, @@ -529,13 +514,13 @@ def Train(args, run_opts): CheckForRequiredFiles(args.feat_dir, args.tree_dir, args.lat_dir) # Set some variables. - num_jobs = train_lib.GetNumberOfJobs(args.tree_dir) - feat_dim = train_lib.GetFeatDim(args.feat_dir) - ivector_dim = train_lib.GetIvectorDim(args.online_ivector_dir) + num_jobs = common_train_lib.GetNumberOfJobs(args.tree_dir) + feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - train_lib.SplitData(args.feat_dir, num_jobs) + common_train_lib.SplitData(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) f = open('{0}/num_jobs'.format(args.dir), 'w') f.write(str(num_jobs)) @@ -544,7 +529,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [model_left_context, model_right_context, num_hidden_layers] = train_lib.ParseModelConfigVarsFile(var_file) + [model_left_context, model_right_context, num_hidden_layers] = common_train_lib.ParseModelConfigVarsFile(var_file) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -559,7 +544,7 @@ def Train(args, run_opts): if (args.stage <= -4): logger.info("Initializing a basic network for estimating preconditioning matrix") - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw """.format(command = run_opts.command, @@ -594,7 +579,7 @@ def Train(args, run_opts): else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -603,7 +588,7 @@ def Train(args, run_opts): # copy the properties of the egs to dir for # use during decoding - train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) if (args.stage <= -2): logger.info('Computing the preconditioning matrix for input features') @@ -627,12 +612,12 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = train_lib.VerifyIterations(num_iters, args.num_epochs, + num_iters_combine = common_train_lib.VerifyIterations(num_iters, args.num_epochs, num_hidden_layers, num_archives_expanded, args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = lambda iter, current_num_jobs, num_archives_processed: train_lib.GetLearningRate(iter, current_num_jobs, num_iters, + learning_rate = lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate(iter, current_num_jobs, num_iters, num_archives_processed, num_archives_to_process, args.initial_effective_lrate, @@ -648,7 +633,7 @@ def Train(args, run_opts): if args.stage <= iter: if args.shrink_value != 1.0: model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) - shrinkage_value = args.shrink_value if train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 + shrinkage_value = args.shrink_value if common_train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 else: shrinkage_value = args.shrink_value logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) @@ -667,7 +652,7 @@ def Train(args, run_opts): args.truncate_deriv_weights, run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions - train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + common_train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, args.preserve_model_interval) if args.email is not None: @@ -677,7 +662,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, key="log-probability") message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - train_lib.SendMail(message, subject, args.email) + common_train_lib.SendMail(message, subject, args.email) num_archives_processed = 
num_archives_processed + current_num_jobs @@ -696,14 +681,14 @@ def Train(args, run_opts): # delete it remove_egs = False - train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") if args.email is not None: - train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py similarity index 91% rename from egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py rename to egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py index f012d06cca9..b2a0578772d 100644 --- a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py @@ -1,6 +1,7 @@ # Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. @@ -13,7 +14,7 @@ import os import sys -train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -24,7 +25,7 @@ logger.addHandler(handler) def GetNumberOfLeaves(dir): - [stdout, stderr] = train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + [stdout, stderr] = common_train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) parts = stdout.split() #number of pdfs 7115 assert(' '.join(parts[0:3]) == "number of pdfs") @@ -34,7 +35,7 @@ def GetNumberOfLeaves(dir): return num_leaves def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/make_phone_lm.log \ chain-est-phone-lm {lm_opts} \ "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ @@ -45,7 +46,7 @@ def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): tree_dir = tree_dir)) def CreateDenominatorFst(dir, tree_dir, run_opts): - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl {command} {dir}/log/make_den_fst.log \ chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ @@ -63,7 +64,7 @@ def GenerateChainEgs(dir, data, lat_dir, egs_dir, frames_per_iter = 20000, frames_per_eg = 20, srand = 0, egs_opts = None, cmvn_opts = None, transform_dir = None): - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" steps/nnet3/chain/get_egs.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ @@ -99,7 +100,7 @@ def GenerateChainEgs(dir, data, lat_dir, egs_dir, data = data, lat_dir = lat_dir, dir = dir, egs_dir = egs_dir, egs_opts = egs_opts if egs_opts is not None else '' )) -# this function is exactly similar to the version in nnet3_train_lib.py +# this function is exactly similar to the version in nnet3_common_train_lib.py # except it uses egs files in place of cegs files def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs = None, rand_prune = 4.0, @@ -110,7 +111,7 @@ def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, 
run_opts, # Write stats with the same format as stats for LDA. - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( @@ -124,7 +125,7 @@ def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), range(1, num_lda_jobs + 1)) - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/sum_transform_stats.log \ sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( command = run_opts.command, @@ -139,20 +140,20 @@ def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant # of an LDA transform but without dimensionality reduction. - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/get_transform.log \ nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats """.format(command = run_opts.command,dir = dir, lda_opts = lda_opts if lda_opts is not None else "")) - train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + common_train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) def PrepareInitialAcousticModel(dir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/add_first_layer.log \ nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, dir = dir)) @@ -163,7 +164,7 @@ def PrepareInitialAcousticModel(dir, run_opts): # We ensure that they have the same mode (even if someone changed the # script to make one or both of them text mode) by copying them both # before concatenating them. - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/init_mdl.log \ nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( command = run_opts.command, dir = dir)) @@ -183,7 +184,7 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, else: print('{0}: warning: model file {1} does not exist (final combination)'.format( sys.argv[0], model_file)) - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ @@ -201,14 +202,15 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. 
- ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait = False) def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ @@ -220,7 +222,7 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ @@ -238,7 +240,7 @@ def ComputeProgress(dir, iter, run_opts, wait=False): prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) - train_lib.RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py similarity index 70% rename from egs/wsj/s5/steps/nnet3/nnet3_train_lib.py rename to egs/wsj/s5/steps/nnet3/libs/common_train_lib.py index 30dcd0688ab..7c69a31f425 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py @@ -1,3 +1,9 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. 
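A note on how the relocated libraries are consumed: the driver scripts load them by file path with imp.load_source rather than as packages, so the path strings have to point at the new steps/nnet3/libs/ location, and job-launching options travel through the RunOpts container that is moved into this file just below. A small illustrative sketch of the pattern (the option values are placeholders, not from the patch):

    import imp

    # load the shared helpers by path; the relative path is resolved from the
    # recipe directory (egs/<corpus>/s5) that the training scripts are run from
    common_train_lib = imp.load_source(
        'common_train_lib', 'steps/nnet3/libs/common_train_lib.py')

    # fill the shared run-options container once and hand it to every helper
    # that launches Kaldi jobs
    run_opts = common_train_lib.RunOpts()
    run_opts.command = "run.pl"
    run_opts.train_queue_opt = ""       # e.g. a GPU queue option when training on GPU
    run_opts.combine_queue_opt = ""
    run_opts.parallel_train_opts = ""
    run_opts.prior_gpu_opt = ""
    run_opts.prior_queue_opt = ""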
+ import subprocess import logging import math @@ -13,6 +19,15 @@ handler.setFormatter(formatter) logger.addHandler(handler) +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None def SendMail(message, subject, email_id): try: @@ -298,95 +313,6 @@ def ParseGenericConfigVarsFile(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) -def GenerateEgs(data, alidir, egs_dir, - left_context, right_context, - valid_left_context, valid_right_context, - run_opts, stage = 0, - feat_type = 'raw', online_ivector_dir = None, - samples_per_iter = 20000, frames_per_eg = 20, srand = 0, - egs_opts = None, cmvn_opts = None, transform_dir = None): - - RunKaldiCommand(""" -steps/nnet3/get_egs.sh {egs_opts} \ - --cmd "{command}" \ - --cmvn-opts "{cmvn_opts}" \ - --feat-type {feat_type} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ - --valid-left-context {valid_left_context} \ - --valid-right-context {valid_right_context} \ - --stage {stage} \ - --samples-per-iter {samples_per_iter} \ - --frames-per-eg {frames_per_eg} \ - --srand {srand} \ - {data} {alidir} {egs_dir} - """.format(command = run_opts.command, - cmvn_opts = cmvn_opts if cmvn_opts is not None else '', - feat_type = feat_type, - transform_dir = transform_dir if transform_dir is not None else '', - ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', - left_context = left_context, right_context = right_context, - valid_left_context = valid_left_context, - valid_right_context = valid_right_context, - stage = stage, samples_per_iter = samples_per_iter, - frames_per_eg = frames_per_eg, srand = srand, data = data, alidir = alidir, - egs_dir = egs_dir, - egs_opts = egs_opts if egs_opts is not None else '' )) - -# This method generates egs directly from an scp file of targets, instead of -# getting them from the alignments (as with the method GenerateEgs). -# The targets are in matrix format for target_type="dense" and in posterior -# format for target_type="sparse". -# If using sparse targets, num_targets must be explicity specified. -# If using dense targets, num_targets is computed by reading the feature matrix dimension. 
-def GenerateEgsUsingTargets(data, targets_scp, egs_dir, - left_context, right_context, - valid_left_context, valid_right_context, - run_opts, stage = 0, - feat_type = 'raw', online_ivector_dir = None, - target_type = 'dense', num_targets = -1, - samples_per_iter = 20000, frames_per_eg = 20, srand = 0, - egs_opts = None, cmvn_opts = None, transform_dir = None): - if target_type == 'dense': - num_targets = GetFeatDimFromScp(targets_scp) - else: - if num_targets == -1: - raise Exception("--num-targets is required if target-type is dense") - - RunKaldiCommand(""" -steps/nnet3/get_egs_targets.sh {egs_opts} \ - --cmd "{command}" \ - --cmvn-opts "{cmvn_opts}" \ - --feat-type {feat_type} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ - --valid-left-context {valid_left_context} \ - --valid-right-context {valid_right_context} \ - --stage {stage} \ - --samples-per-iter {samples_per_iter} \ - --frames-per-eg {frames_per_eg} \ - --srand {srand} \ - --target-type {target_type} \ - --num-targets {num_targets} \ - {data} {targets_scp} {egs_dir} - """.format(command = run_opts.egs_command, - cmvn_opts = cmvn_opts if cmvn_opts is not None else '', - feat_type = feat_type, - transform_dir = transform_dir if transform_dir is not None else '', - ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', - left_context = left_context, right_context = right_context, - valid_left_context = valid_left_context, - valid_right_context = valid_right_context, - stage = stage, samples_per_iter = samples_per_iter, - frames_per_eg = frames_per_eg, srand = srand, - num_targets = num_targets, - data = data, - targets_scp = targets_scp, target_type = target_type, - egs_dir = egs_dir, - egs_opts = egs_opts if egs_opts is not None else '' )) - def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) @@ -406,50 +332,6 @@ def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): except IOError, ValueError: raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir)) -def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, - max_lda_jobs = None, rand_prune = 4.0, - lda_opts = None): - if max_lda_jobs is not None: - if num_lda_jobs > max_lda_jobs: - num_lda_jobs = max_lda_jobs - - RunKaldiCommand(""" -{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ - nnet3-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format( - command = run_opts.command, - num_lda_jobs = num_lda_jobs, - dir = dir, - egs_dir = egs_dir, - rand_prune = rand_prune)) - - # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) - - RunKaldiCommand(""" -{command} {dir}/log/sum_transform_stats.log \ - sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( - command = run_opts.command, - dir = dir, lda_stat_files = " ".join(lda_stat_files))) - - for file in lda_stat_files: - try: - os.remove(file) - except OSError: - raise Exception("There was error while trying to remove lda stat files.") - # this computes a fixed affine transform computed in the way we described in - # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant - # of an LDA transform but without dimensionality reduction. 
- - RunKaldiCommand(""" -{command} {dir}/log/get_transform.log \ - nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats - """.format(command = run_opts.command,dir = dir, - lda_opts = lda_opts if lda_opts is not None else "")) - - ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) - import os, errno def ForceSymlink(file1, file2): @@ -499,21 +381,6 @@ def SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = return scaled_counts -def PrepareInitialAcousticModel(dir, alidir, run_opts): - """ Adds the first layer; this will also add in the lda.mat and - presoftmax_prior_scale.vec. It will also prepare the acoustic model - with the transition model.""" - - PrepareInitialNetwork(dir, run_opts) - - # Convert to .mdl, train the transitions, set the priors. - RunKaldiCommand(""" -{command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ - nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl - """.format(command = run_opts.command, - dir = dir, alidir = alidir)) - def PrepareInitialNetwork(dir, run_opts): RunKaldiCommand(""" {command} {dir}/log/add_first_layer.log \ @@ -661,114 +528,6 @@ def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, - wait = False, get_raw_nnet_from_am = True): - - if get_raw_nnet_from_am: - model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter) - else: - model = "{dir}/{iter}.raw".format(dir = dir, iter = iter) - - RunKaldiCommand(""" -{command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-compute-prob {compute_prob_opts} "{model}" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" - """.format(command = run_opts.command, - dir = dir, - iter = iter, - mb_size = mb_size, - model = model, - compute_prob_opts = compute_prob_opts, - egs_dir = egs_dir), wait = wait) - - RunKaldiCommand(""" -{command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-compute-prob {compute_prob_opts} "{model}" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" - """.format(command = run_opts.command, - dir = dir, - iter = iter, - mb_size = mb_size, - model = model, - compute_prob_opts = compute_prob_opts, - egs_dir = egs_dir), wait = wait) - -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, - get_raw_nnet_from_am = True): - if get_raw_nnet_from_am: - prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter - 1) - model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter) - else: - prev_model = '{0}/{1}.raw'.format(dir, iter - 1) - model = '{0}/{1}.raw'.format(dir, iter) - - RunKaldiCommand(""" -{command} {dir}/log/progress.{iter}.log \ -nnet3-info {model} '&&' \ -nnet3-show-progress --use-gpu=no {prev_model} {model} \ -"ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" - """.format(command = run_opts.command, - dir = dir, - iter = iter, - model = model, - mb_size = mb_size, - prev_model = prev_model, - egs_dir = egs_dir), wait = wait) - -def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None, - get_raw_nnet_from_am = True): - # Now do combination. 
In the nnet3 setup, the logic - # for doing averaging of subsets of the models in the case where - # there are too many models to reliably esetimate interpolation - # factors (max_models_combine) is moved into the nnet3-combine - raw_model_strings = [] - print num_iters_combine - for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - if get_raw_nnet_from_am: - model_file = '{0}/{1}.mdl'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) - else: - model_file = '{0}/{1}.raw'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append(model_file) - - if chunk_width is not None: - # this is an RNN model - mbsize = int(1024.0/(chunk_width)) - else: - mbsize = 1024 - - if get_raw_nnet_from_am: - out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) - else: - out_model = '{dir}/final.raw'.format(dir = dir) - - RunKaldiCommand(""" -{command} {combine_queue_opt} {dir}/log/combine.log \ -nnet3-combine --num-iters=40 \ - --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ - {out_model} - """.format(command = run_opts.command, - combine_queue_opt = run_opts.combine_queue_opt, - dir = dir, raw_models = " ".join(raw_model_strings), - mbsize = mbsize, - out_model = out_model, - egs_dir = egs_dir)) - - # Compute the probability of the final, combined model with - # the same subset we used for the previous compute_probs, as the - # different subsets will lead to different probs. - if get_raw_nnet_from_am: - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) - else: - ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, - wait = False, get_raw_nnet_from_am = False) - def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, prior_subset_size, run_opts, get_raw_nnet_from_am = True): diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py index cc885e4bc12..eaa10ac381b 100644 --- a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -11,7 +11,7 @@ import logging import imp -nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -50,7 +50,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, # computation-requests) during training. 
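A sketch of the cache handover that the comment above refers to (illustrative; the --read-cache option string is an assumption mirroring the --write-cache option used below): each iteration consumes the compiled-computation cache written for it and writes one for the next iteration, presumably to avoid recompiling the same computation graphs, and the consumed file is removed once the iteration finishes.

    # iteration `iter` reads cache.{iter} (written by the previous iteration, if any)
    # and writes cache.{iter + 1} for the next one
    cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter)
    cache_write_opt = "--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter + 1)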
cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - process_handle = nnet3_train_lib.RunKaldiCommand(""" + process_handle = common_train_lib.RunKaldiCommand(""" {command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ @@ -114,13 +114,13 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives - nnet3_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + common_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) if iter > 0: # Runs in the background - nnet3_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, + common_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) @@ -169,14 +169,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, momentum, max_param_change, shuffle_buffer_size, cur_num_chunk_per_minibatch, cache_read_opt, run_opts) - [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. - nnet3_train_lib.GetAverageNnetModel( + common_train_lib.GetAverageNnetModel( dir = dir, iter = iter, nnets_list = " ".join(nnets_list), run_opts = run_opts, @@ -185,7 +185,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, else: # choose the best model from different jobs - nnet3_train_lib.GetBestNnetModel( + common_train_lib.GetBestNnetModel( dir = dir, iter = iter, best_model_index = best_model, run_opts = run_opts, diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index 4dc98acfee5..ff77ed93ec8 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -12,7 +12,7 @@ import math import imp -nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -25,25 +25,25 @@ def AddCommonTrainArgs(parser): # feat options parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="""directory with the ivectors extracted in an online fashion.""") parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="A string specifying '--norm-means' and '--norm-vars' values") # egs extraction options parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="""String to provide options directly to steps/nnet3/get_egs.sh script""") parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, 
help="""Directory with egs. If specified this directory will be used rather than extracting egs""") parser.add_argument("--egs.stage", type=int, dest='egs_stage', default = 0, help="Stage at which get_egs.sh should be restarted") parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, + default = None, action = common_train_lib.NullstrToNoneAction, help="""String to provide options directly to steps/nnet3/get_egs.sh script""") # trainer options @@ -124,23 +124,23 @@ def AddCommonTrainArgs(parser): help="Specifies the stage of the experiment to execution from") parser.add_argument("--exit-stage", type=int, default=None, help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + parser.add_argument("--cmd", type=str, action = common_train_lib.NullstrToNoneAction, dest = "command", help="""Specifies the script to launch jobs. e.g. queue.pl for launching on SGE cluster run.pl for launching on local machine """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + parser.add_argument("--egs.cmd", type=str, action = common_train_lib.NullstrToNoneAction, dest = "egs_command", help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + parser.add_argument("--use-gpu", type=str, action = common_train_lib.StrToBoolAction, choices = ["true", "false"], help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + parser.add_argument("--cleanup", type=str, action = common_train_lib.StrToBoolAction, choices = ["true", "false"], help="Clean up models after training", default=True) parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, + default = True, action = common_train_lib.StrToBoolAction, choices = ["true", "false"], help="""If true, remove egs after experiment""") parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", @@ -148,7 +148,7 @@ def AddCommonTrainArgs(parser): help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, + type=str, default=None, action = common_train_lib.NullstrToNoneAction, help=""" Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The @@ -158,16 +158,6 @@ def AddCommonTrainArgs(parser): type=int, default=0.1, help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - # this is the main method which differs between RNN and DNN training def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, @@ -197,7 +187,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, # computation-requests) during training. 
cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - process_handle = nnet3_train_lib.RunKaldiCommand(""" + process_handle = common_train_lib.RunKaldiCommand(""" {command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ @@ -261,15 +251,13 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives - nnet3_train_lib.ComputeTrainCvProbabilities( - dir, iter, egs_dir, run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am) + train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) if iter > 0: # Runs in the background - nnet3_train_lib.ComputeProgress( - dir, iter, egs_dir, run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am) + train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. @@ -312,27 +300,27 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: pass - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - cache_read_opt, run_opts) - [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + train_lib.TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts) + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. - nnet3_train_lib.GetAverageNnetModel( + common_train_lib.GetAverageNnetModel( dir = dir, iter = iter, nnets_list = " ".join(nnets_list), run_opts = run_opts, get_raw_nnet_from_am = get_raw_nnet_from_am) else: # choose the best model from different jobs - nnet3_train_lib.GetBestNnetModel( + common_train_lib.GetBestNnetModel( dir = dir, iter = iter, best_model_index = best_model, run_opts = run_opts, @@ -355,3 +343,262 @@ def TrainOneIteration(dir, iter, srand, egs_dir, raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): os.remove("{0}/cache.{1}".format(dir, iter)) + +def GenerateEgs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + samples_per_iter = 20000, frames_per_eg = 20, srand = 0, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + common_train_lib.RunKaldiCommand(""" +steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + {data} {alidir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, srand = srand, data = data, alidir = alidir, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +# This method generates egs directly from an scp file of targets, instead of +# getting them from the alignments (as with the method GenerateEgs). +# The targets are in matrix format for target_type="dense" and in posterior +# format for target_type="sparse". +# If using sparse targets, num_targets must be explicity specified. +# If using dense targets, num_targets is computed by reading the feature matrix dimension. 
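The dimension handling that the comment above describes amounts to the following minimal sketch (not the patch's code; note the fallback branch is reached when target_type is not 'dense', so the sketch's error message names sparse targets):

    def resolve_num_targets(target_type, targets_scp, num_targets=-1):
        # dense targets: read the target dimension off the target matrices
        if target_type == 'dense':
            return common_train_lib.GetFeatDimFromScp(targets_scp)
        # sparse (posterior-format) targets: the dimension must be given explicitly
        if num_targets == -1:
            raise Exception("--num-targets is required if target-type is sparse")
        return num_targets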
+def GenerateEgsUsingTargets(data, targets_scp, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + target_type = 'dense', num_targets = -1, + samples_per_iter = 20000, frames_per_eg = 20, srand = 0, + egs_opts = None, cmvn_opts = None, transform_dir = None): + if target_type == 'dense': + num_targets = common_train_lib.GetFeatDimFromScp(targets_scp) + else: + if num_targets == -1: + raise Exception("--num-targets is required if target-type is dense") + + common_train_lib.RunKaldiCommand(""" +steps/nnet3/get_egs_targets.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + --target-type {target_type} \ + --num-targets {num_targets} \ + {data} {targets_scp} {egs_dir} + """.format(command = run_opts.egs_command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, srand = srand, + num_targets = num_targets, + data = data, + targets_scp = targets_scp, target_type = target_type, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + common_train_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
+ + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + common_train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + + +def PrepareInitialAcousticModel(dir, alidir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + common_train_lib.PrepareInitialNetwork(dir, run_opts) + + # Convert to .mdl, train the transitions, set the priors. + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) + + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, + wait = False, get_raw_nnet_from_am = True): + + if get_raw_nnet_from_am: + model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter) + else: + model = "{dir}/{iter}.raw".format(dir = dir, iter = iter) + + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob {compute_prob_opts} "{model}" \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + mb_size = mb_size, + model = model, + compute_prob_opts = compute_prob_opts, + egs_dir = egs_dir), wait = wait) + + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob {compute_prob_opts} "{model}" \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + mb_size = mb_size, + model = model, + compute_prob_opts = compute_prob_opts, + egs_dir = egs_dir), wait = wait) + +def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, + get_raw_nnet_from_am = True): + if get_raw_nnet_from_am: + prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter - 1) + model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter) + else: + prev_model = '{0}/{1}.raw'.format(dir, iter - 1) + model = '{0}/{1}.raw'.format(dir, iter) + + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-info {model} '&&' \ +nnet3-show-progress --use-gpu=no {prev_model} {model} \ +"ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + mb_size = mb_size, + prev_model = prev_model, + egs_dir = egs_dir), wait = wait) + +def CombineModels(dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = None, + get_raw_nnet_from_am = True): + # Now do combination. 
In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + print num_iters_combine + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + if get_raw_nnet_from_am: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + model_file = '{0}/{1}.raw'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append(model_file) + + if chunk_width is not None: + # this is an RNN model + mbsize = int(1024.0/(chunk_width)) + else: + mbsize = 1024 + + if get_raw_nnet_from_am: + out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) + else: + out_model = '{dir}/final.raw'.format(dir = dir) + + common_train_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + {out_model} + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + dir = dir, raw_models = " ".join(raw_model_strings), + mbsize = mbsize, + out_model = out_model, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + if get_raw_nnet_from_am: + train_lib.ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + else: + train_lib.ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, + wait = False, get_raw_nnet_from_am = False) + diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 4ad91d354f8..73ddeb4b6ad 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -16,10 +16,10 @@ import logging import imp import traceback -from nnet3_train_lib import * nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -85,9 +85,9 @@ def ProcessArgs(args): args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = train_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: - if not CheckIfCudaCompiled(): + if not common_train_lib.CheckIfCudaCompiled(): logger.warning(""" You are running with one thread but you have not compiled for CUDA. You may be running a setup optimized for GPUs. If you have @@ -121,14 +121,14 @@ def Train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. 
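One detail of CombineModels above that is easy to miss: when chunk_width is set (i.e. for recurrent models) the combination minibatch size is scaled down, presumably to keep the number of frames per minibatch roughly constant. A worked example (illustrative values):

    # chunk_width is the frames-per-chunk of the RNN egs; it is None for DNNs
    chunk_width = 20
    mbsize = int(1024.0 / chunk_width) if chunk_width is not None else 1024
    # -> 51 chunks of 20 frames each (about 1020 frames), instead of a
    #    minibatch of 1024 examples in the feed-forward case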
- num_leaves = GetNumberOfLeaves(args.ali_dir) - num_jobs = GetNumberOfJobs(args.ali_dir) - feat_dim = GetFeatDim(args.feat_dir) - ivector_dim = GetIvectorDim(args.online_ivector_dir) + num_leaves = common_train_lib.GetNumberOfLeaves(args.ali_dir) + num_jobs = common_train_lib.GetNumberOfJobs(args.ali_dir) + feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - SplitData(args.feat_dir, num_jobs) + common_train_lib.SplitData(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) f = open('{0}/num_jobs'.format(args.dir), 'w') f.write(str(num_jobs)) @@ -137,7 +137,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.ParseGenericConfigVarsFile(var_file) # Set some variables. @@ -155,7 +155,7 @@ def Train(args, run_opts): if (args.stage <= -5): logger.info("Initializing a basic network for estimating preconditioning matrix") - RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw """.format(command = run_opts.command, @@ -165,24 +165,27 @@ def Train(args, run_opts): if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") - GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, - left_context, right_context, - left_context, right_context, run_opts, - frames_per_eg = args.frames_per_eg, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage) + train_lib.GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, + frames_per_eg, num_archives] = ( + common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, + left_context, right_context) ) assert(args.frames_per_eg == frames_per_eg) if (args.num_jobs_final > num_archives): @@ -190,27 +193,29 @@ def Train(args, run_opts): # copy the properties of the egs to dir for # use during decoding - CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) if (args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + common_train_lib.ComputePreconditioningMatrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) if (args.stage <= -2): logger.info("Computing initial vector for FixedScaleComponent before" " 
softmax, using priors^{prior_scale} and rescaling to" " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) - ComputePresoftmaxPriorScale(args.dir, args.ali_dir, num_jobs, run_opts, - presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + common_train_lib.ComputePresoftmaxPriorScale( + args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + train_lib.PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) # set num_iters so that as close as possible, we process the data $num_epochs @@ -221,16 +226,18 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_iters_combine = common_train_lib.VerifyIterations( + num_iters, args.num_epochs, num_hidden_layers, num_archives_expanded, args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate)) logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -264,8 +271,9 @@ def Train(args, run_opts): run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions - RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval) + common_train_lib.RemoveModel( + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -274,23 +282,25 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - SendMail(message, subject, args.email) + common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + avg_post_vec_file = train_lib.ComputeAveragePosterior( + args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir = args.dir) final_model = "{dir}/final.mdl".format(dir = args.dir) - AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + train_lib.AdjustAmPriors(args.dir, combined_model, 
avg_post_vec_file, + final_model, run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -300,14 +310,14 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs) + common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -322,7 +332,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - SendMail(message, message, args.email) + common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index fc0b2f6200d..269d8e545ae 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -16,10 +16,10 @@ import logging import imp import traceback -from nnet3_train_lib import * nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -55,7 +55,7 @@ def GetArgs(): parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, + parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, default = True, choices = ["true", "false"], help="Train neural network using dense targets") parser.add_argument("--feat-dir", type=str, required = True, @@ -83,9 +83,9 @@ def ProcessArgs(args): directory which is the output of make_configs.py script""") # set the options corresponding to args.use_gpu - run_opts = train_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: - if not CheckIfCudaCompiled(): + if not common_train_lib.CheckIfCudaCompiled(): logger.warning(""" You are running with one thread but you have not compiled for CUDA. You may be running a setup optimized for GPUs. If you have @@ -119,16 +119,16 @@ def Train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. - feat_dim = GetFeatDim(args.feat_dir) - ivector_dim = GetIvectorDim(args.online_ivector_dir) + feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) # split the training data into parts for individual jobs - SplitData(args.feat_dir, args.nj) + common_train_lib.SplitData(args.feat_dir, args.nj) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.ParseGenericConfigVarsFile(var_file) # Set some variables. 
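# --- Illustrative sketch, not part of the patch ---
# Why the scripts above compute an average posterior vector and then "adjust the
# priors": in hybrid decoding the network's posteriors p(s|x) are divided by the
# class priors p(s) to obtain pseudo-likelihoods p(x|s) up to a constant.  The
# real work is done by ComputeAveragePosterior()/AdjustAmPriors() and the nnet3
# binaries; this sketch only shows the arithmetic, with made-up data.
import numpy as np

def estimate_priors(posterior_matrix):
    # posterior_matrix: (num_frames, num_pdfs) posteriors over a held-out subset
    # (cf. --prior-subset-size); the per-pdf average acts as the prior estimate.
    priors = posterior_matrix.mean(axis=0)
    return priors / priors.sum()

def pseudo_log_likelihoods(log_posteriors, priors, floor=1e-20):
    # log p(x|s) + const = log p(s|x) - log p(s)
    return log_posteriors - np.log(np.maximum(priors, floor))

if __name__ == '__main__':
    post = np.random.dirichlet([1.0] * 5, size=100)   # fake posteriors, 5 pdfs
    priors = estimate_priors(post)
    loglikes = pseudo_log_likelihoods(np.log(post), priors)
    print(loglikes.shape)                             # (100, 5)
# --- end sketch ---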
@@ -137,8 +137,8 @@ def Train(args, run_opts): right_context = variables['model_right_context'] num_hidden_layers = variables['num_hidden_layers'] num_targets = int(variables['num_targets']) - add_lda = StrToBool(variables['add_lda']) - include_log_softmax = StrToBool(variables['include_log_softmax']) + add_lda = common_train_lib.StrToBool(variables['add_lda']) + include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) objective_type = variables['objective_type'] except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( @@ -149,13 +149,13 @@ def Train(args, run_opts): # transform. if args.use_dense_targets: - if GetFeatDimFromScp(targets_scp) != num_targets: + if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: raise Exception("Mismatch between num-targets provided to " "script vs configs") if (args.stage <= -5): logger.info("Initializing a basic network for estimating preconditioning matrix") - RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw """.format(command = run_opts.command, @@ -165,34 +165,36 @@ def Train(args, run_opts): if args.use_dense_targets: target_type = "dense" - compute_accuracy = False else: target_type = "sparse" - compute_accuracy = True if objective_type == "linear" else False if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") - GenerateEgsUsingTargets(args.feat_dir, args.targets_scp, default_egs_dir, - left_context, right_context, - left_context, right_context, run_opts, - frames_per_eg = args.frames_per_eg, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage, - target_type = target_type, - num_targets = num_targets) + train_lib.GenerateEgsUsingTargets( + args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) if args.egs_dir is None: egs_dir = default_egs_dir else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, + frames_per_eg, num_archives] = ( + common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, + left_context, right_context) ) assert(args.frames_per_eg == frames_per_eg) if (args.num_jobs_final > num_archives): @@ -200,19 +202,20 @@ def Train(args, run_opts): # copy the properties of the egs to dir for # use during decoding - CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) if (add_lda and args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + common_train_lib.ComputePreconditioningMatrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + 
rand_prune = args.rand_prune) if (args.stage <= -1): logger.info("Preparing the initial network.") - PrepareInitialNetwork(args.dir, run_opts) + common_train_lib.PrepareInitialNetwork(args.dir, run_opts) # set num_iters so that as close as possible, we process the data $num_epochs @@ -223,16 +226,18 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_iters_combine = common_train_lib.VerifyIterations( + num_iters, args.num_epochs, num_hidden_layers, num_archives_expanded, args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate)) logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -242,7 +247,7 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) @@ -264,12 +269,12 @@ def Train(args, run_opts): max_param_change = args.max_param_change, shuffle_buffer_size = args.shuffle_buffer_size, run_opts = run_opts, - compute_accuracy = compute_accuracy, get_raw_nnet_from_am = False) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions - RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, get_raw_nnet_from_am = False) + common_train_lib.RemoveModel( + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval, get_raw_nnet_from_am = False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -278,19 +283,21 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - SendMail(message, subject, args.email) + common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: - logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy) + logger.info("Doing final combination to produce final.raw") + train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, + run_opts, get_raw_nnet_from_am = False) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, 
run_opts, get_raw_nnet_from_am = False) + avg_post_vec_file = train_lib.ComputeAveragePosterior( + args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, + get_raw_nnet_from_am = False) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -300,15 +307,15 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs, - get_raw_nnet_from_am = False) + common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs, + get_raw_nnet_from_am = False) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -323,7 +330,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - SendMail(message, message, args.email) + common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 5842e63474e..44486f907da 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -13,11 +13,11 @@ import logging import imp import traceback -from nnet3_train_lib import * nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -101,7 +101,7 @@ def GetArgs(): parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, + parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, default = True, choices = ["true", "false"], help="Train neural network using dense targets") parser.add_argument("--feat-dir", type=str, required = True, @@ -135,9 +135,9 @@ def ProcessArgs(args): directory which is the output of make_configs.py script""") # set the options corresponding to args.use_gpu - run_opts = train_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: - if not CheckIfCudaCompiled(): + if not common_train_lib.CheckIfCudaCompiled(): logger.warning(""" You are running with one thread but you have not compiled for CUDA. You may be running a setup optimized for GPUs. If you have @@ -171,16 +171,16 @@ def Train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. 
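# --- Illustrative sketch, not part of the patch ---
# Iteration bookkeeping used by these training scripts.  The expressions below
# are taken from the surrounding code: num_iters is chosen so that, with the
# number of parallel jobs ramping linearly from num_jobs_initial to
# num_jobs_final, roughly num_epochs * num_archives archives get processed.

def num_iterations(num_archives_to_process, num_jobs_initial, num_jobs_final):
    # same expression as in the scripts (integer division is intended)
    return (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)

def current_num_jobs(iter, num_iters, num_jobs_initial, num_jobs_final):
    # linear ramp of the number of parallel jobs over training
    return int(0.5 + num_jobs_initial
               + (num_jobs_final - num_jobs_initial) * float(iter) / num_iters)

if __name__ == '__main__':
    # e.g. 10 epochs over 120 archives, ramping from 2 to 8 parallel jobs:
    to_process = 10 * 120
    n_iters = num_iterations(to_process, 2, 8)
    print(n_iters)                                                    # 240
    print([current_num_jobs(i, n_iters, 2, 8)
           for i in (0, n_iters // 2, n_iters - 1)])                  # [2, 5, 8]
# --- end sketch ---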
- feat_dim = GetFeatDim(args.feat_dir) - ivector_dim = GetIvectorDim(args.online_ivector_dir) + feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) # split the training data into parts for individual jobs - SplitData(args.feat_dir, args.nj) + common_train_lib.SplitData(args.feat_dir, args.nj) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.ParseGenericConfigVarsFile(var_file) # Set some variables. @@ -189,8 +189,8 @@ def Train(args, run_opts): model_right_context = variables['model_right_context'] num_hidden_layers = variables['num_hidden_layers'] num_targets = int(variables['num_targets']) - add_lda = StrToBool(variables['add_lda']) - include_log_softmax = StrToBool(variables['include_log_softmax']) + add_lda = common_train_lib.StrToBool(variables['add_lda']) + include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) objective_type = variables['objective_type'] except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( @@ -205,13 +205,13 @@ def Train(args, run_opts): # transform. if args.use_dense_targets: - if GetFeatDimFromScp(args.targets_scp) != num_targets: + if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: raise Exception("Mismatch between num-targets provided to " "script vs configs") if (args.stage <= -4): logger.info("Initializing a basic network") - RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw """.format(command = run_opts.command, @@ -221,35 +221,37 @@ def Train(args, run_opts): if args.use_dense_targets: target_type = "dense" - compute_accuracy = False else: target_type = "sparse" - compute_accuracy = True if objective_type == "linear" else False if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") - GenerateEgsUsingTargets(args.feat_dir, args.targets_scp, default_egs_dir, - left_context, right_context, - args.chunk_width + left_context, - args.chunk_width + right_context, run_opts, - frames_per_eg = args.chunk_width, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage, - target_type = target_type, - num_targets = num_targets) + train_lib.GenerateEgsUsingTargets( + args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) if args.egs_dir is None: egs_dir = default_egs_dir else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, + frames_per_eg, num_archives] = ( + common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, + left_context, right_context) ) assert(args.chunk_width == frames_per_eg) if 
(args.num_jobs_final > num_archives): @@ -257,19 +259,20 @@ def Train(args, run_opts): # copy the properties of the egs to dir for # use during decoding - CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) if (add_lda and args.stage <= -2): logger.info('Computing the preconditioning matrix for input features') - ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + common_train_lib.ComputePreconditioningMatrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) if (args.stage <= -1): - logger.info("Preparing the initial acoustic model.") - PrepareInitialNetwork(args.dir, run_opts) + logger.info("Preparing the initial network.") + common_train_lib.PrepareInitialNetwork(args.dir, run_opts) # set num_iters so that as close as possible, we process the data $num_epochs @@ -279,16 +282,18 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_iters_combine = common_train_lib.VerifyIterations( + num_iters, args.num_epochs, num_hidden_layers, num_archives, args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate)) if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width else: @@ -306,7 +311,11 @@ def Train(args, run_opts): if args.stage <= iter: model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) - shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, get_raw_nnet_from_am = False) else 1 + shrinkage_value = (args.shrink_value + if DoShrinkage(iter, model_file, "Lstm*", + "SigmoidComponent", args.shrink_threshold, + get_raw_nnet_from_am = False) + else 1) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) rnn_train_lib.TrainOneIteration( @@ -335,8 +344,9 @@ def Train(args, run_opts): if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions - RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, get_raw_nnet_from_am = False) + common_train_lib.RemoveModel( + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval, get_raw_nnet_from_am = False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -345,19 +355,22 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - SendMail(message, subject, args.email) + common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: 
logger.info("Doing final combination to produce final.raw") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - chunk_width = args.chunk_width, get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy) + train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = args.chunk_width, + get_raw_nnet_from_am = False) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am = False) + avg_post_vec_file = train_lib.ComputeAveragePosterior( + args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, + get_raw_nnet_from_am = False) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -367,15 +380,15 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs, - get_raw_nnet_from_am = False) + common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs, + get_raw_nnet_from_am = False) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -390,7 +403,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - SendMail(message, message, args.email) + common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index a5679800db6..524dd33413d 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -16,11 +16,11 @@ import logging import imp import traceback -from nnet3_train_lib import * nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -137,9 +137,9 @@ def ProcessArgs(args): args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = train_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: - if not CheckIfCudaCompiled(): + if not common_train_lib.CheckIfCudaCompiled(): logger.warning(""" You are running with one thread but you have not compiled for CUDA. You may be running a setup optimized for GPUs. If you have @@ -173,14 +173,14 @@ def Train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. 
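# --- Illustrative sketch, not part of the patch ---
# The "shrinkage" decision used in the RNN training loops above: when the
# average derivatives at the sigmoid non-linearities fall below
# --trainer.optimization.shrink-threshold (i.e. the sigmoids are saturating),
# the parameters are scaled by --trainer.optimization.shrink-value.  The real
# check lives in DoShrinkage(), which inspects nnet3 model info; here the
# per-component statistics are passed in directly, and using the minimum over
# components is this sketch's assumption, not necessarily DoShrinkage()'s rule.

def choose_shrinkage(mean_sigmoid_derivs, shrink_threshold, shrink_value):
    # mean_sigmoid_derivs: average derivative of each SigmoidComponent
    # (a saturated sigmoid has a small derivative).
    if not mean_sigmoid_derivs:
        return 1.0
    if min(mean_sigmoid_derivs) < shrink_threshold:
        return shrink_value     # shrink the parameters this iteration
    return 1.0                  # leave the model as it is

if __name__ == '__main__':
    print(choose_shrinkage([0.21, 0.18, 0.09], 0.15, 0.99))   # 0.99, shrink
    print(choose_shrinkage([0.30, 0.28, 0.26], 0.15, 0.99))   # 1.0, no shrink
# --- end sketch ---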
- num_leaves = GetNumberOfLeaves(args.ali_dir) - num_jobs = GetNumberOfJobs(args.ali_dir) - feat_dim = GetFeatDim(args.feat_dir) - ivector_dim = GetIvectorDim(args.online_ivector_dir) + num_leaves = common_train_lib.GetNumberOfLeaves(args.ali_dir) + num_jobs = common_train_lib.GetNumberOfJobs(args.ali_dir) + feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - SplitData(args.feat_dir, num_jobs) + common_train_lib.SplitData(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) f = open('{0}/num_jobs'.format(args.dir), 'w') f.write(str(num_jobs)) @@ -189,7 +189,7 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.ParseGenericConfigVarsFile(var_file) # Set some variables. @@ -211,7 +211,7 @@ def Train(args, run_opts): if (args.stage <= -4): logger.info("Initializing a basic network for estimating preconditioning matrix") - RunKaldiCommand(""" + common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw """.format(command = run_opts.command, @@ -221,25 +221,28 @@ def Train(args, run_opts): if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") - GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, - left_context, right_context, - args.chunk_width + left_context, - args.chunk_width + right_context, run_opts, - frames_per_eg = args.chunk_width, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage) + train_lib.GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, + frames_per_eg, num_archives] = ( + common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, + left_context, right_context) ) assert(args.chunk_width == frames_per_eg) if (args.num_jobs_final > num_archives): @@ -247,18 +250,19 @@ def Train(args, run_opts): # copy the properties of the egs to dir for # use during decoding - CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) if (args.stage <= -2): logger.info('Computing the preconditioning matrix for input features') - ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + common_train_lib.ComputePresoftmaxPriorScale( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) if (args.stage <= -1): 
logger.info("Preparing the initial acoustic model.") - PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + train_lib.PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) # set num_iters so that as close as possible, we process the data $num_epochs @@ -268,16 +272,18 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_iters_combine = common_train_lib.VerifyIterations( + num_iters, args.num_epochs, num_hidden_layers, num_archives, args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate)) if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -296,7 +302,11 @@ def Train(args, run_opts): if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) - shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 + shrinkage_value = (args.shrink_value + if common_train_lib.DoShrinkage(iter, model_file, + "SigmoidComponent", + args.shrink_threshold) + else 1) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) rnn_train_lib.TrainOneIteration( @@ -323,8 +333,9 @@ def Train(args, run_opts): if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions - RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval) + common_train_lib.RemoveModel( + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -333,24 +344,26 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - SendMail(message, subject, args.email) + common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - chunk_width = args.chunk_width) + train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = args.chunk_width) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + avg_post_vec_file = train_lib.ComputeAveragePosterior( + args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir = args.dir) final_model = "{dir}/final.mdl".format(dir = args.dir) - 
AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + train_lib.AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, + final_model, run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -360,14 +373,14 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs) + common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) @@ -382,7 +395,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - SendMail(message, message, args.email) + common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e From eb879a0540e5cc27febcefa66ee0f85355a61e1c Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 25 Oct 2016 09:54:19 -0400 Subject: [PATCH 14/71] raw_python_script: Partial changes to raw nnet3 --- egs/wsj/s5/steps/nnet3/chain/train.py | 467 ++++-------------- .../s5/steps/nnet3/libs/chain_train_lib.py | 198 +++++++- .../s5/steps/nnet3/libs/common_train_lib.py | 155 +++++- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 136 ----- egs/wsj/s5/steps/nnet3/train_dnn.py | 35 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 66 +-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 76 +-- egs/wsj/s5/steps/nnet3/train_rnn.py | 51 +- 8 files changed, 576 insertions(+), 608 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 7d2ca029c05..f00beaeed8b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -5,7 +5,7 @@ # Apache 2.0. -# this script is based on steps/nnet3/lstm/train.sh +# this script is based on steps/nnet3/chain/train.sh import os import subprocess @@ -18,7 +18,7 @@ import shutil import math -common_train_lib = imp.load_source('ctl', 'steps/nnet3/libs/common_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') chain_lib = imp.load_source('ncl', 'steps/nnet3/libs/chain_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') @@ -37,45 +37,15 @@ def GetArgs(): parser = argparse.ArgumentParser(description=""" Trains RNN and DNN acoustic models using the 'chain' objective function. 
""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="directory with the ivectors extracted in an online fashion.") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = common_train_lib.NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") + common_train_lib.AddCommonTrainArgs(parser) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', default = 150, help="Number of output labels in each example. Caution: if you double this you should halve --trainer.samples-per-iter.") - parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default = 0, - help="Number of additional frames of input to the left" - " of the input chunk. This extra context will be used" - " in the estimation of RNN state before prediction of" - " the first label. In the case of FF-DNN this extra" - " context will be used to allow for frame-shifts") - parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', - default = 0, - help="Number of additional frames of input to the right" - " of the input chunk. This extra context will be used" - " in the estimation of bidirectional RNN state before" - " prediction of the first label.") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="String to provide options directly to steps/nnet3/get_egs.sh script") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="Directory with egs. If specified this directory " - "will be used rather than extracting egs") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = -6, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = common_train_lib.NullstrToNoneAction, - help="String to provide options directly to steps/nnet3/get_egs.sh script") # chain options parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', @@ -125,51 +95,14 @@ def GetArgs(): # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', default = 10, help="Number of epochs to train the model") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final" - " model combination stage. These models will themselves" - " be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help="Controls randomization of the samples on each" - " iteration. 
If 0 or a large value the randomization is" - " complete, but this will consume memory and cause spikes" - " in disk I/O. Smaller is easier on disk and memory but" - " less random. It's not a huge deal though, as samples" - " are anyway randomized right at the start. (the point" - " of this is to get data in different minibatches on" - " different iterations, since in the preconditioning" - " method, 2 samples in the same minibatch can affect" - " each others' gradients.") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - " during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="The maximum change in parameters allowed per" - " minibatch, measured in Frobenius norm over the entire model") parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', default=800000, help ="Each iteration of training, see this many [input]" " frames per job. This option is passed to get_egs.sh." " Aim for about a minute of training time") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="Value used in preconditioning matrix estimation") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="Max number of jobs used for LDA stats accumulation") # Parameters for the optimization parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', @@ -178,23 +111,6 @@ def GetArgs(): parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', default = 0.00002, help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at" - " the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = "The is the maximum number of models we give to" - " the final 'combine' stage, but these models will" - " themselves be averages of iteration-number ranges.") - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="Momentum used in update computation." 
- " Note: we implemented it in such a way that it doesn't" - " increase the effective learning rate.") parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', default = 1.0, help="Scaling factor used for scaling the parameter" @@ -215,44 +131,15 @@ def GetArgs(): " shrink-nonlinearity type") # RNN specific trainer options - parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', - default=512, + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, help="Number of sequences to be processed in parallel every minibatch" ) # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = common_train_lib.NullstrToNoneAction, dest="command", - help="Specifies the script to launch jobs." - " e.g. queue.pl for launching on SGE cluster run.pl" - " for launching on local machine", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="If true, remove egs after experiment") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = common_train_lib.NullstrToNoneAction, - help="Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local expertise to setup. ") - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") - parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--tree-dir", type=str, required = True, - help="Languade directory") + help="Tree directory") parser.add_argument("--lat-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -309,202 +196,6 @@ def ProcessArgs(args): return [args, run_opts] -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) - - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
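# --- Illustrative sketch, not part of the patch ---
# The job -> archive / frame-shift bookkeeping computed in the TrainNewModels()
# code shown in this hunk: each parallel job on each iteration reads a different
# egs archive, and cycles through frame shifts so that, with
# frame_subsampling_factor > 1, every frame offset of the data eventually
# gets used.  The example values below are illustrative only.

def job_indices(num_archives_processed, job, num_archives, frame_subsampling_factor):
    # job is 1-based, as in the training loop; k is a zero-based global index.
    k = num_archives_processed + job - 1
    archive_index = (k % num_archives) + 1          # 1-based archive index
    frame_shift = (archive_index + k // num_archives) % frame_subsampling_factor
    return archive_index, frame_shift

if __name__ == '__main__':
    # e.g. 4 archives, frame_subsampling_factor 3, with 2 archives done so far:
    for job in (1, 2):
        print(job_indices(num_archives_processed=2, job=job,
                          num_archives=4, frame_subsampling_factor=3))
    # prints (3, 0) then (4, 1)
# --- end sketch ---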
- frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor - # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - if job == 1: - cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) - else: - cur_cache_io_opts = cache_io_opts - - process_handle = common_train_lib.RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-chain-train {parallel_train_opts} \ - --apply-deriv-weights={app_deriv_wts} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - deriv_time_opts = deriv_time_opts, - trunc_deriv = truncate_deriv_weights, - app_deriv_wts = apply_deriv_weights, - fr_shft = frame_shift, l2 = l2_regularize, - xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, - parallel_train_opts = run_opts.parallel_train_opts, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - cache_io_opts = cur_cache_io_opts, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - if stderr_value.strip() != '': - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, truncate_deriv_weights, - run_opts): - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) - - if iter > 0: - chain_lib.ComputeProgress(dir, iter, run_opts) - - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - cache_io_opts = "" - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) - - [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. 
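# --- Illustrative sketch, not part of the patch ---
# The per-iteration settings chosen in the code just above: on iteration 0, and
# on iterations where a new layer has just been added, the parallel jobs' outputs
# are not averaged (the single best job is kept instead), and the minibatch size
# and max-param-change are reduced to keep the update stable.

import math

def per_iteration_settings(iter, num_hidden_layers, add_layers_period,
                           num_chunk_per_minibatch, max_param_change):
    adding_layer = (iter > 0
                    and iter <= (num_hidden_layers - 1) * add_layers_period
                    and iter % add_layers_period == 0)
    do_average = not (iter == 0 or adding_layer)
    if do_average:
        return do_average, num_chunk_per_minibatch, max_param_change
    return (do_average, num_chunk_per_minibatch // 2,
            float(max_param_change) / math.sqrt(2))

if __name__ == '__main__':
    print(per_iteration_settings(0, 4, 2, 128, 2.0))   # (False, 64, 1.414...)
    print(per_iteration_settings(4, 4, 2, 128, 2.0))   # (False, 64, 1.414...), layer just added
    print(per_iteration_settings(5, 4, 2, 128, 2.0))   # (True, 128, 2.0)
# --- end sketch ---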
- common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - if os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - -def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): - for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), - '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: - if not os.path.isfile(file): - raise Exception('Expected {0} to exist.'.format(file)) - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -529,7 +220,21 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [model_left_context, model_right_context, num_hidden_layers] = common_train_lib.ParseModelConfigVarsFile(var_file) + variables = common_train_lib.ParseGenericConfigVarsFile(var_file) + + # Set some variables. + + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -550,36 +255,36 @@ def Train(args, run_opts): """.format(command = run_opts.command, dir = args.dir)) - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context - default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") # this is where get_egs.sh is called. 
chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, - left_context + args.frame_subsampling_factor/2, - right_context + args.frame_subsampling_factor/2, - run_opts, - left_tolerance = args.left_tolerance, - right_tolerance = args.right_tolerance, - frame_subsampling_factor = args.frame_subsampling_factor, - alignment_subsampling_factor = args.alignment_subsampling_factor, - frames_per_eg = args.chunk_width, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - frames_per_iter = args.frames_per_iter, - srand = args.srand, - transform_dir = args.transform_dir, - stage = args.egs_stage) + left_context + args.frame_subsampling_factor/2, + right_context + args.frame_subsampling_factor/2, + run_opts, + left_tolerance = args.left_tolerance, + right_tolerance = args.right_tolerance, + frame_subsampling_factor = args.frame_subsampling_factor, + alignment_subsampling_factor = args.alignment_subsampling_factor, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + frames_per_iter = args.frames_per_iter, + srand = args.srand, + transform_dir = args.transform_dir, + stage = args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, + frames_per_eg, num_archives] = ( + common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, + left_context, right_context) ) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -612,16 +317,20 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = common_train_lib.VerifyIterations(num_iters, args.num_epochs, - num_hidden_layers, num_archives_expanded, - args.max_models_combine, args.add_layers_period, - args.num_jobs_final) - - learning_rate = lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + num_iters_combine = common_train_lib.VerifyIterations( + num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + common_train_lib.GetLearningRate( + iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + ) logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -631,29 +340,43 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if args.shrink_value != 1.0: - model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) - shrinkage_value = args.shrink_value if common_train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 - else: - shrinkage_value = args.shrink_value + model_file = "{dir}/{iter}.mdl".format(dir 
= args.dir, iter = iter) + shrinkage_value = (args.shrink_value + if common_train_lib.DoShrinkage(iter, model_file, + args.shrink_nonlinearity, + args.shrink_threshold) + else 1 + ) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value, - args.num_chunk_per_minibatch, - num_hidden_layers, args.add_layers_period, - args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, - args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, - args.frame_subsampling_factor, - args.truncate_deriv_weights, run_opts) + TrainOneIteration(dir = args.dir, iter = iter, srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processsed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + apply_deriv_weights = args.apply_deriv_weights, + left_deriv_truncate = args.left_deriv_truncate, + right_deriv_truncate = args.right_deriv_truncate, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + frame_subsampling_factor = args.frame_subsampling_factor, + truncate_deriv_weight = args.truncate_deriv_weights, + run_opts = run_opts) + if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions - common_train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval) + common_train_lib.RemoveModel( + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -669,9 +392,9 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts) + args.num_chunk_per_minibatch, egs_dir, + args.leaky_hmm_coefficient, args.l2_regularize, + args.xent_regularize, run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -682,8 +405,8 @@ def Train(args, run_opts): remove_egs = False common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs) + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") @@ -703,7 +426,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + common_train_lib.SendMail(message, 
message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py index b2a0578772d..fbe89d3e80a 100644 --- a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py @@ -24,6 +24,201 @@ handler.setFormatter(formatter) logger.addHandler(handler) +def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = common_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + 
processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. 
+ cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts) + + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) def GetNumberOfLeaves(dir): [stdout, stderr] = common_train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) parts = stdout.split() @@ -100,7 +295,7 @@ def GenerateChainEgs(dir, data, lat_dir, egs_dir, data = data, lat_dir = lat_dir, dir = dir, egs_dir = egs_dir, egs_opts = egs_opts if egs_opts is not None else '' )) -# this function is exactly similar to the version in nnet3_common_train_lib.py +# this function is exactly similar to the version in libs/train_lib.py # except it uses egs files in place of cegs files def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs = None, rand_prune = 4.0, @@ -249,3 +444,4 @@ def ComputeProgress(dir, iter, run_opts, wait=False): iter = iter, model = model, prev_model = prev_model), wait = wait) + diff --git a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py index 7c69a31f425..f77f422efed 100644 --- a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py @@ -29,6 +29,155 @@ def __init__(self): self.prior_queue_opt = None self.parallel_train_opts = None +def AddCommonTrainArgs(parser): + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = common_train_lib.NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = common_train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = common_train_lib.NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = common_train_lib.NullstrToNoneAction, + help="""Directory with egs. 
If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = common_train_lib.NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = common_train_lib.NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = common_train_lib.NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = common_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = common_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = common_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = common_train_lib.NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + def SendMail(message, subject, email_id): try: subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( @@ -496,7 +645,7 @@ def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, +def DoShrinkage(iter, model_file, non_linearity, shrink_threshold, get_raw_nnet_from_am = True): if iter == 0: @@ -504,9 +653,9 @@ def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, try: if get_raw_nnet_from_am: - output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) else: - output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) + output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) output = output.strip().split("\n") # eg. 
# component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index ff77ed93ec8..fd4491c9f45 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -22,142 +22,6 @@ handler.setFormatter(formatter) logger.addHandler(handler) -def AddCommonTrainArgs(parser): - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = common_train_lib.NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. 
It's not a huge deal though, as samples - are anyway randomized right at the start. - (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. - Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") - # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = common_train_lib.NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. 
queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = common_train_lib.NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = common_train_lib.NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") - # this is the main method which differs between RNN and DNN training def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 73ddeb4b6ad..fabf9a0131e 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -9,6 +9,7 @@ # this script is based on steps/nnet3/tdnn/train.sh +import os import subprocess import argparse import sys @@ -16,10 +17,11 @@ import logging import imp import traceback +import shutil +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -40,8 +42,9 @@ def GetArgs(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler = 'resolve') - train_lib.AddCommonTrainArgs(parser) + common_train_lib.AddCommonTrainArgs(parser) + # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") @@ -65,6 +68,7 @@ def GetArgs(): help="Directory to store the models and all other files.") print(' '.join(sys.argv)) + print(sys.argv) args = parser.parse_args() @@ -142,12 +146,16 @@ def Train(args, run_opts): # Set some variables. 
try: - left_context = variables['model_left_context'] - right_context = variables['model_right_context'] + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] num_hidden_layers = variables['num_hidden_layers'] except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -198,10 +206,9 @@ def Train(args, run_opts): if (args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - common_train_lib.ComputePreconditioningMatrix( - args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) if (args.stage <= -2): logger.info("Computing initial vector for FixedScaleComponent before" @@ -233,11 +240,13 @@ def Train(args, run_opts): args.num_jobs_final) learning_rate = (lambda iter, current_num_jobs, num_archives_processed: - GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate)) + common_train_lib.GetLearningRate( + iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + ) logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 269d8e545ae..41b223badd1 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -9,6 +9,7 @@ # this script is based on steps/nnet3/tdnn/train_raw_nnet.sh +import os import subprocess import argparse import sys @@ -17,9 +18,9 @@ import imp import traceback +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -41,8 +42,9 @@ def GetArgs(): formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler = 'resolve') - train_lib.AddCommonTrainArgs(parser) + common_train_lib.AddCommonTrainArgs(parser) + # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") @@ -66,6 +68,7 @@ def GetArgs(): help="Directory to store the models and all other files.") print(' '.join(sys.argv)) + print(sys.argv) args = parser.parse_args() @@ -133,26 +136,23 @@ def Train(args, run_opts): # Set some variables. 
try: - left_context = variables['model_left_context'] - right_context = variables['model_right_context'] + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] num_hidden_layers = variables['num_hidden_layers'] - num_targets = int(variables['num_targets']) add_lda = common_train_lib.StrToBool(variables['add_lda']) include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) - objective_type = variables['objective_type'] except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. - if args.use_dense_targets: - if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: - raise Exception("Mismatch between num-targets provided to " - "script vs configs") - if (args.stage <= -5): logger.info("Initializing a basic network for estimating preconditioning matrix") common_train_lib.RunKaldiCommand(""" @@ -162,15 +162,23 @@ def Train(args, run_opts): dir = args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) - - if args.use_dense_targets: - target_type = "dense" - else: - target_type = "sparse" - if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") + if args.use_dense_targets: + target_type = "dense" + try: + num_targets = int(variables['num_targets']) + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: + raise Exception("Mismatch between num-targets provided to " + "script vs configs") + else: + target_type = "sparse" + + train_lib.GenerateEgsUsingTargets( args.feat_dir, args.targets_scp, default_egs_dir, left_context, right_context, @@ -207,10 +215,9 @@ def Train(args, run_opts): if (add_lda and args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - common_train_lib.ComputePreconditioningMatrix( - args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) if (args.stage <= -1): @@ -233,11 +240,13 @@ def Train(args, run_opts): args.num_jobs_final) learning_rate = (lambda iter, current_num_jobs, num_archives_processed: - GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate)) + common_train_lib.GetLearningRate( + iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + ) logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -273,8 +282,9 @@ def Train(args, run_opts): if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, 
get_raw_nnet_from_am = False) + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval, + get_raw_nnet_from_am = False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 44486f907da..c6a9b6a4e03 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -1,11 +1,14 @@ #!/usr/bin/env python + # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. + # this script is based on steps/nnet3/lstm/train.sh +import os import subprocess import argparse import sys @@ -14,10 +17,10 @@ import imp import traceback +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -59,10 +62,7 @@ def GetArgs(): default = 40, help="""Number of left steps used in the estimation of LSTM state before prediction of the first label""") - parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', - default = 0, - help="""Number of right steps used in the estimation of BLSTM - state before prediction of the first label""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -112,6 +112,7 @@ def GetArgs(): help="Directory to store the models and all other files.") print(' '.join(sys.argv)) + print(sys.argv) args = parser.parse_args() @@ -125,10 +126,10 @@ def ProcessArgs(args): raise Exception("--egs.chunk-width should have a minimum value of 1") if args.chunk_left_context < 0: - raise Exception("--egs.chunk-left-context should be positive") + raise Exception("--egs.chunk-left-context should be non-negative") if args.chunk_right_context < 0: - raise Exception("--egs.chunk-right-context should be positive") + raise Exception("--egs.chunk-right-context should be non-negative") if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs @@ -188,10 +189,8 @@ def Train(args, run_opts): model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] num_hidden_layers = variables['num_hidden_layers'] - num_targets = int(variables['num_targets']) add_lda = common_train_lib.StrToBool(variables['add_lda']) include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) - objective_type = variables['objective_type'] except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( str(e), '{0}/configs'.format(args.dir))) @@ -204,11 +203,6 @@ def Train(args, run_opts): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. 
- if args.use_dense_targets: - if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: - raise Exception("Mismatch between num-targets provided to " - "script vs configs") - if (args.stage <= -4): logger.info("Initializing a basic network") common_train_lib.RunKaldiCommand(""" @@ -218,15 +212,22 @@ def Train(args, run_opts): dir = args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) - - if args.use_dense_targets: - target_type = "dense" - else: - target_type = "sparse" - if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") + if args.use_dense_targets: + target_type = "dense" + try: + num_targets = int(variables['num_targets']) + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: + raise Exception("Mismatch between num-targets provided to " + "script vs configs") + else: + target_type = "sparse" + train_lib.GenerateEgsUsingTargets( args.feat_dir, args.targets_scp, default_egs_dir, left_context, right_context, @@ -264,10 +265,9 @@ def Train(args, run_opts): if (add_lda and args.stage <= -2): logger.info('Computing the preconditioning matrix for input features') - common_train_lib.ComputePreconditioningMatrix( - args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) if (args.stage <= -1): @@ -289,11 +289,14 @@ def Train(args, run_opts): args.num_jobs_final) learning_rate = (lambda iter, current_num_jobs, num_archives_processed: - GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate)) + common_train_lib.GetLearningRate( + iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + ) + if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width else: @@ -312,10 +315,12 @@ def Train(args, run_opts): if args.stage <= iter: model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) shrinkage_value = (args.shrink_value - if DoShrinkage(iter, model_file, "Lstm*", - "SigmoidComponent", args.shrink_threshold, - get_raw_nnet_from_am = False) - else 1) + if common_train_lib.DoShrinkage(iter, model_file, + "SigmoidComponent", + args.shrink_threshold, + get_raw_nnet_from_am = False) + else 1 + ) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) rnn_train_lib.TrainOneIteration( @@ -345,8 +350,9 @@ def Train(args, run_opts): if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, get_raw_nnet_from_am = False) + args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval, + get_raw_nnet_from_am = False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 524dd33413d..974fbdec3c7 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -8,7 +8,7 @@ 
# this script is based on steps/nnet3/lstm/train.sh - +import os import subprocess import argparse import sys @@ -16,11 +16,12 @@ import logging import imp import traceback +import shutil +common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -62,10 +63,7 @@ def GetArgs(): default = 40, help="""Number of left steps used in the estimation of LSTM state before prediction of the first label""") - parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', - default = 0, - help="""Number of right steps used in the estimation of BLSTM - state before prediction of the first label""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -111,6 +109,7 @@ def GetArgs(): help="Directory to store the models and all other files.") print(' '.join(sys.argv)) + print(sys.argv) args = parser.parse_args() @@ -124,10 +123,10 @@ def ProcessArgs(args): raise Exception("--egs.chunk-width should have a minimum value of 1") if args.chunk_left_context < 0: - raise Exception("--egs.chunk-left-context should be positive") + raise Exception("--egs.chunk-left-context should be non-negative") if args.chunk_right_context < 0: - raise Exception("--egs.chunk-right-context should be positive") + raise Exception("--egs.chunk-right-context should be non-negative") if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs @@ -209,7 +208,7 @@ def Train(args, run_opts): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. 
- if (args.stage <= -4): + if (args.stage <= -5): logger.info("Initializing a basic network for estimating preconditioning matrix") common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ @@ -218,7 +217,7 @@ def Train(args, run_opts): dir = args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) - if (args.stage <= -3) and args.egs_dir is None: + if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") train_lib.GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, @@ -252,13 +251,22 @@ def Train(args, run_opts): # use during decoding common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) - if (args.stage <= -2): + if (args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') + train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -2): + logger.info("Computing initial vector for FixedScaleComponent before" + " softmax, using priors^{prior_scale} and rescaling to" + " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + common_train_lib.ComputePresoftmaxPriorScale( - args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") @@ -279,11 +287,13 @@ def Train(args, run_opts): args.num_jobs_final) learning_rate = (lambda iter, current_num_jobs, num_archives_processed: - GetLearningRate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate)) + common_train_lib.GetLearningRate( + iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + ) if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -306,7 +316,8 @@ def Train(args, run_opts): if common_train_lib.DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) - else 1) + else 1 + ) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) rnn_train_lib.TrainOneIteration( From 7ee150cbb654a7255d1670c08d63216c3aac6b32 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 25 Oct 2016 10:38:23 -0400 Subject: [PATCH 15/71] raw_python_script: Removing rnn_train_lib.py --- egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py | 213 ------------------- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 51 +++-- egs/wsj/s5/steps/nnet3/train_dnn.py | 38 ++-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 49 +++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 44 ++-- 6 files changed, 101 insertions(+), 296 deletions(-) delete mode 100644 egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py deleted file mode 100644 index eaa10ac381b..00000000000 --- a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar -# Apache 2.0. 
- -# This is a module with methods which will be used by scripts for training of -# recurrent neural network acoustic model and raw model (i.e., generic neural -# network without transition model) with frame-level objectives. - -import logging -import imp - -common_train_lib = imp.load_source('ntl', 'steps/nnet3/common_train_lib.py') - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') -handler.setFormatter(formatter) -logger.addHandler(handler) - - -# this is the main method which differs between RNN and DNN training -def TrainNewModels(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - cache_read_opt, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - - cache_write_opt = "" - if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. 
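The index arithmetic a few lines above round-robins the parallel jobs over the egs archives, so every archive is eventually visited by some job. A small illustration with hypothetical counts (not part of the patch):

num_jobs = 3
num_archives = 10

def archives_for_iteration(num_archives_processed):
    # k is the zero-based work-item index; the 1-based archive index simply
    # wraps around the available archives.
    return [((num_archives_processed + job - 1) % num_archives) + 1
            for job in range(1, num_jobs + 1)]

print(archives_for_iteration(0))    # [1, 2, 3]
print(archives_for_iteration(9))    # [10, 1, 2]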
- cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = common_train_lib.RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ - "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, - momentum, max_param_change, shuffle_buffer_size, - cv_minibatch_size, run_opts, - get_raw_nnet_from_am = True): - - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - # Sets off some background jobs to compute train and - # validation set objectives - common_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, - mb_size=cv_minibatch_size, - get_raw_nnet_from_am = get_raw_nnet_from_am) - - if iter > 0: - # Runs in the background - common_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, - mb_size=cv_minibatch_size, - get_raw_nnet_from_am = get_raw_nnet_from_am) - - # an option for writing cache (storing pairs of nnet-computations - # and computation-requests) during training. 
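The condition just below decides whether this is an iteration on which a new hidden layer is spliced in (discriminative layer-wise training); on such iterations averaging is skipped and the best single job is taken instead. A minimal sketch of that schedule with hypothetical settings:

num_hidden_layers = 4
add_layers_period = 2

def adds_layer(iter):
    # A layer is added every add_layers_period iterations until all
    # num_hidden_layers hidden layers are in place.
    return (iter > 0
            and iter <= (num_hidden_layers - 1) * add_layers_period
            and iter % add_layers_period == 0)

print([i for i in range(10) if adds_layer(i)])   # [2, 4, 6]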
- cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just added new hiden layer, don't do - # averaging but take the best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - if get_raw_nnet_from_am: - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - if get_raw_nnet_from_am: - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - else: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) - [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - common_train_lib.GetAverageNnetModel( - dir = dir, iter = iter, - nnets_list = " ".join(nnets_list), - run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am, - shrink = shrinkage_value) - - else: - # choose the best model from different jobs - common_train_lib.GetBestNnetModel( - dir = dir, iter = iter, - best_model_index = best_model, - run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am, - shrink = shrinkage_value) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - if get_raw_nnet_from_am: - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - else: - new_model = "{0}/{1}.raw".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) - if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - - diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index fd4491c9f45..d563bdbca08 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -22,20 +22,21 @@ handler.setFormatter(formatter) logger.addHandler(handler) -# this is the main method which differs between RNN and DNN training def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, + raw_model_string, egs_dir, left_context, right_context, momentum, max_param_change, - shuffle_buffer_size, minibatch_size, - cache_read_opt, run_opts): + shuffle_buffer_size, minibatch_size, frames_per_eg, + cache_read_opt, run_opts, min_deriv_time = None): # We cannot easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. # this is no longer true for RNNs as we use do not use the --frame option # but we use the same script for consistency with FF-DNN code + chunk_level_training = False if frames_per_eg > 0 else True + context_opts="--left-context={0} --right-context={1}".format( left_context, right_context) processes = [] @@ -43,7 +44,9 @@ def TrainNewModels(dir, iter, srand, num_jobs, k = num_archives_processed + job - 1 # k is a zero-based index that we will derive # the other indexes from. archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - frame = (k / num_archives) % frames_per_eg + + if not chunk_level_training: + frame = (k / num_archives) % frames_per_eg cache_write_opt = "" if job == 1: @@ -56,16 +59,21 @@ def TrainNewModels(dir, iter, srand, num_jobs, nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ - "{raw_model}" \ - "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {optimization_opts} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {frame_opts} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, train_queue_opt = run_opts.train_queue_opt, dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, parallel_train_opts = run_opts.parallel_train_opts, cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - frame = frame, + frame_opts = "" + if chunk_level_training + else "--frame={0}".format(frame), momentum = momentum, max_param_change = max_param_change, + optimization_opts = "--optimization.min-deriv-time={0}".format(min_deriv_time) + if min_deriv_time is not None + else "", raw_model = raw_model_string, context_opts = context_opts, egs_dir = egs_dir, archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, @@ -89,10 
+97,12 @@ def TrainNewModels(dir, iter, srand, num_jobs, def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, minibatch_size, - frames_per_eg, num_hidden_layers, add_layers_period, + num_hidden_layers, add_layers_period, left_context, right_context, momentum, max_param_change, shuffle_buffer_size, run_opts, + cv_minibatch_size = 256, frames_per_eg = -1, + min_deriv_time = None, shrinkage_value = 1.0, get_raw_nnet_from_am = True): @@ -116,11 +126,13 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size = cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) if iter > 0: # Runs in the background train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size = cv_minibatch_size, get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations @@ -164,12 +176,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: pass - train_lib.TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - cache_read_opt, run_opts) + TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, frames_per_eg, + cache_read_opt, run_opts, min_deriv_time = min_deriv_time) + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: @@ -181,14 +195,17 @@ def TrainOneIteration(dir, iter, srand, egs_dir, dir = dir, iter = iter, nnets_list = " ".join(nnets_list), run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am) + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) + else: # choose the best model from different jobs common_train_lib.GetBestNnetModel( dir = dir, iter = iter, best_model_index = best_model, run_opts = run_opts, - get_raw_nnet_from_am = get_raw_nnet_from_am) + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index fabf9a0131e..9130441133f 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -260,24 +260,26 @@ def Train(args, run_opts): logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - train_lib.TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - minibatch_size = args.minibatch_size, - frames_per_eg = args.frames_per_eg, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - run_opts = run_opts) + train_lib.TrainOneIteration( + dir = 
args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts) + if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions common_train_lib.RemoveModel( diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 41b223badd1..4d223c56c2b 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -154,7 +154,7 @@ def Train(args, run_opts): # transform. if (args.stage <= -5): - logger.info("Initializing a basic network for estimating preconditioning matrix") + logger.info("Initializing a basic network") common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index c6a9b6a4e03..928bde40711 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -8,6 +8,7 @@ # this script is based on steps/nnet3/lstm/train.sh + import os import subprocess import argparse @@ -17,9 +18,8 @@ import imp import traceback -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) @@ -323,29 +323,28 @@ def Train(args, run_opts): ) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - rnn_train_lib.TrainOneIteration( - dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts, - compute_accuracy = compute_accuracy, - get_raw_nnet_from_am = False) + train_lib.TrainOneIteration( + dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = 
shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = args.cv_minibatch_size, + run_opts = run_opts, + get_raw_nnet_from_am = False) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 974fbdec3c7..9070086df0e 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -8,6 +8,7 @@ # this script is based on steps/nnet3/lstm/train.sh + import os import subprocess import argparse @@ -20,7 +21,6 @@ common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) @@ -320,27 +320,27 @@ def Train(args, run_opts): ) logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - rnn_train_lib.TrainOneIteration( - dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts) + train_lib.TrainOneIteration( + dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + minibatch_size = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = args.cv_minibatch_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions From b8c74b4e2201973188e03a200ac1f838f040adb2 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 25 Oct 2016 10:54:25 -0400 Subject: [PATCH 16/71] raw_python_script: Minor changes --- .../s5/steps/nnet3/libs/chain_train_lib.py | 41 ++++++++++--------- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 26 +++++++----- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py 
b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py index fbe89d3e80a..8eded225a10 100644 --- a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py @@ -4,17 +4,16 @@ # 2016 Vimal Manohar # Apache 2.0. +# This is a module with methods which will be used by scripts for training of +# deep neural network acoustic model with chain objective. -import subprocess import logging import math -import re -import time import imp import os import sys -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -24,7 +23,8 @@ handler.setFormatter(formatter) logger.addHandler(handler) -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, raw_model_string, egs_dir, apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, @@ -33,11 +33,11 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi shuffle_buffer_size, num_chunk_per_minibatch, frame_subsampling_factor, truncate_deriv_weights, cache_io_opts, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code deriv_time_opts="" if left_deriv_truncate is not None: @@ -149,16 +149,16 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - cur_max_param_change = max_param_change + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. 
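The two assignments that follow implement the comment above: when model averaging cannot be relied on, the minibatch is halved and the per-minibatch parameter-change cap is tightened by sqrt(2), roughly matching the increase in gradient noise from the smaller minibatch. A standalone sketch of the same adjustment:

import math

def stabilized_settings(num_chunk_per_minibatch, max_param_change, do_average):
    if do_average:
        return num_chunk_per_minibatch, max_param_change
    # Iteration 0, or a layer was just added: halve the minibatch and scale
    # the parameter-change cap by 1/sqrt(2).
    return num_chunk_per_minibatch // 2, float(max_param_change) / math.sqrt(2)

print(stabilized_settings(512, 2.0, do_average=False))   # (256, 1.414...)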
+ cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, @@ -173,7 +173,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. @@ -219,6 +219,7 @@ def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: if not os.path.isfile(file): raise Exception('Expected {0} to exist.'.format(file)) + def GetNumberOfLeaves(dir): [stdout, stderr] = common_train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) parts = stdout.split() diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index d563bdbca08..603e43000db 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python + # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar @@ -11,8 +11,9 @@ import logging import math import imp +import os -common_train_lib = imp.load_source('ntl', 'steps/nnet3/common_train_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -27,8 +28,9 @@ def TrainNewModels(dir, iter, srand, num_jobs, raw_model_string, egs_dir, left_context, right_context, momentum, max_param_change, - shuffle_buffer_size, minibatch_size, frames_per_eg, - cache_read_opt, run_opts, min_deriv_time = None): + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts, + frames_per_eg = -1, min_deriv_time = None): # We cannot easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. 
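The code just below distinguishes frame-level (DNN) egs, where frames_per_eg > 0 and each job trains on one frame index per example, from chunk-level (RNN) egs, where no --frame option is passed. A hypothetical illustration of the frame-index selection:

num_archives = 10
frames_per_eg = 8

def frame_for(num_archives_processed, job):
    k = num_archives_processed + job - 1        # zero-based work-item index
    return (k // num_archives) % frames_per_eg  # cycles through 0..frames_per_eg-1

print([frame_for(0, job) for job in range(1, 4)])    # [0, 0, 0]
print([frame_for(25, job) for job in range(1, 4)])   # [2, 2, 2]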
@@ -36,6 +38,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, # but we use the same script for consistency with FF-DNN code chunk_level_training = False if frames_per_eg > 0 else True + deriv_time_opts = ("" + if min_deriv_time is None + else "--optimization.min-deriv-time={0}".format(min_deriv_time) + ) context_opts="--left-context={0} --right-context={1}".format( left_context, right_context) processes = [] @@ -59,7 +65,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ - {optimization_opts} "{raw_model}" \ + {deriv_time_opts} "{raw_model}" \ "ark,bg:nnet3-copy-egs {frame_opts} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, @@ -71,9 +77,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, if chunk_level_training else "--frame={0}".format(frame), momentum = momentum, max_param_change = max_param_change, - optimization_opts = "--optimization.min-deriv-time={0}".format(min_deriv_time) - if min_deriv_time is not None - else "", + deriv_time_opts = deriv_time_opts, raw_model = raw_model_string, context_opts = context_opts, egs_dir = egs_dir, archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, @@ -181,8 +185,10 @@ def TrainOneIteration(dir, iter, srand, egs_dir, raw_model_string, egs_dir, left_context, right_context, momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, frames_per_eg, - cache_read_opt, run_opts, min_deriv_time = min_deriv_time) + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts, + frames_per_eg = frames_per_eg, + min_deriv_time = min_deriv_time) [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] From 18e666416dfae066166fe4f1c67b02416ba9f4b3 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 25 Oct 2016 16:52:25 -0400 Subject: [PATCH 17/71] raw_python_script: Minor fixes --- egs/wsj/s5/steps/nnet3/chain/train.py | 202 +++++++++++++++++- .../s5/steps/nnet3/libs/chain_train_lib.py | 197 ----------------- .../s5/steps/nnet3/libs/common_train_lib.py | 4 +- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 2 +- 4 files changed, 203 insertions(+), 202 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index f00beaeed8b..a2b15e9a342 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -131,8 +131,8 @@ def GetArgs(): " shrink-nonlinearity type") # RNN specific trainer options - parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', - default=100, + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=512, help="Number of sequences to be processed in parallel every minibatch" ) # General options @@ -196,6 +196,204 @@ def ProcessArgs(args): return [args, run_opts] +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum,
max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = common_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, 
left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts) + + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
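The nnet3-average command that follows combines the parallel jobs' models; conceptually this is an element-wise average of their parameters, with the shrinkage factor applied on top via --scale. A toy numpy stand-in (not Kaldi code) for what that averaging amounts to:

import numpy as np

# Two hypothetical jobs' copies of one parameter matrix.
job_params = [np.array([[0.9, 1.1], [2.0, 2.2]]),
              np.array([[1.1, 0.9], [1.8, 2.0]])]

averaged = sum(job_params) / len(job_params)   # element-wise mean
shrunk = 0.99 * averaged                       # shrinkage via the --scale factor
print(shrunk)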
+ common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) + + # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) diff --git a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py index 8eded225a10..e25a5930d4b 100644 --- a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py @@ -23,203 +23,6 @@ handler.setFormatter(formatter) logger.addHandler(handler) -def TrainNewModels(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) - - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. 
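The deleted lines just below (identical to the copy added to chain/train.py above) pick a per-job frame shift so that, with frame subsampling, successive passes over an archive see the egs at different frame offsets. A hypothetical illustration:

num_archives = 10
frame_subsampling_factor = 3

def frame_shift_for(num_archives_processed, job):
    k = num_archives_processed + job - 1
    archive_index = (k % num_archives) + 1
    return (archive_index + k // num_archives) % frame_subsampling_factor

# Job 1 revisits archive 1 on successive passes with shifts 1, 2, 0.
print([frame_shift_for(p, 1) for p in (0, 10, 20)])   # [1, 2, 0]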
- archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor - # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - if job == 1: - cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) - else: - cur_cache_io_opts = cache_io_opts - - process_handle = common_train_lib.RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-chain-train {parallel_train_opts} \ - --apply-deriv-weights={app_deriv_wts} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - deriv_time_opts = deriv_time_opts, - trunc_deriv = truncate_deriv_weights, - app_deriv_wts = apply_deriv_weights, - fr_shft = frame_shift, l2 = l2_regularize, - xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, - parallel_train_opts = run_opts.parallel_train_opts, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - cache_io_opts = cur_cache_io_opts, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - if stderr_value.strip() != '': - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, truncate_deriv_weights, - run_opts): - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) - - if iter > 0: - chain_lib.ComputeProgress(dir, iter, run_opts) - - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - cache_io_opts = "" - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) - - [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. 
- common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - if os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - -def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): - for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), - '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: - if not os.path.isfile(file): - raise Exception('Expected {0} to exist.'.format(file)) - def GetNumberOfLeaves(dir): [stdout, stderr] = common_train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) parts = stdout.split() diff --git a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py index f77f422efed..0c57c6c3936 100644 --- a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py @@ -753,8 +753,8 @@ def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, raise err def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, - preserve_model_interval = 100, - get_raw_nnet_from_am = True): + preserve_model_interval = 100, + get_raw_nnet_from_am = True): if iter % preserve_model_interval == 0: return if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 71a73d65067..10861170a1a 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -54,7 +54,7 @@ def GetArgs(): help="Output dimension of the linear layer at the CNN output " "for dimension reduction, e.g. 256." "The default zero means this layer is not needed.", default=0) - parser.add_argument("--cnn.cepstral-lifter", type=float, + parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", help="The factor used for determining the liftering vector in the production of MFCC. " "User has to ensure that it matches the lifter used in MFCC generation, " "e.g. 
22.0", default=22.0) From 29650bd5661b7ea4e4b891fb740bb85931b362bb Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 25 Oct 2016 16:53:20 -0400 Subject: [PATCH 18/71] raw_python_script: Removing make_configs changes from this PR --- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 74 ++++++--------------- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 50 ++++---------- 2 files changed, 32 insertions(+), 92 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 260def1d03f..53739f0f9ce 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -50,18 +50,6 @@ def GetArgs(): default=0.0) parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) - parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, - help="If \"true\" an LDA matrix computed from the input features " - "(spliced according to the first set of splice-indexes) will be used as " - "the first Affine layer. This affine layer's parameters are fixed during training. " - "This variable needs to be set to \"false\" when using dense-targets.", - default=True, choices = ["false", "true"]) - parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, - help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", - choices=['true', 'false'], default = False) - parser.add_argument("--objective-type", type=str, default="linear", - choices = ["linear", "quadratic"], - help = "the type of objective; i.e. quadratic or linear") # LSTM options parser.add_argument("--num-lstm-layers", type=int, @@ -210,9 +198,7 @@ def ParseLstmDelayString(lstm_delay): raise ValueError("invalid --lstm-delay argument, too-short element: " + lstm_delay) elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: - raise ValueError('Warning: ' + str(indexes) + - ' is not a standard BLSTM mode. ' + - 'There should be a negative delay for the forward, and a postive delay for the backward.') + raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. 
There should be a negative delay for the forward, and a postive delay for the backward.') if len(indexes) == 2 and indexes[0] > 0: # always a negative delay followed by a postive delay indexes[0], indexes[1] = indexes[1], indexes[0] lstm_delay_array.append(indexes) @@ -222,34 +208,28 @@ def ParseLstmDelayString(lstm_delay): return lstm_delay_array -def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, splice_indexes, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, - label_delay, include_log_softmax, add_final_sigmoid, - objective_type, xent_regularize, + label_delay, include_log_softmax, xent_regularize, self_repair_scale_nonlinearity, self_repair_scale_clipgradient): config_lines = {'components':[], 'component-nodes':[]} config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], - ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) + nodes.AddOutputLayer(init_config_lines, prev_layer_output) config_files[config_dir + '/init.config'] = init_config_lines - # add_lda needs to be set "false" when using dense targets, - # or if the task is not a simple classification task - # (e.g. 
regression, multi-task) - if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer @@ -258,23 +238,17 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i], - self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient) + lstm_delay = lstm_delay[i], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) else: # add a uni-directional LSTM layer prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, recurrent_projection_dim, non_recurrent_projection_dim, clipping_threshold, norm_based_clipping, ng_per_element_scale_options, ng_affine_options, - lstm_delay = lstm_delay[i][0], - self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient) + lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, - label_delay = label_delay, include_log_softmax = include_log_softmax, - add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) if xent_regularize != 0.0: @@ -291,9 +265,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, - label_delay = label_delay, include_log_softmax = include_log_softmax, - add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, @@ -321,30 +293,24 @@ def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layer if (num_hidden_layers < num_lstm_layers): raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - return [left_context, right_context, num_hidden_layers, splice_indexes] - - -def Main(): - args = GetArgs() - [left_context, right_context, - num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, - args.label_delay, args.num_lstm_layers) - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(args.config_dir + "/vars", "w") + f = open(config_dir + "/vars", "w") print('model_left_context=' + str(left_context), file=f) print('model_right_context=' + 
str(right_context), file=f) print('num_hidden_layers=' + str(num_hidden_layers), file=f) - print('num_targets=' + str(args.num_targets), file=f) - print('objective_type=' + str(args.objective_type), file=f) - print('add_lda=' + ("true" if args.add_lda else "false"), file=f) - print('include_log_softmax=' + ("true" if args.include_log_softmax else "false"), file=f) + # print('initial_right_context=' + str(splice_array[0][-1]), file=f) f.close() + return [left_context, right_context, num_hidden_layers, splice_indexes] + + +def Main(): + args = GetArgs() + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + MakeConfigs(config_dir = args.config_dir, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - add_lda = args.add_lda, splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = args.hidden_dim, @@ -358,8 +324,6 @@ def Main(): ng_affine_options = args.ng_affine_options, label_delay = args.label_delay, include_log_softmax = args.include_log_softmax, - add_final_sigmoid = args.add_final_sigmoid, - objective_type = args.objective_type, xent_regularize = args.xent_regularize, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 10861170a1a..bac260e93bc 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -69,7 +69,6 @@ def GetArgs(): help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. " - "This variable needs to be set to \"false\" when using dense-targets. " "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) @@ -106,26 +105,17 @@ def GetArgs(): relu_dim_group.add_argument("--relu-dim", type=int, help="dimension of all ReLU nonlinearity layers") relu_dim_group.add_argument("--relu-dim-final", type=int, - help="dimension of the last ReLU nonlinearity layer. " - "Dimensions increase geometrically from the first through the last ReLU layer.", - default=None) + help="dimension of the last ReLU nonlinearity layer. Dimensions increase geometrically from the first through the last ReLU layer.", default=None) parser.add_argument("--relu-dim-init", type=int, - help="dimension of the first ReLU nonlinearity layer. " - "Dimensions increase geometrically from the first through the last ReLU layer.", - default=None) + help="dimension of the first ReLU nonlinearity layer. 
Dimensions increase geometrically from the first through the last ReLU layer.", default=None) parser.add_argument("--self-repair-scale-nonlinearity", type=float, - help="A non-zero value activates the self-repair mechanism in the " - "sigmoid and tanh non-linearities of the LSTM", default=None) + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) - - # Options to convert input MFCC into Fbank features. This is useful when a - # LDA layer is not added (such as when using dense targets) - parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -236,8 +226,7 @@ def AddConvMaxpLayer(config_lines, name, input, args): # The ivectors are processed through an affine layer parallel to the CNN layers, # then concatenated with the CNN output and passed to the deeper part of the network. -def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, - config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): +def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): cnn_args = ParseCnnString(cnn_layer) num_cnn_layers = len(cnn_args) # We use an Idct layer here to convert MFCC to FBANK features @@ -246,8 +235,7 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, 'dimension': feat_dim} prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') - list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) - if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] splice_descriptor = "Append({0})".format(", ".join(list)) cnn_input_dim = len(splice_indexes) * feat_dim prev_layer_output = {'descriptor': splice_descriptor, @@ -259,15 +247,13 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) if cnn_bottleneck_dim > 0: - prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", - prev_layer_output, cnn_bottleneck_dim, "") + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") if ivector_dim > 0: iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', 'dimension': ivector_dim} iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") - prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], - iv_layer_output['descriptor']) + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] return prev_layer_output @@ -359,17 +345,14 @@ def MakeConfigs(config_dir, splice_indexes_string, if xent_separate_forward_affine: if splice_indexes[-1] != [0]: - raise Exception("--xent-separate-forward-affine option is supported only if the " + - "last-hidden layer has no splicing before it. 
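The splice descriptor assembled above follows a simple rule: every non-zero splice index becomes an Offset(...) term on the previous layer's descriptor, and the terms are joined inside a single Append(...). A minimal sketch of that construction (the helper function name is assumed, not part of the scripts):

    def make_splice_descriptor(input_descriptor, splice_indexes):
        # Each non-zero index n becomes Offset(<input>, n); index 0 keeps the
        # input descriptor unchanged.  The terms are then wrapped in Append().
        terms = [('Offset({0}, {1})'.format(input_descriptor, n) if n != 0
                  else input_descriptor) for n in splice_indexes]
        return "Append({0})".format(", ".join(terms))

    # make_splice_descriptor("input", [-2, -1, 0, 1, 2]) gives
    # "Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2))"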
" + - "Please use a splice-indexes with just 0 as the final splicing config.") + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) config_lines = {'components':[], 'component-nodes':[]} config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], - ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -379,16 +362,11 @@ def MakeConfigs(config_dir, splice_indexes_string, config_files[config_dir + '/init.config'] = init_config_lines if cnn_layer is not None: - prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, - cepstral_lifter, config_dir, + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) - # add_lda needs to be set "false" when using dense targets, - # or if the task is not a simple classification task - # (e.g. regression, multi-task) if add_lda: - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", - prev_layer_output, config_dir + '/lda.mat') + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') left_context = 0 right_context = 0 @@ -402,11 +380,9 @@ def MakeConfigs(config_dir, splice_indexes_string, raise Exception("num-hidden-layers has to be greater than 1 if relu-dim-init and relu-dim-final is different.") else: # computes relu-dim for each hidden layer. 
They increase geometrically across layers - factor = pow(float(nonlin_output_dim_final) / nonlin_output_dim_init, - 1.0 / (num_hidden_layers - 1)) if num_hidden_layers > 1 else 1 + factor = pow(float(nonlin_output_dim_final) / nonlin_output_dim_init, 1.0 / (num_hidden_layers - 1)) if num_hidden_layers > 1 else 1 nonlin_output_dims = [int(round(nonlin_output_dim_init * pow(factor, i))) for i in range(0, num_hidden_layers)] - assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and - nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error + assert(nonlin_output_dims[-1] >= nonlin_output_dim_final - 1 and nonlin_output_dims[-1] <= nonlin_output_dim_final + 1) # due to rounding error nonlin_output_dims[-1] = nonlin_output_dim_final # It ensures that the dim of the last hidden layer is exactly the same as what is specified for i in range(0, num_hidden_layers): From 40345dbc83395bd92dda19dbafbadd79286aa145 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 28 Oct 2016 06:55:46 -0400 Subject: [PATCH 19/71] raw_python_script: Some fixes based on simple tests --- egs/wsj/s5/steps/nnet3/chain/train.py | 3 + .../s5/steps/nnet3/libs/common_train_lib.py | 308 +++++++++--------- egs/wsj/s5/steps/nnet3/libs/train_lib.py | 5 +- .../steps/nnet3/report/nnet3_log_parse_lib.py | 2 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 21 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 22 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 24 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 17 +- 8 files changed, 205 insertions(+), 197 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index a2b15e9a342..5a6ce4f55a8 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -196,6 +196,9 @@ def ProcessArgs(args): return [args, run_opts] +# Called from TrainOneIteration, this model does one iteration of training +# with 'num_jobs' jobs, and +# writes files like exp/tdnn_a/24.{1,2,3,..}.raw def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, diff --git a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py index 0c57c6c3936..10a78ad5863 100644 --- a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py @@ -29,155 +29,6 @@ def __init__(self): self.prior_queue_opt = None self.parallel_train_opts = None -def AddCommonTrainArgs(parser): - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = common_train_lib.NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options - parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default = 0, - help="Number of additional frames of input to the left" - " of the input chunk. This extra context will be used" - " in the estimation of RNN state before prediction of" - " the first label. In the case of FF-DNN this extra" - " context will be used to allow for frame-shifts") - parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', - default = 0, - help="Number of additional frames of input to the right" - " of the input chunk. 
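The geometric interpolation of ReLU dimensions used in tdnn/make_configs.py above can be checked in isolation. A minimal sketch, with names assumed to mirror the script's nonlin_output_dim_init and nonlin_output_dim_final:

    def geometric_layer_dims(dim_init, dim_final, num_hidden_layers):
        # Interpolate layer dimensions geometrically between dim_init and
        # dim_final, rounding intermediate values to integers, then pin the
        # last layer exactly to dim_final.
        if num_hidden_layers > 1:
            factor = pow(float(dim_final) / dim_init,
                         1.0 / (num_hidden_layers - 1))
        else:
            factor = 1.0
        dims = [int(round(dim_init * pow(factor, i)))
                for i in range(num_hidden_layers)]
        dims[-1] = dim_final
        return dims

    # geometric_layer_dims(256, 1024, 5) returns [256, 362, 512, 724, 1024]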
This extra context will be used" - " in the estimation of bidirectional RNN state before" - " prediction of the first label.") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = common_train_lib.NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. It's not a huge deal though, as samples - are anyway randomized right at the start. 
- (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. - Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") - # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = common_train_lib.NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. 
queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--egs.cmd", type=str, action = common_train_lib.NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = common_train_lib.StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = common_train_lib.NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") - def SendMail(message, subject, email_id): try: subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( @@ -188,10 +39,10 @@ def SendMail(message, subject, email_id): logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) pass -def StrToBool(values): - if values == "true": +def StrToBool(value): + if value == "true": return True - elif values == "false": + elif value == "false": return False else: raise ValueError @@ -802,3 +653,156 @@ def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): idct_matrix[k].append(0) WriteKaldiMatrix(file_path, idct_matrix) +## This argument parser adds common options related to nnet3 training +## such as egs creation, training optimization options. +## These are used in the nnet3 train scripts +## in steps/nnet3/train*.py and steps/nnet3/chain/train.py +common_parser = argparse.ArgumentParser(add_help=False) +# feat options +common_parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") +common_parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + +# egs extraction options +common_parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. 
In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") +common_parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") +common_parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") +common_parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") +common_parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") +common_parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + +# trainer options +common_parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") +common_parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") +common_parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") +common_parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") +common_parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") +common_parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
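The bounded-buffer randomization that --trainer.shuffle-buffer-size controls happens on the Kaldi side during example shuffling; the sketch below only illustrates the trade-off the help text describes (buffer size versus randomization quality) and is not the actual implementation:

    import random

    def shuffle_buffer(stream, buffer_size, rng=random):
        # Keep at most buffer_size items; once the buffer is full, emit a
        # randomly chosen item for every new item read.  A larger buffer
        # gives more complete randomization at the cost of memory.
        buf = []
        for item in stream:
            buf.append(item)
            if len(buf) >= buffer_size:
                idx = rng.randrange(len(buf))
                buf[idx], buf[-1] = buf[-1], buf[idx]
                yield buf.pop()
        rng.shuffle(buf)
        for item in buf:
            yield item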
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") +common_parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") +common_parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") +common_parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") +common_parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") +common_parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + +# Parameters for the optimization +common_parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") +common_parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") +common_parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") +common_parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") +common_parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) +common_parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") +# General options +common_parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") +common_parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") +common_parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") +common_parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") +common_parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) +common_parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) +common_parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") +common_parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") + +common_parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) +common_parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py index 603e43000db..0aab71f221c 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -23,6 +23,9 @@ handler.setFormatter(formatter) logger.addHandler(handler) +# Called from TrainOneIteration, this model does one iteration of training +# with 'num_jobs' jobs, and +# writes files like exp/tdnn_a/24.{1,2,3,..}.raw def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, @@ -77,7 +80,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, if chunk_level_training else "--frame={0}".format(frame), momentum = momentum, max_param_change = max_param_change, - deriv_time_opts = deriv_time_opts + deriv_time_opts = deriv_time_opts, raw_model = raw_model_string, context_opts = context_opts, egs_dir = egs_dir, archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py index 7d014003102..dd76edc5b33 100755 --- a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py @@ -5,7 +5,7 @@ import sys, glob, re, math, datetime, argparse import imp -ntl = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +ntl = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') #exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], 
deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] def ParseProgressLogsForNonlinearityStats(exp_dir): diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 9130441133f..c6be8d1f6fc 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -19,9 +19,9 @@ import traceback import shutil -common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') -nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -40,9 +40,10 @@ def GetArgs(): DNNs include simple DNNs, TDNNs and CNNs. """, formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve') - - common_train_lib.AddCommonTrainArgs(parser) + conflict_handler = 'resolve', + parents=[common_train_lib.common_parser]) + # For common options defined in common_train_lib.common_parser, + # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', @@ -148,7 +149,7 @@ def Train(args, run_opts): try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] + num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( str(e), '{0}/configs'.format(args.dir))) @@ -239,7 +240,7 @@ def Train(args, run_opts): args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + LearningRate = (lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate( iter, current_num_jobs, num_iters, num_archives_processed, @@ -258,7 +259,7 @@ def Train(args, run_opts): if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) - logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + logger.info("On iteration {0}, learning rate is {1}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed))) train_lib.TrainOneIteration( dir = args.dir, @@ -268,7 +269,7 @@ def Train(args, run_opts): num_jobs = current_num_jobs, num_archives_processed = num_archives_processed, num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), minibatch_size = args.minibatch_size, frames_per_eg = args.frames_per_eg, num_hidden_layers = num_hidden_layers, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 4d223c56c2b..612ff386b89 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -18,9 +18,9 @@ import imp import traceback -common_train_lib = imp.load_source('ntl', 
'steps/nnet3/libs/common_train_lib.py') -nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -40,9 +40,10 @@ def GetArgs(): DNNs include simple DNNs, TDNNs and CNNs. """, formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve') - - common_train_lib.AddCommonTrainArgs(parser) + conflict_handler = 'resolve', + parents=[common_train_lib.common_parser]) + # For common options defined in common_train_lib.common_parser, + # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', @@ -56,7 +57,6 @@ def GetArgs(): # General options parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, default = True, choices = ["true", "false"], help="Train neural network using dense targets") @@ -138,7 +138,7 @@ def Train(args, run_opts): try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] + num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining add_lda = common_train_lib.StrToBool(variables['add_lda']) include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) except KeyError as e: @@ -239,7 +239,7 @@ def Train(args, run_opts): args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + LearningRate = (lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate( iter, current_num_jobs, num_iters, num_archives_processed, @@ -258,7 +258,7 @@ def Train(args, run_opts): if args.stage <= iter: model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) - logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + logger.info("On iteration {0}, learning rate is {1}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed))) train_lib.TrainOneIteration(dir = args.dir, iter = iter, @@ -267,7 +267,7 @@ def Train(args, run_opts): num_jobs = current_num_jobs, num_archives_processed = num_archives_processed, num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), minibatch_size = args.minibatch_size, frames_per_eg = args.frames_per_eg, num_hidden_layers = num_hidden_layers, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 928bde40711..8eac8f5fec2 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -18,9 +18,9 @@ import imp import traceback -common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') -nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = 
imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -47,9 +47,10 @@ def GetArgs(): 3. RNNs can also be trained with state preservation training """, formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve') - - train_lib.AddCommonTrainArgs(parser) + conflict_handler = 'resolve', + parents=[common_train_lib.common_parser]) + # For common options defined in common_train_lib.common_parser, + # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -87,8 +88,6 @@ def GetArgs(): default = 256, help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") - - # RNN specific trainer options parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=100, @@ -100,7 +99,6 @@ def GetArgs(): # General options parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, default = True, choices = ["true", "false"], help="Train neural network using dense targets") @@ -188,7 +186,7 @@ def Train(args, run_opts): try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] + num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining add_lda = common_train_lib.StrToBool(variables['add_lda']) include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) except KeyError as e: @@ -288,7 +286,7 @@ def Train(args, run_opts): args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + LearningRate = (lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate( iter, current_num_jobs, num_iters, num_archives_processed, @@ -321,7 +319,7 @@ def Train(args, run_opts): get_raw_nnet_from_am = False) else 1 ) - logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) train_lib.TrainOneIteration( dir = args.dir, @@ -331,7 +329,7 @@ def Train(args, run_opts): num_jobs = current_num_jobs, num_archives_processed = num_archives_processed, num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value = shrinkage_value, num_chunk_per_minibatch = args.num_chunk_per_minibatch, num_hidden_layers = num_hidden_layers, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 9070086df0e..794fac465bb 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -19,9 +19,9 @@ import traceback 
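These scripts are invoked from the experiment directory (for example egs/wsj/s5), so they load their helper libraries by relative path with imp.load_source rather than through the normal import mechanism. A minimal illustration of the pattern; the path is the one the scripts themselves use, and the module name passed here is arbitrary:

    import imp

    # Load a module object from an explicit file path.  The first argument is
    # the name the module is registered under in sys.modules; the scripts in
    # this patch pass either a short tag or an empty string.
    common_train_lib = imp.load_source(
        'common_train_lib', 'steps/nnet3/libs/common_train_lib.py')

    # The result behaves like an ordinarily imported module, e.g.
    # common_train_lib.StrToBool("true") returns True.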
import shutil -common_train_lib = imp.load_source('ntl', 'steps/nnet3/lib/common_train_lib.py') -nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') +common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -48,9 +48,10 @@ def GetArgs(): 3. RNNs can also be trained with state preservation training """, formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve') - - train_lib.AddCommonTrainArgs(parser) + conflict_handler = 'resolve', + parents=[common_train_lib.common_parser]) + # For common options defined in common_train_lib.common_parser, + # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -88,8 +89,6 @@ def GetArgs(): default = 256, help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") - - # RNN specific trainer options parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=100, @@ -195,7 +194,7 @@ def Train(args, run_opts): try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] + num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined in {1}".format( str(e), '{0}/configs'.format(args.dir))) From 0a2fce991436c91949b6866d209c1f0811aa3333 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 1 Nov 2016 04:12:36 -0400 Subject: [PATCH 20/71] raw_python_script: Fixing minor typos --- egs/wsj/s5/steps/nnet3/chain/train.py | 60 +++++++++++-------- .../s5/steps/nnet3/libs/chain_train_lib.py | 10 ---- .../s5/steps/nnet3/libs/common_train_lib.py | 12 +++- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 15 +++-- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 23 ++++--- 5 files changed, 65 insertions(+), 55 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 5a6ce4f55a8..f341eddfad3 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -38,9 +38,10 @@ def GetArgs(): Trains RNN and DNN acoustic models using the 'chain' objective function. 
""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve') - - common_train_lib.AddCommonTrainArgs(parser) + conflict_handler = 'resolve', + parents=[common_train_lib.common_parser]) + # For common options defined in common_train_lib.common_parser, + # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -336,15 +337,24 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 cur_max_param_change = float(max_param_change) / math.sqrt(2) - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) + TrainNewModels(dir = dir, iter = iter, srand = srand, num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + apply_deriv_weights = apply_deriv_weights, + left_deriv_truncate = left_deriv_truncate, + right_deriv_truncate = right_deriv_truncate, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + momentum = momentum, + max_param_change = cur_max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + frame_subsampling_factor = frame_subsampling_factor, + truncate_deriv_weights = truncate_deriv_weights, + cache_io_opts = cache_io_opts, run_opts = run_opts) [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] @@ -524,7 +534,7 @@ def Train(args, run_opts): args.max_models_combine, args.add_layers_period, args.num_jobs_final) - learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + LearningRate = (lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate( iter, current_num_jobs, num_iters, num_archives_processed, @@ -541,21 +551,23 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) - shrinkage_value = (args.shrink_value - if common_train_lib.DoShrinkage(iter, model_file, - args.shrink_nonlinearity, - args.shrink_threshold) - else 1 - ) - logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + shrinkage_value = 1.0 + if args.shrink_value != 1.0: + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = (args.shrink_value + if common_train_lib.DoShrinkage(iter, model_file, + args.shrink_nonlinearity, + args.shrink_threshold) + else 1 + ) + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) TrainOneIteration(dir = args.dir, iter = iter, srand = args.srand, egs_dir = egs_dir, num_jobs = current_num_jobs, - num_archives_processsed = num_archives_processed, + 
num_archives_processed = num_archives_processed, num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value = shrinkage_value, num_chunk_per_minibatch = args.num_chunk_per_minibatch, num_hidden_layers = num_hidden_layers, @@ -570,7 +582,7 @@ def Train(args, run_opts): max_param_change = args.max_param_change, shuffle_buffer_size = args.shuffle_buffer_size, frame_subsampling_factor = args.frame_subsampling_factor, - truncate_deriv_weight = args.truncate_deriv_weights, + truncate_deriv_weights = args.truncate_deriv_weights, run_opts = run_opts) if args.cleanup: diff --git a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py index e25a5930d4b..a88f595ab81 100644 --- a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py @@ -23,16 +23,6 @@ handler.setFormatter(formatter) logger.addHandler(handler) -def GetNumberOfLeaves(dir): - [stdout, stderr] = common_train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) - parts = stdout.split() - #number of pdfs 7115 - assert(' '.join(parts[0:3]) == "number of pdfs") - num_leaves = int(parts[3]) - if num_leaves == 0: - raise Exception("Number of leaves is 0") - return num_leaves - def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): common_train_lib.RunKaldiCommand(""" {command} {dir}/log/make_phone_lm.log \ diff --git a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py index 10a78ad5863..1fec1ae2a1b 100644 --- a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py @@ -184,7 +184,7 @@ def GetBestNnetModel(dir, iter, best_model_index, run_opts, best_model = best_model, out_model = out_model, scale = scale)) -def GetNumberOfLeaves(alidir): +def GetNumberOfLeavesFromTree(alidir): [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) parts = stdout.split() assert(parts[0] == "num-pdfs") @@ -193,6 +193,16 @@ def GetNumberOfLeaves(alidir): raise Exception("Number of leaves is 0") return num_leaves +def GetNumberOfLeavesFromModel(dir): + [stdout, stderr] = common_train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + parts = stdout.split() + #number of pdfs 7115 + assert(' '.join(parts[0:3]) == "number of pdfs") + num_leaves = int(parts[3]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + def GetNumberOfJobs(alidir): try: num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip()) diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..f4bc2818218 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -9,8 +9,7 @@ import imp nodes = imp.load_source('nodes', 'steps/nnet3/components.py') -nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') -chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') def GetArgs(): # we add compulsary arguments as named arguments for readability @@ -48,7 +47,7 @@ def GetArgs(): help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with 
learning-rate-factor equal to the inverse of this)", default=0.0) - parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--include-log-softmax", type=str, action=common_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) # LSTM options @@ -70,7 +69,7 @@ def GetArgs(): help="options to be supplied to NaturalGradientAffineComponent", default="") # Gradient clipper options - parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--norm-based-clipping", type=str, action=common_train_lib.StrToBoolAction, help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) parser.add_argument("--clipping-threshold", type=float, help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) @@ -102,15 +101,15 @@ def CheckArgs(args): ## Check arguments. if args.feat_dir is not None: - args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + args.feat_dim = common_train_lib.GetFeatDim(args.feat_dir) if args.ali_dir is not None: - args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.ali_dir) elif args.tree_dir is not None: - args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.tree_dir) if args.ivector_dir is not None: - args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + args.ivector_dim = common_train_lib.GetIvectorDim(args.ivector_dir) if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index bac260e93bc..5c91402aa75 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -12,8 +12,7 @@ import ast nodes = imp.load_source('', 'steps/nnet3/components.py') -nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') -chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') def GetArgs(): # we add compulsary arguments as named arguments for readability @@ -65,16 +64,16 @@ def GetArgs(): "If CNN layers are used the first set of splice indexes will be used as input " "to the first CNN layer and later splice indexes will be interpreted as indexes " "for the TDNNs.") - parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--add-lda", type=str, action=common_train_lib.StrToBoolAction, help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. 
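--add-lda above, like most boolean-valued options in these scripts, is declared with action=common_train_lib.StrToBoolAction so that shell-style true/false strings become Python booleans. A small self-contained sketch of the mechanism, using a stand-in class with the same behaviour as the library's:

    import argparse

    class StrToBoolAction(argparse.Action):
        # Convert shell-style "true"/"false" strings into Python booleans.
        def __call__(self, parser, namespace, values, option_string=None):
            if values == "true":
                setattr(namespace, self.dest, True)
            elif values == "false":
                setattr(namespace, self.dest, False)
            else:
                raise Exception(
                    "Unknown value {0} for --{1}".format(values, self.dest))

    parser = argparse.ArgumentParser()
    parser.add_argument("--add-lda", type=str, action=StrToBoolAction,
                        choices=["true", "false"], default=True)
    print(parser.parse_args(["--add-lda", "false"]).add_lda)   # prints False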
" "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) - parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--include-log-softmax", type=str, action=common_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) - parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--add-final-sigmoid", type=str, action=common_train_lib.StrToBoolAction, help="add a final sigmoid layer as alternate to log-softmax-layer. " "Can only be used if include-log-softmax is false. " "This is useful in cases where you want the output to be " @@ -89,7 +88,7 @@ def GetArgs(): help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", default=0.0) - parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--xent-separate-forward-affine", type=str, action=common_train_lib.StrToBoolAction, help="if using --xent-regularize, gives it separate last-but-one weight matrix", default=False, choices = ["false", "true"]) parser.add_argument("--final-layer-normalize-target", type=float, @@ -113,7 +112,7 @@ def GetArgs(): help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) - parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=common_train_lib.StrToBoolAction, help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) parser.add_argument("config_dir", @@ -132,15 +131,15 @@ def CheckArgs(args): ## Check arguments. 
if args.feat_dir is not None: - args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + args.feat_dim = common_train_lib.GetFeatDim(args.feat_dir) if args.ali_dir is not None: - args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.ali_dir) elif args.tree_dir is not None: - args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.tree_dir) if args.ivector_dir is not None: - args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + args.ivector_dim = common_train_lib.GetIvectorDim(args.ivector_dir) if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") @@ -230,7 +229,7 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, c cnn_args = ParseCnnString(cnn_layer) num_cnn_layers = len(cnn_args) # We use an Idct layer here to convert MFCC to FBANK features - nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + common_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") prev_layer_output = {'descriptor': "input", 'dimension': feat_dim} prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') From a91c49c119e8ebb3fa77c6e8c76695591c9ed539 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 14 Nov 2016 17:51:55 -0500 Subject: [PATCH 21/71] raw_python_script: Reorganize nnet3 libraries --- egs/wsj/s5/steps/libs/__init__.py | 1 + egs/wsj/s5/steps/libs/common.py | 364 +++++++++ egs/wsj/s5/steps/libs/nnet3/__init__.py | 0 .../libs => libs/nnet3}/chain_train_lib.py | 10 +- .../libs => libs/nnet3}/common_train_lib.py | 25 +- egs/wsj/s5/steps/libs/nnet3/train/__init__.py | 13 + .../nnet3/train/chain_objf/acoustic_model.py | 282 +++++++ egs/wsj/s5/steps/libs/nnet3/train/common.py | 688 ++++++++++++++++++ .../nnet3/train/frame_level_objf/__init__.py | 8 + .../train/frame_level_objf/acoustic_model.py | 69 ++ .../nnet3/train/frame_level_objf/common.py | 461 ++++++++++++ .../nnet3/train/frame_level_objf/raw_model.py | 85 +++ .../{nnet3/libs => libs/nnet3}/train_lib.py | 34 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 125 ++-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 100 +-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 133 ++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 206 +++--- 17 files changed, 2298 insertions(+), 306 deletions(-) create mode 100644 egs/wsj/s5/steps/libs/__init__.py create mode 100644 egs/wsj/s5/steps/libs/common.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/__init__.py rename egs/wsj/s5/steps/{nnet3/libs => libs/nnet3}/chain_train_lib.py (97%) rename egs/wsj/s5/steps/{nnet3/libs => libs/nnet3}/common_train_lib.py (97%) create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/__init__.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/common.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py rename egs/wsj/s5/steps/{nnet3/libs => libs/nnet3}/train_lib.py (94%) diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py new file mode 100644 index 
00000000000..37aa01b75c1 --- /dev/null +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -0,0 +1 @@ +__all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py new file mode 100644 index 00000000000..cb66bba292d --- /dev/null +++ b/egs/wsj/s5/steps/libs/common.py @@ -0,0 +1,364 @@ + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0 + +""" This module contains several utility functions and classes that are +commonly used in many kaldi python scripts. +""" + +import subprocess +import argparse +import logging +import os + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error=str(e))) + pass + + +def StrToBool(value): + if value == "true": + return True + elif value == "false": + return False + else: + raise ValueError + + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + try: + setattr(namespace, self.dest, StrToBool(values)) + except ValueError: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. 
""" + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +def CheckIfCudaCompiled(): + p = subprocess.Popen("cuda-compiled") + p.communicate() + if p.returncode == 1: + return False + else: + return True + + +class KaldiCommandException(Exception): + def __init__(self, command, err): + Exception.__init__(self, "There was an error while running the command " + "{0}\n{1}\n{2}".format(command, "-"*10, err)) + + +class ListNode(): + """ A structure to store a node in a doubly linked-list + + Attributes: + data: Any object that is to be stored + next_node: A reference to the next object + previous_node: A reference to the previous object + """ + def __init__(self, data=None, next_node=None, previous_node=None): + self.data = data + self.next_node = next_node + self.previous_node = previous_node + + +class LinkedListIterator(): + def __init__(self, node): + self.__current = node + + def __iter__(self): + return self + + def next(self): + if self.__current is None: + raise StopIteration() + + data = self.__current.data + self.__current = self.__current.next_node + + return data + + +class LinkedList(): + def __init__(self): + self.__head = None + self.__tail = None + + def __iter__(self): + return LinkedListIterator(self.__head) + + def Push(self, node): + """Pushes the node at the "front" of the linked list + """ + node.next_node = self.__head + node.previous_node = None + self.__head.previous_node = node + self.__head = node + + def Pop(self): + """Pops the last node out of the list""" + old_last_node = self.__tail + to_be_last = self.__tail.previous_node + to_be_last.next_node = None + old_last_node.previous_node = None + + # Set the last node to the "to_be_last" + self.__tail = to_be_last + + return old_last_node + + def Remove(self, node): + """Removes and returns node, and connects the previous and next + nicely + """ + next_node = node.next_node + previous_node = node.previous_node + + previous_node.next_node = next_node + next_node.previous_node = previous_node + + # Make it "free" + node.next_node = node.previous_node = None + + return node + + +class BackgroundProcessHandler(): + """ This class handles background processes to ensure that a top-level + script waits until all the processes end before exiting + + A top-level script is expected to instantiate an object of this class + and pass it to all calls of RunKaldiCommand that are to be run in the + background. The background processes are queued and these are polled + in a parallel thread at set interval to check for failures. + The top-level script can ensure at the end ensure that all processes are + completed before exiting. 
+ + Attributes: + __process_queue: Stores a list of process handles and command tuples + + """ + + def __init__(self, polling_time=600): + self.__process_queue = LinkedList() + self.__polling_time = polling_time + self.Poll() + + def Poll(self): + for n in self.__process_queue: + if self.IsProcessDone(n.data): + self.EnsureProcessIsDone(n.data) + threading.Timer(self.__polling_time, Poll).start() + + def AddProcess(self, t): + """ Add a (process handle, command) tuple to the queue + """ + self.__process_queue.Push(Node(data=t)) + + def IsProcessDone(self, t): + p, command = t + if p.poll() is None: + return False + return True + + def EnsureProcessIsDone(self, t): + p, command = t + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise KaldiCommandException(command, stderr) + + def EnsureProcessesAreDone(self): + for n in self.__process_queue: + self.EnsureProcessIsDone(n.data) + + +def RunKaldiCommand(command, wait=True, background_process_handler=None): + """ Runs commands frequently seen in Kaldi scripts. These are usually a + sequence of commands connected by pipes, so we use shell=True. + + Args: + background_process_handler: An object of the BackgroundProcessHandler + class that is instantiated by the top-level script. If this is + provided, then the created process handle is added to the object. + wait: If True, wait until the process is completed. However, if the + background_process_handler is provided, this option will be + ignored and the process will be run in the background. + """ + #logger.info("Running the command\n{0}".format(command)) + p = subprocess.Popen(command, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + if background_process_handler is not None: + wait = False + background_process_handler.AddProcess((p,command)) + + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise KaldiCommandException(command, stderr) + return stdout, stderr + else: + return p + + +def GetNumberOfLeavesFromTree(alidir): + [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) + parts = stdout.split() + assert(parts[0] == "num-pdfs") + num_leaves = int(parts[1]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + + +def GetNumberOfLeavesFromModel(dir): + [stdout, stderr] = RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + parts = stdout.split() + #number of pdfs 7115 + assert(' '.join(parts[0:3]) == "number of pdfs") + num_leaves = int(parts[3]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + + +def GetNumberOfJobs(alidir): + try: + num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the number of alignment jobs') + return num_jobs + + +def GetIvectorDim(ivector_dir=None): + if ivector_dir is None: + return 0 + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{dir}/ivector_online.scp -".format(dir=ivector_dir)) + ivector_dim = int(stdout_val) + return ivector_dim + + +def GetFeatDim(feat_dir): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{data}/feats.scp -".format(data=feat_dir)) + feat_dim = int(stdout_val) + return feat_dim + + +def GetFeatDimFromScp(feat_scp): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{feat_scp} -".format(feat_scp = feat_scp)) + feat_dim = int(stdout_val) + 
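As the BackgroundProcessHandler docstring above describes, a top-level script is expected to create one handler, pass it to every RunKaldiCommand call that should run in the background, and wait for all of them before exiting. A rough usage sketch; the command string and polling interval below are placeholders, not taken from the actual scripts:

    handler = BackgroundProcessHandler(polling_time=600)

    # commands queued this way return immediately; the handler polls them at
    # the given interval and a non-zero return code raises KaldiCommandException
    RunKaldiCommand("some-kaldi-pipeline input.ark output.ark",
                    background_process_handler=handler)

    # ... launch more commands or do other work ...

    # before the script exits, block until every queued command has finished
    handler.EnsureProcessesAreDone()
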
return feat_dim + + +def ReadKaldiMatrix(matrix_file): + try: + lines = map(lambda x: x.split(), open(matrix_file).readlines()) + first_field = lines[0][0] + last_field = lines[-1][-1] + lines[0] = lines[0][1:] + lines[-1] = lines[-1][:-1] + if not (first_field == "[" and last_field == "]"): + raise Exception("Kaldi matrix file has incorrect format, only text format matrix files can be read by this script") + for i in range(len(lines)): + lines[i] = map(lambda x: int(float(x)), lines[i]) + return lines + except IOError: + raise Exception("Error while reading the kaldi matrix file {0}".format(matrix_file)) + + +def WriteKaldiMatrix(output_file, matrix): + # matrix is a list of lists + file = open(output_file, 'w') + file.write("[ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to have the same length") + file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file.write("\n") + file.write(" ]") + file.close() + + +def ForceSymlink(file1, file2): + try: + os.symlink(file1, file2) + except OSError, e: + if e.errno == errno.EEXIST: + os.remove(file2) + os.symlink(file1, file2) + + +def ComputeLifterCoeffs(lifter, dim): + coeffs = [0] * dim + for i in range(0, dim): + coeffs[i] = 1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter)); + + return coeffs + + +def ComputeIdctMatrix(K, N, cepstral_lifter=0): + matrix = [[0] * K for i in range(N)] + # normalizer for X_0 + normalizer = math.sqrt(1.0 / float(N)); + for j in range(0, N): + matrix[j][0] = normalizer; + # normalizer for other elements + normalizer = math.sqrt(2.0 / float(N)); + for k in range(1, K): + for n in range(0, N): + matrix[n][k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k); + + if cepstral_lifter != 0: + lifter_coeffs = ComputeLifterCoeffs(cepstral_lifter, K) + for k in range(0, K): + for n in range(0, N): + matrix[n][k] = matrix[n][k] / lifter_coeffs[k]; + + return matrix + + +def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): + # generate the IDCT matrix and write to the file + idct_matrix = ComputeIdctMatrix(feat_dim, feat_dim, cepstral_lifter) + # append a zero column to the matrix, this is the bias of the fixed affine component + for k in range(0, feat_dim): + idct_matrix[k].append(0) + WriteKaldiMatrix(file_path, idct_matrix) diff --git a/egs/wsj/s5/steps/libs/nnet3/__init__.py b/egs/wsj/s5/steps/libs/nnet3/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py b/egs/wsj/s5/steps/libs/nnet3/chain_train_lib.py similarity index 97% rename from egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py rename to egs/wsj/s5/steps/libs/nnet3/chain_train_lib.py index a88f595ab81..7611d5675f9 100644 --- a/egs/wsj/s5/steps/nnet3/libs/chain_train_lib.py +++ b/egs/wsj/s5/steps/libs/nnet3/chain_train_lib.py @@ -17,11 +17,11 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') -handler.setFormatter(formatter) -logger.addHandler(handler) +#handler = logging.StreamHandler() +#handler.setLevel(logging.INFO) +#formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] 
%(message)s') +#handler.setFormatter(formatter) +#logger.addHandler(handler) def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): common_train_lib.RunKaldiCommand(""" diff --git a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py b/egs/wsj/s5/steps/libs/nnet3/common_train_lib.py similarity index 97% rename from egs/wsj/s5/steps/nnet3/libs/common_train_lib.py rename to egs/wsj/s5/steps/libs/nnet3/common_train_lib.py index 1fec1ae2a1b..ad338c83d53 100644 --- a/egs/wsj/s5/steps/nnet3/libs/common_train_lib.py +++ b/egs/wsj/s5/steps/libs/nnet3/common_train_lib.py @@ -11,7 +11,7 @@ import time import argparse -logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__ + ".common_train_lib") logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) @@ -134,19 +134,19 @@ def GetAverageNnetModel(dir, iter, nnets_list, run_opts, if shrink is not None: scale = shrink - new_iter = iter + 1 + next_iter = iter + 1 if get_raw_nnet_from_am: out_model = """- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ -{dir}/{iter}.mdl {dir}/{new_iter}.mdl""".format(dir = dir, iter = iter, - new_iter = new_iter, +{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir = dir, iter = iter, + next_iter = next_iter, scale = scale) else: if shrink is not None: out_model = """- \| nnet3-copy --scale={scale} \ -- {dir}/{new_iter}.raw""".format(dir = dir, new_iter = new_iter, scale = scale) +- {dir}/{next_iter}.raw""".format(dir = dir, next_iter = next_iter, scale = scale) else: - out_model = "{dir}/{new_iter}.raw".format(dir = dir, - new_iter = new_iter) + out_model = "{dir}/{next_iter}.raw".format(dir = dir, + next_iter = next_iter) RunKaldiCommand(""" {command} {dir}/log/average.{iter}.log \ @@ -171,7 +171,7 @@ def GetBestNnetModel(dir, iter, best_model_index, run_opts, if get_raw_nnet_from_am: out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir = dir, iter = iter, - new_iter = iter + 1) + next_iter = iter + 1) else: out_model = '{dir}/{next_iter}.raw'.format(dir = dir, next_iter = iter + 1) @@ -562,7 +562,7 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - {model} ark:- ark:- \| \ + "{model}" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, dir = dir, model = model, @@ -588,7 +588,7 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): RunKaldiCommand(""" {command} {dir}/log/adjust_priors.final.log \ -nnet3-am-adjust-priors {input_model} {avg_posterior_vector} {output_model} +nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} "{output_model}" """.format(command = run_opts.command, dir = dir, input_model = input_model, avg_posterior_vector = avg_posterior_vector, @@ -753,6 +753,11 @@ def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): common_parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', default=10, help="""Max number of jobs used for LDA stats accumulation""") +common_parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, + dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + # Parameters for the 
optimization common_parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', diff --git a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py new file mode 100644 index 00000000000..57883b372fd --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py @@ -0,0 +1,13 @@ + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +""" This library has classes and methods commonly used for training nnet3 +neural networks. + +It has separate submodules for frame-level objectives and chain objective: +frame_level_objf -- For both recurrent and non-recurrent architectures +chain_objf -- LF-MMI objective training +""" + +__all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py new file mode 100644 index 00000000000..092e9c66ff3 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -0,0 +1,282 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This is a module with methods which will be used by scripts for training of +deep neural network acoustic model with chain objective. +""" + +import logging +import math +import imp +import os +import sys +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +#handler = logging.StreamHandler() +#handler.setLevel(logging.INFO) +#formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +#handler.setFormatter(formatter) +#logger.addHandler(handler) + + +def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts=None): + """Create a phone LM for chain training + + This method trains a phone LM for chain training using the alignments + in "tree_dir" + """ + common_lib.RunKaldiCommand(""" +{command} {dir}/log/make_phone_lm.log \ + chain-est-phone-lm {lm_opts} \ + "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + {dir}/phone_lm.fst""".format(command=run_opts.command, + dir=dir, + lm_opts=lm_opts if lm_opts is not None else '', + tree_dir=tree_dir)) + + +def CreateDenominatorFst(dir, tree_dir, run_opts): + common_lib.RunKaldiCommand(""" +copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl +{command} {dir}/log/make_den_fst.log \ + chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ + {dir}/den.fst {dir}/normalization.fst + """.format(tree_dir=tree_dir, dir=dir, command=run_opts.command)) + + +def GenerateChainEgs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage=0, + valid_left_context=None, valid_right_context=None, + left_tolerance=None, right_tolerance=None, + frame_subsampling_factor=3, + alignment_subsampling_factor=3, + feat_type='raw', online_ivector_dir=None, + frames_per_iter=20000, frames_per_eg=20, srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): + """Wrapper for steps/nnet3/chain/get_egs.sh + + See options in that script. 
+ """ + + common_lib.RunKaldiCommand(""" +steps/nnet3/chain/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context '{valid_left_context}' \ + --valid-right-context '{valid_right_context}' \ + --left-tolerance '{left_tolerance}' \ + --right-tolerance '{right_tolerance}' \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --alignment-subsampling-factor {alignment_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + {data} {dir} {lat_dir} {egs_dir} + """.format(command=run_opts.command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + feat_type=feat_type, + transform_dir=transform_dir + if transform_dir is not None + else '', + ivector_dir=online_ivector_dir + if online_ivector_dir is not None + else '', + left_context=left_context, right_context=right_context, + valid_left_context=valid_left_context + if valid_left_context is not None + else '', + valid_right_context=valid_right_context + if valid_right_context is not None + else '', + left_tolerance=left_tolerance + if left_tolerance is not None + else '', + right_tolerance=right_tolerance + if right_tolerance is not None + else '', + frame_subsampling_factor=frame_subsampling_factor, + alignment_subsampling_factor=alignment_subsampling_factor, + stage=stage, frames_per_iter=frames_per_iter, + frames_per_eg=frames_per_eg, srand=srand, + data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) + + +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs=None, rand_prune=4.0, + lda_opts=None): + """ Function for calling binaries to estimate and write LDA matrix from cegs + + This function is exactly similar to the version in + libs/nnet3/train/common.py + except it uses egs files in place of cegs files. + + """ + + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + # Write stats with the same format as stats for LDA. + common_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( + command=run_opts.command, + num_lda_jobs=num_lda_jobs, + dir=dir, + egs_dir=egs_dir, + rand_prune=rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files=map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command=run_opts.command, + dir=dir, lda_stat_files=" ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
+ + common_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command=run_opts.command, dir=dir, + lda_opts=lda_opts if lda_opts is not None else "")) + + common_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +def PrepareInitialAcousticModel(dir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw + """.format(command=run_opts.command, dir=dir)) + + # The model-format for a 'chain' acoustic model is just the transition + # model and then the raw nnet, so we can use 'cat' to create this, as + # long as they have the same mode (binary or not binary). + # We ensure that they have the same mode (even if someone changed the + # script to make one or both of them text mode) by copying them both + # before concatenating them. + common_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( + command=run_opts.command, dir=dir)) + + +def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts): + """ Function to do model combination + + In the nnet3 setup, the logic + for doing averaging of subsets of the models in the case where + there are too many models to reliably esetimate interpolation + factors (max_models_combine) is moved into the nnet3-combine + """ + + raw_model_strings = [] + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if os.path.exists(model_file): + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + print('{0}: warning: model file {1} does not exist (final combination)'.format( + sys.argv[0], model_file)) + common_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} """ + """ "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" + """.format(command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + dir=dir, raw_models=" ".join(raw_model_strings), + num_chunk_per_minibatch=num_chunk_per_minibatch, + num_iters=num_iters, + egs_dir=egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
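To make the combination range used above concrete: with, say, num_iters = 150 and num_iters_combine = 20 (illustrative values only), the loop gathers the last twenty models, 131.mdl through 150.mdl, and hands them to nnet3-chain-combine:

    # illustrative values only
    num_iters = 150
    num_iters_combine = 20
    iters = range(num_iters - num_iters_combine + 1, num_iters + 1)
    # -> [131, 132, ..., 150], i.e. models 131.mdl .. 150.mdl are combined
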
+ ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait=False) + + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait=False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + """.format(command=run_opts.command, + dir=dir, iter=iter, model=model, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, + egs_dir=egs_dir), wait=wait) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, + egs_dir=egs_dir), wait=wait) + + +def ComputeProgress(dir, iter, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ + nnet3-am-info {model} '&&' \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" \ + "nnet3-am-copy --raw=true {model} - |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model), wait=wait) + + diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py new file mode 100644 index 00000000000..87ae4cad02f --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -0,0 +1,688 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0 + +"""This module contains classes and methods common to training of +nnet3 neural networks. +""" + +import sys +import logging +import math +import re +import time +import argparse +import shutil + +import common as common_lib + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +class RunOpts: + """A structure to store run options. + + Run options like queue.pl and run.pl, along with their memory + and parallel training options for various types of commands such + as the ones for training, parallel-training, running on GPU etc. 
+ """ + + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + + +def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): + assert(num_models > 0) + + parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames") + objf = [] + for i in range(num_models): + model_num = i + 1 + logfile = re.sub('%', str(model_num), log_file_pattern) + lines = open(logfile, 'r').readlines() + this_objf = -100000 + for line_num in range(1, len(lines) + 1): + # we search from the end as this would result in + # lesser number of regex searches. Python regex is slow ! + mat_obj = parse_regex.search(lines[-1*line_num]) + if mat_obj is not None: + this_objf = float(mat_obj.groups()[0]) + break; + objf.append(this_objf); + max_index = objf.index(max(objf)) + accepted_models = [] + for i in range(num_models): + if (objf[max_index] - objf[i]) <= difference_threshold: + accepted_models.append(i+1) + + if len(accepted_models) != num_models: + logger.warn("""Only {0}/{1} of the models have been accepted +for averaging, based on log files {2}.""".format(len(accepted_models), + num_models, log_file_pattern)) + + return [accepted_models, max_index+1] + + +def GetAverageNnetModel(dir, iter, nnets_list, run_opts, + get_raw_nnet_from_am=True, shrink=None): + scale = 1.0 + if shrink is not None: + scale = shrink + + next_iter = iter + 1 + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ +{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir=dir, iter=iter, + next_iter=next_iter, + scale=scale) + else: + if shrink is not None: + out_model = """- \| nnet3-copy --scale={scale} \ +- {dir}/{next_iter}.raw""".format(dir=dir, next_iter=next_iter, scale=scale) + else: + out_model = "{dir}/{next_iter}.raw".format(dir=dir, + next_iter=next_iter) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnets_list} \ +{out_model}""".format(command=run_opts.command, + dir=dir, + iter=iter, + nnets_list=nnets_list, + out_model=out_model)) + + +def GetBestNnetModel(dir, iter, best_model_index, run_opts, + get_raw_nnet_from_am=True, shrink=None): + scale = 1.0 + if shrink is not None: + scale = shrink + + best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format( + dir=dir, + next_iter=iter + 1, + best_model_index=best_model_index) + + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ +{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir=dir, iter=iter, + next_iter=iter + 1) + else: + out_model = '{dir}/{next_iter}.raw'.format(dir=dir, + next_iter=iter + 1) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ +nnet3-copy --scale={scale} {best_model} \ +{out_model}""".format(command=run_opts.command, + dir=dir, iter=iter, + best_model=best_model, + out_model=out_model, scale=scale)) + + +def CopyEgsPropertiesToExpDir(egs_dir, dir): + try: + for file in ['cmvn_opts', 'splice_opts', 'final.mat']: + file_name = '{dir}/{file}'.format(dir=egs_dir, file=file) + if os.path.isfile(file_name): + shutil.copy2(file_name, dir) + except IOError: + raise Exception("Error while trying to copy egs property files to {dir}".format(dir=dir)) + + +def SplitData(data, num_jobs): + common_lib.RunKaldiCommand( + "utils/split_data.sh {data} {num_jobs}".format(data=data, + num_jobs=num_jobs)) + + +def 
ParseGenericConfigVarsFile(var_file): + variables = {} + try: + var_file_handle = open(var_file, 'r') + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1].strip() + if field_name in ['model_left_context', 'left_context']: + variables['model_left_context'] = int(field_value) + elif field_name in ['model_right_context', 'right_context']: + variables['model_right_context'] = int(field_value) + elif field_name == 'num_hidden_layers': + variables['num_hidden_layers'] = int(field_value) + else: + variables[field_name] = field_value + return variables + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) + + +def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): + try: + egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format(egs_dir)).readline()) + egs_left_context = int(open('{0}/info/left_context'.format(egs_dir)).readline()) + egs_right_context = int(open('{0}/info/right_context'.format(egs_dir)).readline()) + if (feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): + raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') + + if (egs_left_context < left_context) or (egs_right_context < right_context): + raise Exception('The egs have insufficient context') + + frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) + num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) + + return [egs_left_context, egs_right_context, frames_per_eg, num_archives] + except IOError, ValueError: + raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir)) + + +def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, + presoftmax_prior_scale_power=-0.25): + + # getting the raw pdf count + common_lib.RunKaldiCommand(""" +{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ +ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ +post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- {dir}/pdf_counts.JOB + """.format(command = run_opts.command, + num_jobs=num_jobs, + dir=dir, + alidir=alidir)) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_pdf_counts.log \ +vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts + """.format(command=run_opts.command, dir=dir)) + + import glob + for file in glob.glob('{0}/pdf_counts.*'.format(dir)): + os.remove(file) + pdf_counts = common_lib.ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + scaled_counts = SmoothPresoftmaxPriorScaleVector( + pdf_counts, + presoftmax_prior_scale_power=presoftmax_prior_scale_power, + smooth=0.01) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + common_lib.WriteKaldiMatrix(output_file, [scaled_counts]) + common_lib.ForceSymlink("../presoftmax_prior_scale.vec", + "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + + +def SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power=-0.25, smooth=0.01): + total = sum(pdf_counts) + average_count = total/len(pdf_counts) + scales = [] + for i in range(len(pdf_counts)): + scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) + num_pdfs = len(pdf_counts) + scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + return scaled_counts + + +def PrepareInitialNetwork(dir, 
run_opts): + common_lib.RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ +nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config \ +{dir}/0.raw""".format(command=run_opts.command, + dir=dir)) + + +def VerifyIterations(num_iters, num_epochs, num_hidden_layers, + num_archives, max_models_combine, + add_layers_period, num_jobs_final): + """ Verifies that number of iterations are sufficient for various + phases of training.""" + + finish_add_layers_iter = num_hidden_layers * add_layers_period + + if num_iters <= (finish_add_layers_iter + 2): + raise Exception(' There are insufficient number of epochs. These are not even sufficient for layer-wise discriminatory training.') + + approx_iters_per_epoch_final = num_archives/num_jobs_final + # First work out how many iterations we want to combine over in the final + # nnet3-combine-fast invocation. + # The number we use is: + # min(max(max_models_combine, approx_iters_per_epoch_final), + # 1/2 * iters_after_last_layer_added) + # But if this value is > max_models_combine, then the models + # are subsampled to get these many models to combine. + half_iters_after_add_layers = (num_iters - finish_add_layers_iter)/2 + + num_iters_combine_initial = min(approx_iters_per_epoch_final, + half_iters_after_add_layers) + + if num_iters_combine_initial > max_models_combine: + subsample_model_factor = int( + float(num_iters_combine_initial) / max_models_combine) + num_iters_combine = num_iters_combine_initial + models_to_combine = set(range( + num_iters - num_iters_combine_initial + 1, + num_iters + 1, subsample_model_factor)) + models_to_combine.add(num_iters) + else: + subsample_model_factor = 1 + num_iters_combine = min(max_models_combine, + half_iters_after_add_layers) + models_to_combine = set(range(num_iters - num_iters_combine + 1, + num_iters + 1)) + + return models_to_combine + + +def GetRealignIters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): + """ Takes the realign_times string and identifies the approximate + iterations at which realignments have to be done. 
+ + realign_times is a space seperated string of values between 0 and 1 + """ + + realign_iters = [] + for realign_time in realign_times.split(): + realign_time = float(realign_time) + assert(realign_time > 0 and realign_time < 1) + if num_jobs_initial == num_jobs_final: + realign_iter = int(0.5 + num_iters * realign_time) + else: + realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, 2)) + realign_iter = realign_iter - num_jobs_initial + realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter * num_iters + realign_iters.append(int(realign_iter)) + + return realign_iters + + +def Align(dir, data, lang, run_opts, iter=None, transform_dir=None, + online_ivector_dir=None): + + alidir = '{dir}/ali{ali_suffix}'.format(dir=dir, + ali_suffix="_iter_{0}".format(iter) if iter is not None else "") + + logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( + gpu=" using gpu " if run_opts.realign_use_gpu else " ", + num_jobs=run_opts.realign_num_jobs )) + common_lib.RunKaldiCommand(""" +steps/nnet3/align.sh --nj {num_jobs_align} --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir} + """.format(dir=dir, align_use_gpu="yes" if run_opts.realign_use_gpu else "no", + align_cmd=run_opts.realign_command, + align_queue_opt=run_opts.realign_queue_opt, + num_jobs_align=run_opts.realign_num_jobs, + transform_dir=transform_dir if transform_dir is not None else "", + online_ivector_dir=online_ivector_dir if online_ivector_dir is not None else "", + iter=iter if iter is not None else "", + alidir=alidir, + lang=lang, data=data)) + return alidir + + +def Realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, + prior_subset_size, num_archives, run_opts, + transform_dir=None, online_ivector_dir=None): + raise Exception("Realignment stage has not been implemented in nnet3") + logger.info("Getting average posterior for purposes of adjusting the priors.") + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
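The interpolation in GetRealignIters above accounts for the number of jobs growing from num_jobs_initial to num_jobs_final over training, so a realignment requested at a given fraction of the data lands later than the same fraction of the iterations. A small worked example with hypothetical values, mirroring the formula in that function:

    import math

    # hypothetical values, for illustration only
    realign_time = 0.5
    num_iters = 100
    num_jobs_initial, num_jobs_final = 2, 8

    realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2)
                             + realign_time * math.pow(num_jobs_final, 2))
    realign_iter = (realign_iter - num_jobs_initial) / (num_jobs_final - num_jobs_initial)
    realign_iter = realign_iter * num_iters
    # sqrt(0.5*4 + 0.5*64) = sqrt(34) ~= 5.83, so the realignment lands around
    # iteration int(63.8) = 63 rather than at the midpoint, iteration 50
    print(int(realign_iter))   # -> 63
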
+ + avg_post_vec_file = ComputeAveragePosterior( + dir, iter, prev_egs_dir, + num_archives, prior_subset_size, run_opts) + + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + logger.info("Re-adjusting priors based on computed posteriors") + model = '{0}/{1}.mdl'.format(dir, iter) + AdjustAmPriors(dir, model, avg_post_vec_file, model, run_opts) + + alidir = Align(dir, feat_dir, lang, run_opts, iter, + transform_dir, online_ivector_dir) + common_lib.RunKaldiCommand(""" +steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} {alidir} \ + {prev_egs_dir} {cur_egs_dir}""".format( + command=run_opts.command, + iter=iter, + dir=dir, + alidir=alidir, + prev_egs_dir=prev_egs_dir, + cur_egs_dir=cur_egs_dir)) + + +def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, + num_archives_to_process, + initial_effective_lrate, final_effective_lrate): + if iter + 1 >= num_iters: + effective_learning_rate = final_effective_lrate + else: + effective_learning_rate = ( + initial_effective_lrate * math.exp(num_archives_processed * + math.log(final_effective_lrate/ initial_effective_lrate)/num_archives_to_process)) + + return num_jobs * effective_learning_rate + + +def DoShrinkage(iter, model_file, non_linearity, shrink_threshold, + get_raw_nnet_from_am=True): + + if iter == 0: + return True + + try: + if get_raw_nnet_from_am: + output, error = common_lib.RunKaldiCommand( + "nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format( + non_linearity=non_linearity, model_file=model_file)) + else: + output, error = common_lib.RunKaldiCommand( + "nnet3-info --print-args=false {model_file} | grep {non_linearity}".format( + non_linearity=non_linearity, model_file=model_file)) + output = output.strip().split("\n") + # eg. + # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + + mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") + total_mean_deriv = 0 + num_derivs = 0 + for line in output: + mat_obj = mean_pattern.search(line) + if mat_obj is None: + raise Exception("Something went wrong, unable to find deriv-avg in the line \n{0}".format(line)) + mean_deriv = float(mat_obj.groups()[0]) + total_mean_deriv += mean_deriv + num_derivs += 1 + if total_mean_deriv / num_derivs < shrink_threshold: + return True + except ValueError: + raise Exception("Error while parsing the model info output") + + return False + + +def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts, + get_raw_nnet_from_am=True): + """ Computes the average posterior of the network + Note: this just uses CPUs, using a smallish subset of data. 
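The schedule in GetLearningRate above decays the effective learning rate exponentially in the amount of data processed, then multiplies by the current number of parallel jobs. A short worked example using the default option values (0.0003 initial, 0.00003 final) with hypothetical job and archive counts:

    import math

    # hypothetical values, for illustration only
    initial_effective_lrate = 0.0003
    final_effective_lrate = 0.00003
    num_archives_to_process = 1000
    num_archives_processed = 500      # halfway through training
    num_jobs = 4

    effective_lrate = initial_effective_lrate * math.exp(
        num_archives_processed
        * math.log(final_effective_lrate / initial_effective_lrate)
        / num_archives_to_process)
    # halfway through, the effective rate is 0.0003 * 10**-0.5 ~= 9.5e-05;
    # the rate actually passed to the training jobs is num_jobs times that
    print(num_jobs * effective_lrate)   # -> roughly 3.8e-04
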
+ """ + import glob + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + + if run_opts.num_jobs_compute_prior > num_archives: + egs_part = 1 + else: + egs_part = 'JOB' + + if get_raw_nnet_from_am: + model = "nnet3-am-copy --raw=true {dir}/combined.mdl -|".format(dir=dir) + else: + model = "{dir}/final.raw".format(dir=dir) + + common_lib.RunKaldiCommand(""" +{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "{model}" ark:- ark:- \| \ +matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + """.format(command=run_opts.command, + dir=dir, model=model, + num_jobs_compute_prior=run_opts.num_jobs_compute_prior, + prior_queue_opt=run_opts.prior_queue_opt, + iter=iter, prior_subset_size=prior_subset_size, + egs_dir=egs_dir, egs_part=egs_part, + prior_gpu_opt=run_opts.prior_gpu_opt)) + + # make sure there is time for $dir/post.{iter}.*.vec to appear. + time.sleep(5) + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + common_lib.RunKaldiCommand(""" +{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} + """.format(command=run_opts.command, + dir=dir, iter=iter, output_file=avg_post_vec_file)) + + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + return avg_post_vec_file + + +def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): + common_lib.RunKaldiCommand(""" +{command} {dir}/log/adjust_priors.final.log \ + nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} "{output_model}" + """.format(command=run_opts.command, + dir=dir, input_model=input_model, + avg_posterior_vector=avg_posterior_vector, + output_model=output_model)) + + +def RemoveEgs(egs_dir): + common_lib.RunKaldiCommand("steps/nnet2/remove_egs.sh {egs_dir}".format(egs_dir=egs_dir)) + + +def CleanNnetDir(nnet_dir, num_iters, egs_dir, + preserve_model_interval=100, + remove_egs=True, + get_raw_nnet_from_am=True): + try: + if remove_egs: + RemoveEgs(egs_dir) + + for iter in range(num_iters): + RemoveModel(nnet_dir, iter, num_iters, None, + preserve_model_interval, + get_raw_nnet_from_am=get_raw_nnet_from_am) + except (IOError, OSError) as err: + logger.warning("Error while cleaning up the nnet directory") + raise err + + +def RemoveModel(nnet_dir, iter, num_iters, models_to_combine=None, + preserve_model_interval=100, + get_raw_nnet_from_am=True): + if iter % preserve_model_interval == 0: + return + if models_to_combine is not None and iter in models_to_combine: + return + if get_raw_nnet_from_am: + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + else: + file_name = '{0}/{1}.raw'.format(nnet_dir, iter) + + if os.path.isfile(file_name): + os.remove(file_name) + + +class CommonParser: + """Parser for parsing common options related to nnet3 training. + + This argument parser adds common options related to nnet3 training + such as egs creation, training optimization options. 
+ These are used in the nnet3 train scripts + in steps/nnet3/train*.py and steps/nnet3/chain/train.py + """ + + def __init__(self): + self.parser = argparser.ArgumentParser(add_help=False) + + # feat options + self.parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + self.parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + self.parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") + self.parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") + self.parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + self.parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + self.parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + self.parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + self.parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + self.parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + self.parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + self.parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + self.parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. " + "These models will themselves be averages of iteration-number ranges") + self.parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. 
If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. + (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + self.parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + self.parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + self.parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + self.parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + self.parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + self.parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, + dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + + + # Parameters for the optimization + self.parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + self.parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + self.parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + self.parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + self.parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + self.parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + self.parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + self.parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + self.parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + self.parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + self.parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + self.parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + self.parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. " + "If mod(iter,preserve_model_interval) == 0 model will be preserved.") + + self.parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + self.parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, " + "measured in terms of fraction of iterations. " + "If 0 and reporting mail has been specified then only failure notifications are sent") + diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py new file mode 100644 index 00000000000..172030d8297 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py @@ -0,0 +1,8 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +""" This library has classes and methods commonly used for training nnet3 +neural networks with frame-level objectives. +""" diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py new file mode 100644 index 00000000000..40e224b0672 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -0,0 +1,69 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This is a module with method which will be used by scripts for +training of deep neural network acoustic model with frame-level objective. 
+""" + +import logging +import math +import imp +import os +import sys + +sys.path.append("steps/libs") +import common as common_lib + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def GenerateEgs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage=0, + feat_type='raw', online_ivector_dir=None, + samples_per_iter=20000, frames_per_eg=20, srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): + + """ Wrapper for calling steps/nnet3/get_egs.sh + + Generates targets from alignment directory 'alidir', which contains + the model final.mdl and alignments. + """ + + common_lib.RunKaldiCommand(""" +steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + {data} {alidir} {egs_dir} + """.format(command=run_opts.command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + feat_type=feat_type, + transform_dir=transform_dir if transform_dir is not None else '', + ivector_dir=online_ivector_dir if online_ivector_dir is not None else '', + left_context=left_context, right_context=right_context, + valid_left_context=valid_left_context, + valid_right_context=valid_right_context, + stage=stage, samples_per_iter=samples_per_iter, + frames_per_eg=frames_per_eg, srand=srand, data=data, alidir=alidir, + egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py new file mode 100644 index 00000000000..952a797c14f --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -0,0 +1,461 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This is a module with methods which will be used by scripts for training of +deep neural network acoustic model and raw model (i.e., generic neural +network without transition model) with frame-level objectives. 
+""" + +import logging +import math +import imp +import os +import sys + +sys.path.append("steps/libs") +import nnet3.train.common as common_train_lib +import common as common_lib + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts, + frames_per_eg=-1, min_deriv_time=None): + """ Called from TrainOneIteration, this model does one iteration of training + with 'num_jobs' jobs, and + writes files like exp/tdnn_a/24.{1,2,3,..}.raw + + We cannot easily use a single parallel SGE job to do the main training, + because the computation of which archive and which --frame option + to use for each job is a little complex, so we spawn each one separately. + this is no longer true for RNNs as we use do not use the --frame option + but we use the same script for consistency with FF-DNN code + + Args: + frames_per_eg: The default value -1 implies chunk_level_training, which + is particularly applicable to RNN training. If it is > 0, then it + implies frame-level training, which is applicable for DNN training. + If it is > 0, then each parallel SGE job created, a different frame + numbered 0..frames_per_eg-1 is used. + min_deriv_time: Applicable for RNN training. A default value of None + implies a min_deriv_time of 0 is used. During RNN training, its value + is set to chunk_width - num_bptt_steps in the training script. + """ + + chunk_level_training = False if frames_per_eg > 0 else True + + deriv_time_opts = ("" + if min_deriv_time is None + else "--optimization.min-deriv-time={0}".format(min_deriv_time) + ) + + context_opts = "--left-context={0} --right-context={1}".format( + left_context, right_context) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + if not chunk_level_training: + frame = (k / num_archives) % frames_per_eg + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = common_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + {deriv_time_opts} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {frame_opts} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- |""" + """nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| """ + """nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false """ + """--discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command=run_opts.command, + train_queue_opt=run_opts.train_queue_opt, + dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, + job=job, parallel_train_opts=run_opts.parallel_train_opts, + cache_read_opt=cache_read_opt, cache_write_opt=cache_write_opt, + frame_opts="" + if chunk_level_training + else "--frame={0}".format(frame), + momentum=momentum, max_param_change=max_param_change, + deriv_time_opts=deriv_time_opts, + raw_model=raw_model_string, context_opts=context_opts, + egs_dir=egs_dir, archive_index=archive_index, + shuffle_buffer_size=shuffle_buffer_size, + minibatch_size=minibatch_size), + background_process_handler=background_process_handler) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts, + cv_minibatch_size=256, frames_per_eg=-1, + min_deriv_time=None, shrinkage_value=1.0, + get_raw_nnet_from_am=True): + """ Called from steps/nnet3/train_*.py scripts for one iteration of training + + Args: + frames_per_eg: The default value -1 implies chunk_level_training, which + is particularly applicable to RNN training. If it is > 0, then it + implies frame-level training, which is applicable for DNN training. + If it is > 0, then each parallel SGE job created, a different frame + numbered 0..frames_per_eg-1 is used. + min_deriv_time: Applicable for RNN training. A default value of None + implies a min_deriv_time of 0 is used. During RNN training, its value + is set to chunk_width - num_bptt_steps in the training script. + shrinkage_value: If value is 1.0, no shrinkage is done; otherwise + parameter values are scaled by this value. + get_raw_nnet_from_am: If True, then the network is read and stored as + acoustic model i.e. along with transition model e.g. 10.mdl + as against a raw network e.g. 10.raw when the value is False. + """ + + + # Set off jobs doing some diagnostics, in the background. 
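Following the docstring above, here is a small sketch of how the RNN scripts arrive at min_deriv_time (hypothetical values): num_bptt_steps defaults to the chunk width, which gives min_deriv_time = 0 and full back-propagation through the chunk; a smaller num_bptt_steps truncates back-propagation-through-time to the last labels of each chunk.

chunk_width = 20
num_bptt_steps = None          # --trainer.rnn.num-bptt-steps; defaults to chunk_width
if num_bptt_steps is None:
    num_bptt_steps = chunk_width
min_deriv_time = chunk_width - num_bptt_steps   # 0 with the defaults
deriv_time_opts = ("" if min_deriv_time is None
                   else "--optimization.min-deriv-time={0}".format(min_deriv_time))
print(min_deriv_time, repr(deriv_time_opts))    # 0 '--optimization.min-deriv-time=0'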
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + # Sets off some background jobs to compute train and + # validation set objectives + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am=get_raw_nnet_from_am) + + if iter > 0: + # Runs in the background + ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am=get_raw_nnet_from_am) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr=learning_rate, dir=dir, iter=iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. 
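To make the schedule above concrete (hypothetical settings): with add_layers_period=2 and num_hidden_layers=4, a new layer config is spliced in on iterations 2, 4 and 6, and on those iterations (as on iteration 0) the best single job is kept instead of averaging; the reduced minibatch size and max-param-change below apply on exactly those iterations.

num_hidden_layers, add_layers_period = 4, 2
for it in range(8):
    adds_layer = (it > 0
                  and it <= (num_hidden_layers - 1) * add_layers_period
                  and it % add_layers_period == 0)
    do_average = (it != 0) and not adds_layer
    cur_layers = (1 + it // add_layers_period) if adds_layer else None
    print(it, adds_layer, do_average, cur_layers)
# iterations 2, 4, 6 add layer{2,3,4}.config; averaging is skipped there and on iteration 0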
+ cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts, + frames_per_eg=frames_per_eg, + min_deriv_time=min_deriv_time) + + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels( + num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + common_train_lib.GetAverageNnetModel( + dir=dir, iter=iter, + nnets_list=" ".join(nnets_list), + run_opts=run_opts, + get_raw_nnet_from_am=get_raw_nnet_from_am, + shrink=shrinkage_value) + + else: + # choose the best model from different jobs + common_train_lib.GetBestNnetModel( + dir=dir, iter=iter, + best_model_index=best_model, + run_opts=run_opts, + get_raw_nnet_from_am=get_raw_nnet_from_am, + shrink=shrinkage_value) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if get_raw_nnet_from_am: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs=None, rand_prune=4.0, + lda_opts=None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + common_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ +nnet3-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format( + command=run_opts.command, + num_lda_jobs=num_lda_jobs, + dir=dir, + egs_dir=egs_dir, + rand_prune=rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command=run_opts.command, + dir=dir, lda_stat_files=" ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
+ + common_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command=run_opts.command,dir=dir, + lda_opts=lda_opts if lda_opts is not None else "")) + + common_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + + +def PrepareInitialAcousticModel(dir, alidir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + common_lib.PrepareInitialNetwork(dir, run_opts) + + # Convert to .mdl, train the transitions, set the priors. + common_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command=run_opts.command, + dir=dir, alidir=alidir)) + + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size=256, background_process_handler=None, + get_raw_nnet_from_am=True): + + if get_raw_nnet_from_am: + model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir=dir, iter=iter) + else: + model = "{dir}/{iter}.raw".format(dir=dir, iter=iter) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob "{model}" \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + mb_size=mb_size, + model=model, + egs_dir=egs_dir), + background_process_handler=background_process_handler) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob "{model}" \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + mb_size=mb_size, + model=model, + egs_dir=egs_dir), + background_process_handler=background_process_handler) + + +def ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size=256, background_process_handler=None, + get_raw_nnet_from_am=True): + if get_raw_nnet_from_am: + prev_model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format(dir, iter - 1) + model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format(dir, iter) + else: + prev_model = '{0}/{1}.raw'.format(dir, iter - 1) + model = '{0}/{1}.raw'.format(dir, iter) + + common_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ + nnet3-info {model} '&&' \ + nnet3-show-progress --use-gpu=no {prev_model} {model} \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + mb_size=mb_size, + prev_model=prev_model, + egs_dir=egs_dir), + background_process_handler=background_process_handler) + + +def CombineModels(dir, num_iters, models_to_combine, egs_dir, + run_opts, background_process_handler=None, + chunk_width=None, + get_raw_nnet_from_am=True): + """ + Now do combination. In the nnet3 setup, the logic + for doing averaging of subsets of the models in the case where + there are too many models to reliably esetimate interpolation + factors (max_models_combine) is moved into the nnet3-combine. 
+ """ + raw_model_strings = [] + print len(models_to_combine) + + models_to_combine.add(num_iters) + + for iter in models_to_combine: + if get_raw_nnet_from_am: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + model_file = '{0}/{1}.raw'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append(model_file) + + if chunk_width is not None: + # this is an RNN model + mbsize = int(1024.0/(chunk_width)) + else: + mbsize = 1024 + + if get_raw_nnet_from_am: + out_model = "| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir=dir, num_iters=num_iters) + else: + out_model = '{dir}/final.raw'.format(dir=dir) + + common_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + "{out_model}" + """.format(command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + dir=dir, raw_models=" ".join(raw_model_strings), + mbsize=mbsize, + out_model=out_model, + egs_dir=egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + if get_raw_nnet_from_am: + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, + background_process_handler=background_process_handler) + else: + ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, + background_process_handler=background_process_handler, + get_raw_nnet_from_am=False) + + diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py new file mode 100644 index 00000000000..244672692e9 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -0,0 +1,85 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +""" This is a module with method which will be used by scripts for +training of deep neural network raw model (i.e. without acoustic model) +with frame-level objective. +""" + +import logging +import math +import imp +import os +import sys + +sys.path.append("steps/libs") +import common as common_lib + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def GenerateEgsUsingTargets(data, targets_scp, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage=0, + feat_type='raw', online_ivector_dir=None, + target_type='dense', num_targets=-1, + samples_per_iter=20000, frames_per_eg=20, srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): + """ + This method generates egs directly from an scp file of targets, instead of + getting them from the alignments (as with the method GenerateEgs). 
+ The targets are in matrix format for target_type="dense" and in posterior + format for target_type="sparse". + If using sparse targets, num_targets must be explicity specified. + If using dense targets, num_targets is computed by reading the feature matrix dimension. + """ + + if target_type == 'dense': + num_targets = common_lib.GetFeatDimFromScp(targets_scp) + else: + if num_targets == -1: + raise Exception("--num-targets is required if target-type is dense") + + common_lib.RunKaldiCommand(""" +steps/nnet3/get_egs_targets.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + --target-type {target_type} \ + --num-targets {num_targets} \ + {data} {targets_scp} {egs_dir} + """.format(command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + feat_type=feat_type, + transform_dir=transform_dir if transform_dir is not None else '', + ivector_dir=online_ivector_dir if online_ivector_dir is not None else '', + left_context=left_context, right_context=right_context, + valid_left_context=valid_left_context, + valid_right_context=valid_right_context, + stage=stage, samples_per_iter=samples_per_iter, + frames_per_eg=frames_per_eg, srand=srand, + num_targets=num_targets, + data=data, + targets_scp=targets_scp, target_type=target_type, + egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '' )) + diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/libs/nnet3/train_lib.py similarity index 94% rename from egs/wsj/s5/steps/nnet3/libs/train_lib.py rename to egs/wsj/s5/steps/libs/nnet3/train_lib.py index 0aab71f221c..0633453e9cd 100644 --- a/egs/wsj/s5/steps/nnet3/libs/train_lib.py +++ b/egs/wsj/s5/steps/libs/nnet3/train_lib.py @@ -15,7 +15,7 @@ common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') -logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__ + ".train_lib") logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) @@ -132,15 +132,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives - train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, - mb_size = cv_minibatch_size, - get_raw_nnet_from_am = get_raw_nnet_from_am) + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size = cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am) if iter > 0: # Runs in the background - train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, - mb_size = cv_minibatch_size, - get_raw_nnet_from_am = get_raw_nnet_from_am) + ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size = cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. 
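Relating back to GenerateEgsUsingTargets in raw_model.py above, the target-type convention in a runnable nutshell (values hypothetical): with dense targets the output dimension is read off the targets scp, while sparse, posterior-style targets require --num-targets to be supplied explicitly.

def resolve_num_targets(target_type, num_targets, dense_target_dim):
    # dense_target_dim stands in for GetFeatDimFromScp(targets_scp)
    if target_type == 'dense':
        return dense_target_dim
    if num_targets == -1:
        raise Exception("--num-targets is required if target-type is sparse")
    return num_targets

print(resolve_num_targets('dense', -1, 40))     # 40, taken from the targets scp
print(resolve_num_targets('sparse', 3000, 40))  # 3000, must be given by the caller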
@@ -394,33 +394,31 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, common_train_lib.RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-compute-prob {compute_prob_opts} "{model}" \ + nnet3-compute-prob "{model}" \ "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, - compute_prob_opts = compute_prob_opts, egs_dir = egs_dir), wait = wait) common_train_lib.RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-compute-prob {compute_prob_opts} "{model}" \ + nnet3-compute-prob "{model}" \ "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, - compute_prob_opts = compute_prob_opts, egs_dir = egs_dir), wait = wait) def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False, get_raw_nnet_from_am = True): if get_raw_nnet_from_am: - prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter - 1) - model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir, iter) + prev_model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format(dir, iter - 1) + model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format(dir, iter) else: prev_model = '{0}/{1}.raw'.format(dir, iter - 1) model = '{0}/{1}.raw'.format(dir, iter) @@ -466,7 +464,7 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, mbsize = 1024 if get_raw_nnet_from_am: - out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) + out_model = "| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) else: out_model = '{dir}/final.raw'.format(dir = dir) @@ -475,7 +473,7 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ - {out_model} + "{out_model}" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), @@ -487,8 +485,8 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. if get_raw_nnet_from_am: - train_lib.ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) else: - train_lib.ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, - wait = False, get_raw_nnet_from_am = False) + ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, + wait = False, get_raw_nnet_from_am = False) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index c6be8d1f6fc..2fd5f871f55 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -40,32 +40,28 @@ def GetArgs(): DNNs include simple DNNs, TDNNs and CNNs. 
""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve', + conflict_handler='resolve', parents=[common_train_lib.common_parser]) # For common options defined in common_train_lib.common_parser, # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', - default = 8, + default=8, help="Number of output labels per example") parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', - default = 512, + default=512, help="Size of the minibatch used to compute the gradient") - parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', - default=-0.25, - help="") - # General options - parser.add_argument("--feat-dir", type=str, required = True, + parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training the neural network.") - parser.add_argument("--lang", type=str, required = True, + parser.add_argument("--lang", type=str, required=True, help="Language directory") - parser.add_argument("--ali-dir", type=str, required = True, + parser.add_argument("--ali-dir", type=str, required=True, help="Directory with alignments used for training the neural network.") - parser.add_argument("--dir", type=str, required = True, + parser.add_argument("--dir", type=str, required=True, help="Directory to store the models and all other files.") print(' '.join(sys.argv)) @@ -126,7 +122,7 @@ def Train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. - num_leaves = common_train_lib.GetNumberOfLeaves(args.ali_dir) + num_leaves = common_train_lib.GetNumberOfLeavesFromTree(args.ali_dir) num_jobs = common_train_lib.GetNumberOfJobs(args.ali_dir) feat_dim = common_train_lib.GetFeatDim(args.feat_dir) ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) @@ -167,8 +163,8 @@ def Train(args, run_opts): common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command = run_opts.command, - dir = args.dir)) + """.format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: @@ -177,14 +173,14 @@ def Train(args, run_opts): train_lib.GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, left_context, right_context, left_context, right_context, run_opts, - frames_per_eg = args.frames_per_eg, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage) + frames_per_eg=args.frames_per_eg, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir @@ -208,17 +204,17 @@ def Train(args, run_opts): logger.info('Computing the preconditioning matrix for input features') train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -2): logger.info("Computing initial vector for 
FixedScaleComponent before" " softmax, using priors^{prior_scale} and rescaling to" - " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + " average 1".format(prior_scale=args.presoftmax_prior_scale_power)) common_train_lib.ComputePresoftmaxPriorScale( args.dir, args.ali_dir, num_jobs, run_opts, - presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + presoftmax_prior_scale_power=args.presoftmax_prior_scale_power) if (args.stage <= -1): @@ -234,11 +230,11 @@ def Train(args, run_opts): num_archives_processed = 0 num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) - num_iters_combine = common_train_lib.VerifyIterations( - num_iters, args.num_epochs, - num_hidden_layers, num_archives_expanded, - args.max_models_combine, args.add_layers_period, - args.num_jobs_final) + models_to_combine = common_train_lib.VerifyIterations( + num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) LearningRate = (lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate( @@ -257,35 +253,37 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) - logger.info("On iteration {0}, learning rate is {1}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed))) + logger.info("On iteration {0}, learning rate is {1}.".format( + iter, LearningRate(iter, current_num_jobs, + num_archives_processed))) train_lib.TrainOneIteration( - dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), - minibatch_size = args.minibatch_size, - frames_per_eg = args.frames_per_eg, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - run_opts = run_opts) + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed), + minibatch_size=args.minibatch_size, + frames_per_eg=args.frames_per_eg, + num_hidden_layers=num_hidden_layers, + add_layers_period=args.add_layers_period, + left_context=left_context, + right_context=right_context, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + run_opts=run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval) + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -293,14 +291,16 @@ def Train(args, run_opts): # lets do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) 
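For a feel of the schedule driving this loop (hypothetical sizes): the parallel-job count ramps linearly from num_jobs_initial to num_jobs_final across num_iters iterations, and when --reporting.email is set the report above fires roughly every num_iters * reporting_interval iterations, i.e. about every 10% of training with the default of 0.1.

num_archives_to_process = 960          # hypothetical; scales with num_epochs and archives
num_jobs_initial, num_jobs_final = 2, 8
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)  # 192
reporting_iter_interval = num_iters * 0.1                                         # ~19 iterations
for it in (0, 48, 96, 144, 191):
    current_num_jobs = int(0.5 + num_jobs_initial
                           + (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
    print(it, current_num_jobs)        # ramps 2 -> 8 over training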
message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + train_lib.CombineModels( + args.dir, num_iters, models_to_combine, + num_iters_combine, egs_dir, run_opts) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") @@ -309,8 +309,8 @@ def Train(args, run_opts): num_archives, args.prior_subset_size, run_opts) logger.info("Re-adjusting priors based on computed posteriors") - combined_model = "{dir}/combined.mdl".format(dir = args.dir) - final_model = "{dir}/final.mdl".format(dir = args.dir) + combined_model = "{dir}/combined.mdl".format(dir=args.dir) + final_model = "{dir}/final.mdl".format(dir=args.dir) train_lib.AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) @@ -322,16 +322,17 @@ def Train(args, run_opts): # delete it remove_egs = False - common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs) + common_train_lib.CleanNnetDir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) - report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") report_handle.write(report) report_handle.close() @@ -343,7 +344,7 @@ def Main(): Train(args, run_opts) except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 612ff386b89..65d5b56bfc2 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -40,31 +40,31 @@ def GetArgs(): DNNs include simple DNNs, TDNNs and CNNs. 
""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve', + conflict_handler='resolve', parents=[common_train_lib.common_parser]) # For common options defined in common_train_lib.common_parser, # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', - default = 8, + default=8, help="Number of output labels per example") parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', - default = 512, + default=512, help="Size of the minibatch used to compute the gradient") # General options parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, - default = True, choices = ["true", "false"], + default=True, choices=["true", "false"], help="Train neural network using dense targets") - parser.add_argument("--feat-dir", type=str, required = True, + parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training the neural network.") - parser.add_argument("--targets-scp", type=str, required = True, + parser.add_argument("--targets-scp", type=str, required=True, help="Target for training neural network.") - parser.add_argument("--dir", type=str, required = True, + parser.add_argument("--dir", type=str, required=True, help="Directory to store the models and all other files.") print(' '.join(sys.argv)) @@ -158,8 +158,8 @@ def Train(args, run_opts): common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command = run_opts.command, - dir = args.dir)) + """.format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: @@ -183,16 +183,16 @@ def Train(args, run_opts): args.feat_dir, args.targets_scp, default_egs_dir, left_context, right_context, left_context, right_context, run_opts, - frames_per_eg = args.frames_per_eg, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage, - target_type = target_type, - num_targets = num_targets) + frames_per_eg=args.frames_per_eg, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage, + target_type=target_type, + num_targets=num_targets) if args.egs_dir is None: egs_dir = default_egs_dir @@ -216,8 +216,8 @@ def Train(args, run_opts): logger.info('Computing the preconditioning matrix for input features') train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -1): @@ -256,35 +256,35 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) + model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter) logger.info("On iteration {0}, learning rate is {1}.".format(iter, 
LearningRate(iter, current_num_jobs, num_archives_processed))) - train_lib.TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), - minibatch_size = args.minibatch_size, - frames_per_eg = args.frames_per_eg, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - run_opts = run_opts, - get_raw_nnet_from_am = False) + train_lib.TrainOneIteration(dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed), + minibatch_size=args.minibatch_size, + frames_per_eg=args.frames_per_eg, + num_hidden_layers=num_hidden_layers, + add_layers_period=args.add_layers_period, + left_context=left_context, + right_context=right_context, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + run_opts=run_opts, + get_raw_nnet_from_am=False) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions common_train_lib.RemoveModel( args.dir, iter-2, num_iters, num_iters_combine, args.preserve_model_interval, - get_raw_nnet_from_am = False) + get_raw_nnet_from_am=False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -292,7 +292,7 @@ def Train(args, run_opts): # lets do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -300,14 +300,14 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, - run_opts, get_raw_nnet_from_am = False) + run_opts, get_raw_nnet_from_am=False) if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") avg_post_vec_file = train_lib.ComputeAveragePosterior( args.dir, 'final', egs_dir, num_archives, args.prior_subset_size, run_opts, - get_raw_nnet_from_am = False) + get_raw_nnet_from_am=False) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -318,16 +318,16 @@ def Train(args, run_opts): remove_egs = False common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs, - get_raw_nnet_from_am = False) + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs, + get_raw_nnet_from_am=False) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) - 
report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") report_handle.write(report) report_handle.close() @@ -339,7 +339,7 @@ def Main(): Train(args, run_opts) except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 8eac8f5fec2..7e0f06f95e3 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -47,20 +47,20 @@ def GetArgs(): 3. RNNs can also be trained with state preservation training """, formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve', + conflict_handler='resolve', parents=[common_train_lib.common_parser]) # For common options defined in common_train_lib.common_parser, # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', - default = 20, + default=20, help="""Number of output labels in the sequence used to train an LSTM. Caution: if you double this you should halve --trainer.samples-per-iter.""") parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default = 40, + default=40, help="""Number of left steps used in the estimation of LSTM state before prediction of the first label""") @@ -74,19 +74,24 @@ def GetArgs(): # Parameters for the optimization parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.5, + default=0.5, help="""Momentum used in update computation. Note: we implemented it in such a way that it doesn't increase the effective learning rate.""") parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', - default = 0.99, - help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + default=0.99, + help="Scaling factor used for scaling the parameter matrices " + "when the derivative averages are below the " + "shrink-threshold at the non-linearities") parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', - default = 0.15, - help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + default=0.15, + help="If the derivative averages are below this " + "threshold we scale the parameter matrices with the shrink-value. 
" + "It is less than 0.25 for sigmoid non-linearities.") parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', - default = 256, - help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") + default=256, + help="Size of the minibatch to be used in diagnostic jobs " + "(use smaller value for BLSTMs to control memory usage)") # RNN specific trainer options parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', @@ -94,19 +99,20 @@ def GetArgs(): help="Number of sequences to be processed in parallel every minibatch" ) parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, - help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + help="The number of time steps to back-propagate from the " + "last label in the chunk. By default it is same as the chunk-width." ) # General options parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, - default = True, choices = ["true", "false"], + default=True, choices=["true", "false"], help="Train neural network using dense targets") - parser.add_argument("--feat-dir", type=str, required = True, + parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training the neural network.") - parser.add_argument("--targets-scp", type=str, required = True, + parser.add_argument("--targets-scp", type=str, required=True, help="Target for training neural network.") - parser.add_argument("--dir", type=str, required = True, + parser.add_argument("--dir", type=str, required=True, help="Directory to store the models and all other files.") print(' '.join(sys.argv)) @@ -206,8 +212,8 @@ def Train(args, run_opts): common_train_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command = run_opts.command, - dir = args.dir)) + """.format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: @@ -231,16 +237,16 @@ def Train(args, run_opts): left_context, right_context, args.chunk_width + left_context, args.chunk_width + right_context, run_opts, - frames_per_eg = args.chunk_width, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage, - target_type = target_type, - num_targets = num_targets) + frames_per_eg=args.chunk_width, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage, + target_type=target_type, + num_targets=num_targets) if args.egs_dir is None: egs_dir = default_egs_dir @@ -264,8 +270,8 @@ def Train(args, run_opts): logger.info('Computing the preconditioning matrix for input features') train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -1): @@ -311,45 +317,46 @@ def 
Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter) + model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter) shrinkage_value = (args.shrink_value if common_train_lib.DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold, - get_raw_nnet_from_am = False) + get_raw_nnet_from_am=False) else 1 ) - logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format( + iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) train_lib.TrainOneIteration( - dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts, - get_raw_nnet_from_am = False) + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed), + shrinkage_value=shrinkage_value, + num_chunk_per_minibatch=args.num_chunk_per_minibatch, + num_hidden_layers=num_hidden_layers, + add_layers_period=args.add_layers_period, + left_context=left_context, + right_context=right_context, + min_deriv_time=min_deriv_time, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + cv_minibatch_size=args.cv_minibatch_size, + run_opts=run_opts, + get_raw_nnet_from_am=False) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions common_train_lib.RemoveModel( args.dir, iter-2, num_iters, num_iters_combine, args.preserve_model_interval, - get_raw_nnet_from_am = False) + get_raw_nnet_from_am=False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval @@ -357,7 +364,7 @@ def Train(args, run_opts): # lets do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -365,15 +372,15 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = args.chunk_width, - get_raw_nnet_from_am = False) + run_opts, chunk_width=args.chunk_width, + get_raw_nnet_from_am=False) if 
include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") avg_post_vec_file = train_lib.ComputeAveragePosterior( args.dir, 'final', egs_dir, num_archives, args.prior_subset_size, run_opts, - get_raw_nnet_from_am = False) + get_raw_nnet_from_am=False) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) @@ -384,16 +391,16 @@ def Train(args, run_opts): remove_egs = False common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs, - get_raw_nnet_from_am = False) + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs, + get_raw_nnet_from_am=False) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) - report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") report_handle.write(report) report_handle.close() @@ -405,7 +412,7 @@ def Main(): Train(args, run_opts) except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) common_train_lib.SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 794fac465bb..e845154dfdd 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -1,14 +1,11 @@ #!/usr/bin/env python - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. - # this script is based on steps/nnet3/lstm/train.sh - import os import subprocess import argparse @@ -19,9 +16,13 @@ import traceback import shutil -common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') +sys.path.append("steps/libs") + +import common as common_lib +import nnet3.train.common as common_train_lib +import nnet3.train.frame_level_objf as train_lib + nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -49,19 +50,19 @@ def GetArgs(): """, formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler = 'resolve', - parents=[common_train_lib.common_parser]) + parents=[common_train_lib.CommonParser.parser]) # For common options defined in common_train_lib.common_parser, # see steps/nnet3/libs/common_train_lib.py # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', - default = 20, + default=20, help="""Number of output labels in the sequence used to train an LSTM. 
Caution: if you double this you should halve --trainer.samples-per-iter.""") parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default = 40, + default=40, help="""Number of left steps used in the estimation of LSTM state before prediction of the first label""") @@ -75,18 +76,18 @@ def GetArgs(): # Parameters for the optimization parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.5, + default=0.5, help="""Momentum used in update computation. Note: we implemented it in such a way that it doesn't increase the effective learning rate.""") parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', - default = 0.99, + default=0.99, help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', - default = 0.15, + default=0.15, help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', - default = 256, + default=256, help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") # RNN specific trainer options @@ -98,13 +99,13 @@ def GetArgs(): help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) # General options - parser.add_argument("--feat-dir", type=str, required = True, + parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training the neural network.") - parser.add_argument("--lang", type=str, required = True, + parser.add_argument("--lang", type=str, required=True, help="Language directory") - parser.add_argument("--ali-dir", type=str, required = True, + parser.add_argument("--ali-dir", type=str, required=True, help="Directory with alignments used for training the neural network.") - parser.add_argument("--dir", type=str, required = True, + parser.add_argument("--dir", type=str, required=True, help="Directory to store the models and all other files.") print(' '.join(sys.argv)) @@ -137,7 +138,7 @@ def ProcessArgs(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() if args.use_gpu: - if not common_train_lib.CheckIfCudaCompiled(): + if not common_lib.CheckIfCudaCompiled(): logger.warning(""" You are running with one thread but you have not compiled for CUDA. You may be running a setup optimized for GPUs. If you have @@ -171,14 +172,14 @@ def Train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. 
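The shrink-value/shrink-threshold options above work together roughly as follows (hypothetical numbers; the real DoShrinkage inspects the SigmoidComponent derivative averages stored in the model): when the average sigmoid derivative drops below the threshold, the parameters of the just-trained model are scaled by shrink_value, otherwise left untouched.

shrink_value, shrink_threshold = 0.99, 0.15    # the defaults above

def choose_shrinkage(mean_sigmoid_deriv):
    # stand-in for common_train_lib.DoShrinkage(iter, model_file,
    # "SigmoidComponent", shrink_threshold)
    return shrink_value if mean_sigmoid_deriv < shrink_threshold else 1.0

print(choose_shrinkage(0.12))   # 0.99 -> parameters are shrunk this iteration
print(choose_shrinkage(0.20))   # 1.0  -> no shrinkage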
- num_leaves = common_train_lib.GetNumberOfLeaves(args.ali_dir) - num_jobs = common_train_lib.GetNumberOfJobs(args.ali_dir) - feat_dim = common_train_lib.GetFeatDim(args.feat_dir) - ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) + num_leaves = common_lib.GetNumberOfLeavesFromTree(args.ali_dir) + num_jobs = common_lib.GetNumberOfJobs(args.ali_dir) + feat_dim = common_lib.GetFeatDim(args.feat_dir) + ivector_dim = common_lib.GetIvectorDim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - common_train_lib.SplitData(args.feat_dir, num_jobs) + common_lib.SplitData(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) f = open('{0}/num_jobs'.format(args.dir), 'w') f.write(str(num_jobs)) @@ -190,7 +191,6 @@ def Train(args, run_opts): variables = common_train_lib.ParseGenericConfigVarsFile(var_file) # Set some variables. - try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] @@ -209,28 +209,29 @@ def Train(args, run_opts): if (args.stage <= -5): logger.info("Initializing a basic network for estimating preconditioning matrix") - common_train_lib.RunKaldiCommand(""" + common_lib.RunKaldiCommand(""" {command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command = run_opts.command, - dir = args.dir)) + """.format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") - train_lib.GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, - left_context, right_context, - args.chunk_width + left_context, - args.chunk_width + right_context, run_opts, - frames_per_eg = args.chunk_width, - srand = args.srand, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - samples_per_iter = args.samples_per_iter, - transform_dir = args.transform_dir, - stage = args.egs_stage) + train_lib.acoustic_model.GenerateEgs( + args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg=args.chunk_width, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir @@ -253,18 +254,19 @@ def Train(args, run_opts): if (args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + train_lib.common.ComputePreconditioningMatrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -2): logger.info("Computing initial vector for FixedScaleComponent before" " softmax, using priors^{prior_scale} and rescaling to" - " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + " average 1".format(prior_scale=args.presoftmax_prior_scale_power)) common_train_lib.ComputePresoftmaxPriorScale( - args.dir, args.ali_dir, num_jobs, run_opts, - presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + args.dir, args.ali_dir, num_jobs, 
run_opts, + presoftmax_prior_scale_power=args.presoftmax_prior_scale_power) if (args.stage <= -1): @@ -280,18 +282,18 @@ def Train(args, run_opts): num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) num_iters_combine = common_train_lib.VerifyIterations( - num_iters, args.num_epochs, - num_hidden_layers, num_archives, - args.max_models_combine, args.add_layers_period, - args.num_jobs_final) + num_iters, args.num_epochs, + num_hidden_layers, num_archives, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) - learning_rate = (lambda iter, current_num_jobs, num_archives_processed: + LearningRate = (lambda iter, current_num_jobs, num_archives_processed: common_train_lib.GetLearningRate( - iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) ) if args.num_bptt_steps is None: @@ -303,6 +305,7 @@ def Train(args, run_opts): logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) @@ -310,36 +313,40 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) - shrinkage_value = (args.shrink_value - if common_train_lib.DoShrinkage(iter, model_file, - "SigmoidComponent", - args.shrink_threshold) - else 1 - ) - logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - - train_lib.TrainOneIteration( - dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - minibatch_size = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts) + model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) + + shrinkage_value = 1.0 + if args.shrink_value != 1.0: + shrinkage_value = (args.shrink_value + if common_train_lib.DoShrinkage( + iter, model_file, "SigmoidComponent", + args.shrink_threshold) + else 1 + ) + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format( + iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + train_lib.common.TrainOneIteration( + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed), + shrinkage_value=shrinkage_value, + 
minibatch_size=args.num_chunk_per_minibatch, + num_hidden_layers=num_hidden_layers, + add_layers_period=args.add_layers_period, + left_context=left_context, + right_context=right_context, + min_deriv_time=min_deriv_time, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + cv_minibatch_size=args.cv_minibatch_size, + run_opts=run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions @@ -353,25 +360,26 @@ def Train(args, run_opts): # lets do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) common_train_lib.SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = args.chunk_width) + train_lib.common.CombineModels(args.dir, num_iters, num_iters_combine, + egs_dir, + run_opts, chunk_width=args.chunk_width) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") avg_post_vec_file = train_lib.ComputeAveragePosterior( - args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) logger.info("Re-adjusting priors based on computed posteriors") - combined_model = "{dir}/combined.mdl".format(dir = args.dir) - final_model = "{dir}/final.mdl".format(dir = args.dir) + combined_model = "{dir}/combined.mdl".format(dir=args.dir) + final_model = "{dir}/final.mdl".format(dir=args.dir) train_lib.AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) @@ -383,16 +391,18 @@ def Train(args, run_opts): # delete it remove_egs = False - common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs) + common_train_lib.CleanNnetDir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_lib.SendMail(report, "Update : Expt {0} : complete".format( + args.dir), args.email) - report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") report_handle.write(report) report_handle.close() @@ -404,8 +414,8 @@ def Main(): Train(args, run_opts) except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - common_train_lib.SendMail(message, message, args.email) + message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) + common_lib.SendMail(message, message, args.email) traceback.print_exc() raise e From 7dd00f4e824f50ba0beb8afc3138423a9b32e1af Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 14 Nov 2016 22:10:26 -0500 Subject: [PATCH 22/71] raw_python_script: First file in PEP8 standards --- 
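# Back-of-the-envelope sketch of the iteration bookkeeping used by the
# training loop in the train_rnn.py driver above: the number of iterations is
# chosen so that, with the number of parallel jobs ramping linearly from
# num_jobs_initial to num_jobs_final, roughly num_epochs passes over the egs
# archives are made; the learning rate passed to each iteration is then scaled
# by the current number of jobs.  All numbers here are invented for
# illustration and are not values from this patch.
num_archives, num_epochs = 120, 4
num_jobs_initial, num_jobs_final = 2, 8

num_archives_to_process = num_archives * num_epochs
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)

num_archives_processed = 0
for it in range(num_iters):
    current_num_jobs = int(0.5 + num_jobs_initial
                           + (num_jobs_final - num_jobs_initial)
                           * float(it) / num_iters)
    # each of the current_num_jobs parallel jobs consumes one archive
    num_archives_processed += current_num_jobs

print(num_iters, num_archives_processed, num_archives_to_process)
# num_archives_processed ends up close to num_archives_to_process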
egs/wsj/s5/steps/libs/common.py | 122 ++-- egs/wsj/s5/steps/libs/nnet3/train/common.py | 729 +++++++++++--------- 2 files changed, 490 insertions(+), 361 deletions(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index cb66bba292d..dcc8d4a1fb6 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -1,4 +1,5 @@ + # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0 @@ -11,29 +12,33 @@ import argparse import logging import os - +import threading +import math logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) -def SendMail(message, subject, email_id): +def send_mail(message, subject, email_id): try: - subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( - message = message, - subject = subject, - email = email_id), shell=True) + subprocess.Popen('echo "{message}"| mail -s "{subject}" {email}'.format( + message=message, + subject=subject, + email=email_id), shell=True) except Exception as e: - logger.info(" Unable to send mail due to error:\n {error}".format(error=str(e))) + logger.info( + " Unable to send mail due to error:\n {error}".format( + error=str(e))) pass -def StrToBool(value): +def str_to_bool(value): if value == "true": return True elif value == "false": @@ -45,11 +50,13 @@ def StrToBool(value): class StrToBoolAction(argparse.Action): """ A custom action to convert bools from shell format i.e., true/false to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): try: - setattr(namespace, self.dest, StrToBool(values)) + setattr(namespace, self.dest, str_to_bool(values)) except ValueError: - raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + raise Exception( + "Unknown value {0} for --{1}".format(values, self.dest)) class NullstrToNoneAction(argparse.Action): @@ -57,11 +64,12 @@ class NullstrToNoneAction(argparse.Action): to None in python. This is necessary as shell scripts print null strings when a variable is not specified. We could use the more apt None in python. 
""" + def __call__(self, parser, namespace, values, option_string=None): - if values.strip() == "": - setattr(namespace, self.dest, None) - else: - setattr(namespace, self.dest, values) + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) def CheckIfCudaCompiled(): @@ -74,6 +82,7 @@ def CheckIfCudaCompiled(): class KaldiCommandException(Exception): + def __init__(self, command, err): Exception.__init__(self, "There was an error while running the command " "{0}\n{1}\n{2}".format(command, "-"*10, err)) @@ -87,6 +96,7 @@ class ListNode(): next_node: A reference to the next object previous_node: A reference to the previous object """ + def __init__(self, data=None, next_node=None, previous_node=None): self.data = data self.next_node = next_node @@ -94,6 +104,7 @@ def __init__(self, data=None, next_node=None, previous_node=None): class LinkedListIterator(): + def __init__(self, node): self.__current = node @@ -111,6 +122,7 @@ def next(self): class LinkedList(): + def __init__(self): self.__head = None self.__tail = None @@ -179,12 +191,12 @@ def Poll(self): for n in self.__process_queue: if self.IsProcessDone(n.data): self.EnsureProcessIsDone(n.data) - threading.Timer(self.__polling_time, Poll).start() + threading.Timer(self.__polling_time, self.Poll).start() def AddProcess(self, t): """ Add a (process handle, command) tuple to the queue """ - self.__process_queue.Push(Node(data=t)) + self.__process_queue.Push(ListNode(data=t)) def IsProcessDone(self, t): p, command = t @@ -215,14 +227,14 @@ class that is instantiated by the top-level script. If this is background_process_handler is provided, this option will be ignored and the process will be run in the background. """ - #logger.info("Running the command\n{0}".format(command)) + # logger.info("Running the command\n{0}".format(command)) p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if background_process_handler is not None: wait = False - background_process_handler.AddProcess((p,command)) + background_process_handler.AddProcess((p, command)) if wait: [stdout, stderr] = p.communicate() @@ -234,7 +246,8 @@ class that is instantiated by the top-level script. 
If this is def GetNumberOfLeavesFromTree(alidir): - [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) + [stdout, stderr] = RunKaldiCommand( + "tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) parts = stdout.split() assert(parts[0] == "num-pdfs") num_leaves = int(parts[1]) @@ -244,9 +257,10 @@ def GetNumberOfLeavesFromTree(alidir): def GetNumberOfLeavesFromModel(dir): - [stdout, stderr] = RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + [stdout, stderr] = RunKaldiCommand( + "am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) parts = stdout.split() - #number of pdfs 7115 + # number of pdfs 7115 assert(' '.join(parts[0:3]) == "number of pdfs") num_leaves = int(parts[3]) if num_leaves == 0: @@ -256,32 +270,49 @@ def GetNumberOfLeavesFromModel(dir): def GetNumberOfJobs(alidir): try: - num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the number of alignment jobs') + num_jobs = int( + open( + '{0}/num_jobs'.format(alidir), + 'r').readline().strip()) + except (IOError, ValueError) as e: + raise Exception( + 'Exception while reading the number of alignment jobs: {0}'.format( + e.str())) return num_jobs def GetIvectorDim(ivector_dir=None): if ivector_dir is None: return 0 - [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{dir}/ivector_online.scp -".format(dir=ivector_dir)) + [stdout_val, stderr_val] = RunKaldiCommand( + "feat-to-dim --print-args=false " + "scp:{dir}/ivector_online.scp -".format(dir=ivector_dir)) ivector_dim = int(stdout_val) return ivector_dim def GetFeatDim(feat_dir): - [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{data}/feats.scp -".format(data=feat_dir)) + [stdout_val, stderr_val] = RunKaldiCommand( + "feat-to-dim --print-args=false " + "scp:{data}/feats.scp -".format(data=feat_dir)) feat_dim = int(stdout_val) return feat_dim -def GetFeatDimFromScp(feat_scp): - [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{feat_scp} -".format(feat_scp = feat_scp)) +def get_feat_dim_from_scp(feat_scp): + [stdout_val, stderr_val] = RunKaldiCommand( + "feat-to-dim --print-args=false " + "scp:{feat_scp} -".format(feat_scp=feat_scp)) feat_dim = int(stdout_val) return feat_dim +def split_data(data, num_jobs): + RunKaldiCommand("utils/split_data.sh {data} {num_jobs}".format( + data=data, + num_jobs=num_jobs)) + + def ReadKaldiMatrix(matrix_file): try: lines = map(lambda x: x.split(), open(matrix_file).readlines()) @@ -290,12 +321,15 @@ def ReadKaldiMatrix(matrix_file): lines[0] = lines[0][1:] lines[-1] = lines[-1][:-1] if not (first_field == "[" and last_field == "]"): - raise Exception("Kaldi matrix file has incorrect format, only text format matrix files can be read by this script") + raise Exception( + "Kaldi matrix file has incorrect format, " + "only text format matrix files can be read by this script") for i in range(len(lines)): lines[i] = map(lambda x: int(float(x)), lines[i]) return lines except IOError: - raise Exception("Error while reading the kaldi matrix file {0}".format(matrix_file)) + raise Exception( + "Error while reading the kaldi matrix file {0}".format(matrix_file)) def WriteKaldiMatrix(output_file, matrix): @@ -309,7 +343,8 @@ def WriteKaldiMatrix(output_file, matrix): for row_index in range(len(matrix)): if num_cols != len(matrix[row_index]): - raise Exception("All the rows of 
a matrix are expected to have the same length") + raise Exception( + "All the rows of a matrix are expected to have the same length") file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) if row_index != num_rows - 1: file.write("\n") @@ -318,9 +353,10 @@ def WriteKaldiMatrix(output_file, matrix): def ForceSymlink(file1, file2): + import errno try: os.symlink(file1, file2) - except OSError, e: + except OSError as e: if e.errno == errno.EEXIST: os.remove(file2) os.symlink(file1, file2) @@ -329,7 +365,7 @@ def ForceSymlink(file1, file2): def ComputeLifterCoeffs(lifter, dim): coeffs = [0] * dim for i in range(0, dim): - coeffs[i] = 1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter)); + coeffs[i] = 1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter)) return coeffs @@ -337,20 +373,21 @@ def ComputeLifterCoeffs(lifter, dim): def ComputeIdctMatrix(K, N, cepstral_lifter=0): matrix = [[0] * K for i in range(N)] # normalizer for X_0 - normalizer = math.sqrt(1.0 / float(N)); + normalizer = math.sqrt(1.0 / float(N)) for j in range(0, N): - matrix[j][0] = normalizer; + matrix[j][0] = normalizer # normalizer for other elements - normalizer = math.sqrt(2.0 / float(N)); + normalizer = math.sqrt(2.0 / float(N)) for k in range(1, K): - for n in range(0, N): - matrix[n][k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k); + for n in range(0, N): + matrix[n][ + k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k) if cepstral_lifter != 0: lifter_coeffs = ComputeLifterCoeffs(cepstral_lifter, K) for k in range(0, K): - for n in range(0, N): - matrix[n][k] = matrix[n][k] / lifter_coeffs[k]; + for n in range(0, N): + matrix[n][k] = matrix[n][k] / lifter_coeffs[k] return matrix @@ -358,7 +395,8 @@ def ComputeIdctMatrix(K, N, cepstral_lifter=0): def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): # generate the IDCT matrix and write to the file idct_matrix = ComputeIdctMatrix(feat_dim, feat_dim, cepstral_lifter) - # append a zero column to the matrix, this is the bias of the fixed affine component + # append a zero column to the matrix, this is the bias of the fixed affine + # component for k in range(0, feat_dim): idct_matrix[k].append(0) WriteKaldiMatrix(file_path, idct_matrix) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 87ae4cad02f..fb8eb3e8cd7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -8,13 +8,14 @@ nnet3 neural networks. 
""" -import sys import logging import math import re import time -import argparse import shutil +import glob +import os +import argparse import common as common_lib @@ -22,7 +23,8 @@ logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) @@ -44,10 +46,13 @@ def __init__(self): self.parallel_train_opts = None -def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): +def get_successful_models(num_models, log_file_pattern, + difference_threshold=1.0): assert(num_models > 0) - parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames") + parse_regex = re.compile( + "LOG .* Overall average objective function for " + "'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames") objf = [] for i in range(num_models): model_num = i + 1 @@ -60,8 +65,8 @@ def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): mat_obj = parse_regex.search(lines[-1*line_num]) if mat_obj is not None: this_objf = float(mat_obj.groups()[0]) - break; - objf.append(this_objf); + break + objf.append(this_objf) max_index = objf.index(max(objf)) accepted_models = [] for i in range(num_models): @@ -69,88 +74,86 @@ def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): accepted_models.append(i+1) if len(accepted_models) != num_models: - logger.warn("""Only {0}/{1} of the models have been accepted -for averaging, based on log files {2}.""".format(len(accepted_models), - num_models, log_file_pattern)) + logger.warn("Only {0}/{1} of the models have been accepted " + "for averaging, based on log files {2}.".format( + len(accepted_models), + num_models, log_file_pattern)) return [accepted_models, max_index+1] -def GetAverageNnetModel(dir, iter, nnets_list, run_opts, - get_raw_nnet_from_am=True, shrink=None): +def get_average_nnet_model(dir, iter, nnets_list, run_opts, + get_raw_nnet_from_am=True, shrink=None): scale = 1.0 if shrink is not None: scale = shrink next_iter = iter + 1 if get_raw_nnet_from_am: - out_model = """- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ -{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir=dir, iter=iter, - next_iter=next_iter, - scale=scale) + out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ + {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format( + dir=dir, iter=iter, + next_iter=next_iter, + scale=scale)) else: if shrink is not None: out_model = """- \| nnet3-copy --scale={scale} \ -- {dir}/{next_iter}.raw""".format(dir=dir, next_iter=next_iter, scale=scale) + - {dir}/{next_iter}.raw""".format( + dir=dir, next_iter=next_iter, scale=scale) else: out_model = "{dir}/{next_iter}.raw".format(dir=dir, next_iter=next_iter) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnets_list} \ -{out_model}""".format(command=run_opts.command, - dir=dir, - iter=iter, - nnets_list=nnets_list, - out_model=out_model)) + common_lib.run_kaldi_command( + """{command} {dir}/log/average.{iter}.log \ + nnet3-average {nnets_list} \ + {out_model}""".format(command=run_opts.command, + dir=dir, + iter=iter, + nnets_list=nnets_list, + out_model=out_model)) -def GetBestNnetModel(dir, iter, 
best_model_index, run_opts, - get_raw_nnet_from_am=True, shrink=None): +def get_best_nnet_model(dir, iter, best_model_index, run_opts, + get_raw_nnet_from_am=True, shrink=None): scale = 1.0 if shrink is not None: scale = shrink - best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format( + best_model = "{dir}/{next_iter}.{best_model_index}.raw".format( dir=dir, next_iter=iter + 1, best_model_index=best_model_index) if get_raw_nnet_from_am: - out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ -{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir=dir, iter=iter, - next_iter=iter + 1) + out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- \ + {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format( + dir=dir, iter=iter, next_iter=iter + 1)) else: - out_model = '{dir}/{next_iter}.raw'.format(dir=dir, + out_model = "{dir}/{next_iter}.raw".format(dir=dir, next_iter=iter + 1) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ -nnet3-copy --scale={scale} {best_model} \ -{out_model}""".format(command=run_opts.command, - dir=dir, iter=iter, - best_model=best_model, - out_model=out_model, scale=scale)) + common_lib.run_kaldi_command( + """{command} {dir}/log/select.{iter}.log \ + nnet3-copy --scale={scale} {best_model} \ + {out_model}""".format(command=run_opts.command, + dir=dir, iter=iter, + best_model=best_model, + out_model=out_model, scale=scale)) -def CopyEgsPropertiesToExpDir(egs_dir, dir): +def copy_egs_properties_to_exp_dir(egs_dir, dir): try: for file in ['cmvn_opts', 'splice_opts', 'final.mat']: file_name = '{dir}/{file}'.format(dir=egs_dir, file=file) if os.path.isfile(file_name): shutil.copy2(file_name, dir) except IOError: - raise Exception("Error while trying to copy egs property files to {dir}".format(dir=dir)) - + raise Exception("Error while trying to copy egs " + "property files to {dir}".format(dir=dir)) -def SplitData(data, num_jobs): - common_lib.RunKaldiCommand( - "utils/split_data.sh {data} {num_jobs}".format(data=data, - num_jobs=num_jobs)) - -def ParseGenericConfigVarsFile(var_file): +def parse_generic_config_vars_file(var_file): variables = {} try: var_file_handle = open(var_file, 'r') @@ -174,88 +177,106 @@ def ParseGenericConfigVarsFile(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) -def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): +def verify_egs_dir(egs_dir, feat_dim, ivector_dim, + left_context, right_context): try: - egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) - egs_ivector_dim = int(open('{0}/info/ivector_dim'.format(egs_dir)).readline()) - egs_left_context = int(open('{0}/info/left_context'.format(egs_dir)).readline()) - egs_right_context = int(open('{0}/info/right_context'.format(egs_dir)).readline()) + egs_feat_dim = int(open('{0}/info/feat_dim'.format( + egs_dir)).readline()) + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format( + egs_dir)).readline()) + egs_left_context = int(open('{0}/info/left_context'.format( + egs_dir)).readline()) + egs_right_context = int(open('{0}/info/right_context'.format( + egs_dir)).readline()) if (feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): - raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') + raise Exception("There is mismatch between featdim/ivector_dim of " + "the current experiment and the provided " + "egs directory") - if (egs_left_context < left_context) or (egs_right_context < right_context): + if (egs_left_context < 
left_context or + egs_right_context < right_context): raise Exception('The egs have insufficient context') - frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) - num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) + frames_per_eg = int(open('{0}/info/frames_per_eg'.format( + egs_dir)).readline()) + num_archives = int(open('{0}/info/num_archives'.format( + egs_dir)).readline()) - return [egs_left_context, egs_right_context, frames_per_eg, num_archives] - except IOError, ValueError: - raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir)) + return [egs_left_context, egs_right_context, + frames_per_eg, num_archives] + except (IOError, ValueError) as e: + raise Exception("The egs dir {0} has missing or " + "malformed files: {1}".format(egs_dir, e.str())) -def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, - presoftmax_prior_scale_power=-0.25): +def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts, + presoftmax_prior_scale_power=-0.25): # getting the raw pdf count - common_lib.RunKaldiCommand(""" -{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ -ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ -post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- {dir}/pdf_counts.JOB - """.format(command = run_opts.command, - num_jobs=num_jobs, - dir=dir, - alidir=alidir)) - - common_lib.RunKaldiCommand(""" -{command} {dir}/log/sum_pdf_counts.log \ -vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts - """.format(command=run_opts.command, dir=dir)) - - import glob + common_lib.run_kaldi_command( + """{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ + ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ + post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- \ + {dir}/pdf_counts.JOB""".format(command=run_opts.command, + num_jobs=num_jobs, + dir=dir, + alidir=alidir)) + + common_lib.run_kaldi_command( + """{command} {dir}/log/sum_pdf_counts.log \ + vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts \ + """.format(command=run_opts.command, dir=dir)) + for file in glob.glob('{0}/pdf_counts.*'.format(dir)): os.remove(file) - pdf_counts = common_lib.ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] - scaled_counts = SmoothPresoftmaxPriorScaleVector( + pdf_counts = common_lib.read_kaldi_matrix('{0}/pdf_counts'.format(dir))[0] + scaled_counts = smooth_presoftmax_prior_scale_vector( pdf_counts, presoftmax_prior_scale_power=presoftmax_prior_scale_power, smooth=0.01) output_file = "{0}/presoftmax_prior_scale.vec".format(dir) - common_lib.WriteKaldiMatrix(output_file, [scaled_counts]) - common_lib.ForceSymlink("../presoftmax_prior_scale.vec", - "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + common_lib.write_kaldi_matrix(output_file, [scaled_counts]) + common_lib.force_symlink("../presoftmax_prior_scale.vec", + "{0}/configs/presoftmax_prior_scale.vec".format( + dir)) -def SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power=-0.25, smooth=0.01): +def smooth_presoftmax_prior_scale_vector(pdf_counts, + presoftmax_prior_scale_power=-0.25, + smooth=0.01): total = sum(pdf_counts) average_count = total/len(pdf_counts) scales = [] for i in range(len(pdf_counts)): - scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) + scales.append(math.pow(pdf_counts[i] + smooth * average_count, + presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), 
scales) return scaled_counts -def PrepareInitialNetwork(dir, run_opts): - common_lib.RunKaldiCommand(""" -{command} {dir}/log/add_first_layer.log \ -nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config \ -{dir}/0.raw""".format(command=run_opts.command, - dir=dir)) +def prepare_initial_network(dir, run_opts): + common_lib.run_kaldi_command( + """{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw \ + {dir}/configs/layer1.config {dir}/0.raw""".format( + command=run_opts.command, + dir=dir)) -def VerifyIterations(num_iters, num_epochs, num_hidden_layers, - num_archives, max_models_combine, - add_layers_period, num_jobs_final): +def verify_iterations(num_iters, num_epochs, num_hidden_layers, + num_archives, max_models_combine, + add_layers_period, num_jobs_final): """ Verifies that number of iterations are sufficient for various phases of training.""" finish_add_layers_iter = num_hidden_layers * add_layers_period if num_iters <= (finish_add_layers_iter + 2): - raise Exception(' There are insufficient number of epochs. These are not even sufficient for layer-wise discriminatory training.') + raise Exception("There are insufficient number of epochs. " + "These are not even sufficient for " + "layer-wise discriminatory training.") approx_iters_per_epoch_final = num_archives/num_jobs_final # First work out how many iterations we want to combine over in the final @@ -288,8 +309,8 @@ def VerifyIterations(num_iters, num_epochs, num_hidden_layers, return models_to_combine -def GetRealignIters(realign_times, num_iters, - num_jobs_initial, num_jobs_final): +def get_realign_iters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): """ Takes the realign_times string and identifies the approximate iterations at which realignments have to be done. 
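# Worked sketch of the presoftmax prior-scale smoothing implemented in
# smooth_presoftmax_prior_scale_vector() above: each pdf count is smoothed
# towards the average count, raised to presoftmax_prior_scale_power, and the
# resulting scales are renormalized so that they average 1.  The toy counts
# below are made up and do not come from any real alignment.
import math

pdf_counts = [1000.0, 200.0, 50.0, 5.0]
power, smooth = -0.25, 0.01

average_count = sum(pdf_counts) / len(pdf_counts)
scales = [math.pow(c + smooth * average_count, power) for c in pdf_counts]
scaled = [s * len(pdf_counts) / sum(scales) for s in scales]

print([round(s, 3) for s in scaled])        # rare pdfs get scales > 1, frequent ones < 1
print(round(sum(scaled) / len(scaled), 6))  # -> 1.0, the average is 1 by construction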
@@ -303,8 +324,10 @@ def GetRealignIters(realign_times, num_iters, if num_jobs_initial == num_jobs_final: realign_iter = int(0.5 + num_iters * realign_time) else: - realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2) - + realign_time * math.pow(num_jobs_final, 2)) + realign_iter = math.sqrt((1 - realign_time) + * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, + 2)) realign_iter = realign_iter - num_jobs_initial realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) realign_iter = realign_iter * num_iters @@ -313,91 +336,105 @@ def GetRealignIters(realign_times, num_iters, return realign_iters -def Align(dir, data, lang, run_opts, iter=None, transform_dir=None, +def align(dir, data, lang, run_opts, iter=None, transform_dir=None, online_ivector_dir=None): - alidir = '{dir}/ali{ali_suffix}'.format(dir=dir, - ali_suffix="_iter_{0}".format(iter) if iter is not None else "") + alidir = '{dir}/ali{ali_suffix}'.format( + dir=dir, + ali_suffix="_iter_{0}".format(iter) if iter is not None else "") logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( gpu=" using gpu " if run_opts.realign_use_gpu else " ", - num_jobs=run_opts.realign_num_jobs )) - common_lib.RunKaldiCommand(""" -steps/nnet3/align.sh --nj {num_jobs_align} --cmd "{align_cmd} {align_queue_opt}" \ - --use-gpu {align_use_gpu} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{online_ivector_dir}" \ - --iter "{iter}" {data} {lang} {dir} {alidir} - """.format(dir=dir, align_use_gpu="yes" if run_opts.realign_use_gpu else "no", - align_cmd=run_opts.realign_command, - align_queue_opt=run_opts.realign_queue_opt, - num_jobs_align=run_opts.realign_num_jobs, - transform_dir=transform_dir if transform_dir is not None else "", - online_ivector_dir=online_ivector_dir if online_ivector_dir is not None else "", - iter=iter if iter is not None else "", - alidir=alidir, - lang=lang, data=data)) + num_jobs=run_opts.realign_num_jobs)) + common_lib.run_kaldi_command( + """steps/nnet3/align.sh --nj {num_jobs_align} \ + --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir}""".format( + dir=dir, align_use_gpu=("yes" + if run_opts.realign_use_gpu + else "no"), + align_cmd=run_opts.realign_command, + align_queue_opt=run_opts.realign_queue_opt, + num_jobs_align=run_opts.realign_num_jobs, + transform_dir=(transform_dir + if transform_dir is not None + else ""), + online_ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ""), + iter=iter if iter is not None else "", + alidir=alidir, + lang=lang, data=data)) return alidir -def Realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, +def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, prior_subset_size, num_archives, run_opts, transform_dir=None, online_ivector_dir=None): raise Exception("Realignment stage has not been implemented in nnet3") - logger.info("Getting average posterior for purposes of adjusting the priors.") + logger.info("Getting average posterior for purposes of adjusting " + "the priors.") # Note: this just uses CPUs, using a smallish subset of data. # always use the first egs archive, which makes the script simpler; # we're using different random subsets of it. 
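# Numerical check of the realignment-iteration interpolation used in
# get_realign_iters() above: with the job count ramping linearly from
# num_jobs_initial to num_jobs_final, the iteration at which a fraction t of
# the data has been processed is
#   num_iters * (sqrt((1-t)*Ni^2 + t*Nf^2) - Ni) / (Nf - Ni).
# The constants below are made-up assumptions for illustration only.
import math

num_iters, num_jobs_initial, num_jobs_final = 100, 2, 8

def realign_iter(t):
    r = math.sqrt((1 - t) * num_jobs_initial ** 2 + t * num_jobs_final ** 2)
    return (r - num_jobs_initial) / (num_jobs_final - num_jobs_initial) * num_iters

def fraction_processed(x):
    # fraction of all archives processed by iteration x under the linear ramp
    jobs_at_x = (num_jobs_initial
                 + (num_jobs_final - num_jobs_initial) * x / num_iters)
    total = (num_jobs_initial + num_jobs_final) / 2.0 * num_iters
    return (num_jobs_initial + jobs_at_x) / 2.0 * x / total

for t in [0.25, 0.5, 0.75]:
    x = realign_iter(t)
    print(t, round(x, 1), round(fraction_processed(x), 3))  # recovers ~t each time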
- avg_post_vec_file = ComputeAveragePosterior( + avg_post_vec_file = compute_average_posterior( dir, iter, prev_egs_dir, num_archives, prior_subset_size, run_opts) avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) logger.info("Re-adjusting priors based on computed posteriors") model = '{0}/{1}.mdl'.format(dir, iter) - AdjustAmPriors(dir, model, avg_post_vec_file, model, run_opts) + adjust_am_priors(dir, model, avg_post_vec_file, model, run_opts) - alidir = Align(dir, feat_dir, lang, run_opts, iter, + alidir = align(dir, feat_dir, lang, run_opts, iter, transform_dir, online_ivector_dir) - common_lib.RunKaldiCommand(""" -steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} {alidir} \ - {prev_egs_dir} {cur_egs_dir}""".format( - command=run_opts.command, - iter=iter, - dir=dir, - alidir=alidir, - prev_egs_dir=prev_egs_dir, - cur_egs_dir=cur_egs_dir)) - - -def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, - num_archives_to_process, - initial_effective_lrate, final_effective_lrate): + common_lib.run_kaldi_command( + """steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} \ + {alidir} {prev_egs_dir} {cur_egs_dir}""".format( + command=run_opts.command, + iter=iter, + dir=dir, + alidir=alidir, + prev_egs_dir=prev_egs_dir, + cur_egs_dir=cur_egs_dir)) + + +def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, + num_archives_to_process, + initial_effective_lrate, final_effective_lrate): if iter + 1 >= num_iters: effective_learning_rate = final_effective_lrate else: effective_learning_rate = ( - initial_effective_lrate * math.exp(num_archives_processed * - math.log(final_effective_lrate/ initial_effective_lrate)/num_archives_to_process)) + initial_effective_lrate + * math.exp(num_archives_processed + * math.log(final_effective_lrate + / initial_effective_lrate) + / num_archives_to_process)) return num_jobs * effective_learning_rate -def DoShrinkage(iter, model_file, non_linearity, shrink_threshold, - get_raw_nnet_from_am=True): +def do_shrinkage(iter, model_file, non_linearity, shrink_threshold, + get_raw_nnet_from_am=True): if iter == 0: return True try: if get_raw_nnet_from_am: - output, error = common_lib.RunKaldiCommand( - "nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format( + output, error = common_lib.run_kaldi_command( + "nnet3-am-info --print-args=false {model_file} | " + "grep {non_linearity}".format( non_linearity=non_linearity, model_file=model_file)) else: - output, error = common_lib.RunKaldiCommand( - "nnet3-info --print-args=false {model_file} | grep {non_linearity}".format( + output, error = common_lib.run_kaldi_command( + "nnet3-info --print-args=false {model_file} | " + "grep {non_linearity}".format( non_linearity=non_linearity, model_file=model_file)) output = output.strip().split("\n") # eg. 
@@ -409,7 +446,8 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold, for line in output: mat_obj = mean_pattern.search(line) if mat_obj is None: - raise Exception("Something went wrong, unable to find deriv-avg in the line \n{0}".format(line)) + raise Exception("Something went wrong, unable to find " + "deriv-avg in the line \n{0}".format(line)) mean_deriv = float(mat_obj.groups()[0]) total_mean_deriv += mean_deriv num_derivs += 1 @@ -421,13 +459,12 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold, return False -def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts, - get_raw_nnet_from_am=True): +def compute_average_posterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts, + get_raw_nnet_from_am=True): """ Computes the average posterior of the network Note: this just uses CPUs, using a smallish subset of data. """ - import glob for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): os.remove(file) @@ -437,29 +474,33 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, egs_part = 'JOB' if get_raw_nnet_from_am: - model = "nnet3-am-copy --raw=true {dir}/combined.mdl -|".format(dir=dir) + model = "nnet3-am-copy --raw=true {0}/combined.mdl -|".format(dir) else: model = "{dir}/final.raw".format(dir=dir) - common_lib.RunKaldiCommand(""" -{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ - nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ - nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "{model}" ark:- ark:- \| \ -matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec - """.format(command=run_opts.command, - dir=dir, model=model, - num_jobs_compute_prior=run_opts.num_jobs_compute_prior, - prior_queue_opt=run_opts.prior_queue_opt, - iter=iter, prior_subset_size=prior_subset_size, - egs_dir=egs_dir, egs_part=egs_part, - prior_gpu_opt=run_opts.prior_gpu_opt)) + common_lib.run_kaldi_command( + """{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} \ + {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} \ + ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true \ + --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "{model}" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- \ + {dir}/post.{iter}.JOB.vec""".format( + command=run_opts.command, + dir=dir, model=model, + num_jobs_compute_prior=run_opts.num_jobs_compute_prior, + prior_queue_opt=run_opts.prior_queue_opt, + iter=iter, prior_subset_size=prior_subset_size, + egs_dir=egs_dir, egs_part=egs_part, + prior_gpu_opt=run_opts.prior_gpu_opt)) # make sure there is time for $dir/post.{iter}.*.vec to appear. 
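# Sketch of the learning-rate schedule computed by get_learning_rate() earlier
# in this file: the effective rate decays exponentially from the initial to
# the final value as archives are processed, and the value handed to each job
# is the effective rate multiplied by the current number of jobs.  The real
# function also pins the rate to the final value on the last iterations; the
# numbers below are invented for illustration.
import math

initial_effective_lrate, final_effective_lrate = 0.0003, 0.00003
num_archives_to_process = 2000

def learning_rate(num_jobs, num_archives_processed):
    effective = initial_effective_lrate * math.exp(
        num_archives_processed
        * math.log(final_effective_lrate / initial_effective_lrate)
        / num_archives_to_process)
    return num_jobs * effective

print(learning_rate(2, 0))     # 2 * 0.0003 at the start of training
print(learning_rate(8, 2000))  # 8 * 0.00003 once all archives are processed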
time.sleep(5) avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) - common_lib.RunKaldiCommand(""" + common_lib.run_kaldi_command(""" {command} {dir}/log/vector_sum.{iter}.log \ vector-sum {dir}/post.{iter}.*.vec {output_file} """.format(command=run_opts.command, @@ -470,40 +511,43 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, return avg_post_vec_file -def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): - common_lib.RunKaldiCommand(""" -{command} {dir}/log/adjust_priors.final.log \ - nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} "{output_model}" - """.format(command=run_opts.command, - dir=dir, input_model=input_model, - avg_posterior_vector=avg_posterior_vector, - output_model=output_model)) +def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, + run_opts): + common_lib.run_kaldi_command( + """{command} {dir}/log/adjust_priors.final.log \ + nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} \ + "{output_model}" """.format( + command=run_opts.command, + dir=dir, input_model=input_model, + avg_posterior_vector=avg_posterior_vector, + output_model=output_model)) -def RemoveEgs(egs_dir): - common_lib.RunKaldiCommand("steps/nnet2/remove_egs.sh {egs_dir}".format(egs_dir=egs_dir)) +def remove_egs(egs_dir): + common_lib.run_kaldi_command("steps/nnet2/remove_egs.sh {egs_dir}".format( + egs_dir=egs_dir)) -def CleanNnetDir(nnet_dir, num_iters, egs_dir, - preserve_model_interval=100, - remove_egs=True, - get_raw_nnet_from_am=True): +def clean_nnet_dir(nnet_dir, num_iters, egs_dir, + preserve_model_interval=100, + remove_egs=True, + get_raw_nnet_from_am=True): try: if remove_egs: - RemoveEgs(egs_dir) + remove_egs(egs_dir) for iter in range(num_iters): - RemoveModel(nnet_dir, iter, num_iters, None, - preserve_model_interval, - get_raw_nnet_from_am=get_raw_nnet_from_am) + remove_model(nnet_dir, iter, num_iters, None, + preserve_model_interval, + get_raw_nnet_from_am=get_raw_nnet_from_am) except (IOError, OSError) as err: logger.warning("Error while cleaning up the nnet directory") raise err -def RemoveModel(nnet_dir, iter, num_iters, models_to_combine=None, - preserve_model_interval=100, - get_raw_nnet_from_am=True): +def remove_model(nnet_dir, iter, num_iters, models_to_combine=None, + preserve_model_interval=100, + get_raw_nnet_from_am=True): if iter % preserve_model_interval == 0: return if models_to_combine is not None and iter in models_to_combine: @@ -527,162 +571,209 @@ class CommonParser: """ def __init__(self): - self.parser = argparser.ArgumentParser(add_help=False) + self.parser = argparse.ArgumentParser(add_help=False) # feat options - self.parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - self.parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") + self.parser.add_argument("--feat.online-ivector-dir", type=str, + dest='online_ivector_dir', default=None, + action=common_lib.NullstrToNoneAction, + help="""directory with the ivectors extracted + in an online fashion.""") + self.parser.add_argument("--feat.cmvn-opts", type=str, + dest='cmvn_opts', default=None, + action=common_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' " + "and '--norm-vars' values") # egs 
extraction options - self.parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default = 0, - help="Number of additional frames of input to the left" - " of the input chunk. This extra context will be used" - " in the estimation of RNN state before prediction of" - " the first label. In the case of FF-DNN this extra" - " context will be used to allow for frame-shifts") - self.parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', - default = 0, - help="Number of additional frames of input to the right" - " of the input chunk. This extra context will be used" - " in the estimation of bidirectional RNN state before" - " prediction of the first label.") - self.parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + self.parser.add_argument("--egs.chunk-left-context", type=int, + dest='chunk_left_context', default=0, + help="""Number of additional frames of input + to the left of the input chunk. This extra + context will be used in the estimation of RNN + state before prediction of the first label. In + the case of FF-DNN this extra context will be + used to allow for frame-shifts""") + self.parser.add_argument("--egs.chunk-right-context", type=int, + dest='chunk_right_context', default=0, + help="""Number of additional frames of input + to the right of the input chunk. This extra + context will be used in the estimation of + bidirectional RNN state before prediction of + the first label.""") + self.parser.add_argument("--egs.transform_dir", type=str, + dest='transform_dir', default=None, + action=common_lib.NullstrToNoneAction, + help="String to provide options directly to " + "steps/nnet3/get_egs.sh script") self.parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") + default=None, + action=common_lib.NullstrToNoneAction, + help="""Directory with egs. If specified this + directory will be used rather than extracting + egs""") self.parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") + default=0, + help="Stage at which get_egs.sh should be " + "restarted") self.parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + default=None, + action=common_lib.NullstrToNoneAction, + help="""String to provide options directly + to steps/nnet3/get_egs.sh script""") # trainer options self.parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - self.parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, + default=0, + help="""Sets the random seed for model + initialization and egs shuffling. + Warning: This random seed does not control all + aspects of this experiment. There might be + other random seeds used in other stages of the + experiment like data preparation (e.g. 
volume + perturbation).""") + self.parser.add_argument("--trainer.num-epochs", type=int, + dest='num_epochs', default=8, help="Number of epochs to train the model") - self.parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, + self.parser.add_argument("--trainer.prior-subset-size", type=int, + dest='prior_subset_size', default=20000, help="Number of samples for computing priors") - self.parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - self.parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. " - "These models will themselves be averages of iteration-number ranges") - self.parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. It's not a huge deal though, as samples - are anyway randomized right at the start. - (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - self.parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - self.parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") - self.parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - self.parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - self.parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - self.parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, + self.parser.add_argument("--trainer.num-jobs-compute-prior", type=int, + dest='num_jobs_compute_prior', default=10, + help="The prior computation jobs are single " + "threaded and run on the CPU") + self.parser.add_argument("--trainer.max-models-combine", type=int, + dest='max_models_combine', default=20, + help="""The maximum number of models used in + the final model combination stage. These + models will themselves be averages of + iteration-number ranges""") + self.parser.add_argument("--trainer.shuffle-buffer-size", type=int, + dest='shuffle_buffer_size', default=5000, + help=""" Controls randomization of the samples + on each iteration. If 0 or a large value the + randomization is complete, but this will + consume memory and cause spikes in disk I/O. + Smaller is easier on disk and memory but less + random. It's not a huge deal though, as + samples are anyway randomized right at the + start. 
(the point of this is to get data in + different minibatches on different iterations, + since in the preconditioning method, 2 samples + in the same minibatch can affect each others' + gradients.""") + self.parser.add_argument("--trainer.add-layers-period", type=int, + dest='add_layers_period', default=2, + help="""The number of iterations between + adding layers during layer-wise discriminative + training.""") + self.parser.add_argument("--trainer.max-param-change", type=float, + dest='max_param_change', default=2.0, + help="""The maximum change in parameters + allowed per minibatch, measured in Frobenius + norm over the entire model""") + self.parser.add_argument("--trainer.samples-per-iter", type=int, + dest='samples_per_iter', default=400000, + help="This is really the number of egs in " + "each archive.") + self.parser.add_argument("--trainer.lda.rand-prune", type=float, + dest='rand_prune', default=4.0, + help="Value used in preconditioning " + "matrix estimation") + self.parser.add_argument("--trainer.lda.max-lda-jobs", type=float, + dest='max_lda_jobs', default=10, + help="Max number of jobs used for " + "LDA stats accumulation") + self.parser.add_argument("--trainer.presoftmax-prior-scale-power", + type=float, dest='presoftmax_prior_scale_power', default=-0.25, - help="") - + help="Scale on presofmax prior") # Parameters for the optimization - self.parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - self.parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - self.parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - self.parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - self.parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. 
""") - self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, + self.parser.add_argument( + "--trainer.optimization.initial-effective-lrate", type=float, + dest='initial_effective_lrate', default=0.0003, + help="Learning rate used during the initial iteration") + self.parser.add_argument( + "--trainer.optimization.final-effective-lrate", type=float, + dest='final_effective_lrate', default=0.00003, + help="Learning rate used during the final iteration") + self.parser.add_argument("--trainer.optimization.num-jobs-initial", + type=int, dest='num_jobs_initial', default=1, + help="Number of neural net jobs to run in " + "parallel at the start of training") + self.parser.add_argument("--trainer.optimization.num-jobs-final", + type=int, dest='num_jobs_final', default=8, + help="Number of neural net jobs to run in " + "parallel at the end of training") + self.parser.add_argument("--trainer.optimization.max-models-combine", + type=int, dest='max_models_combine', + default=20, + help="""The is the maximum number of models we + give to the final 'combine' stage, but these + models will themselves be averages of + iteration-number ranges.""") + self.parser.add_argument("--trainer.optimization.momentum", type=float, + dest='momentum', default=0.0, help="""Momentum used in update computation. - Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") + Note: we implemented it in such a way that it + doesn't increase the effective learning + rate.""") + # General options self.parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") + help="Specifies the stage of the experiment " + "to execution from") self.parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - self.parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", + help="If specified, training exits before " + "running this stage") + self.parser.add_argument("--cmd", type=str, dest="command", + action=common_lib.NullstrToNoneAction, help="""Specifies the script to launch jobs. e.g. 
queue.pl for launching on SGE cluster run.pl for launching on local machine - """, default = "queue.pl") - self.parser.add_argument("--egs.cmd", type=str, action = NullstrToNoneAction, - dest = "egs_command", - help="""Script to launch egs jobs""", default = "queue.pl") - self.parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], + """, default="queue.pl") + self.parser.add_argument("--egs.cmd", type=str, dest="egs_command", + action=common_lib.NullstrToNoneAction, + default="queue.pl", + help="Script to launch egs jobs") + self.parser.add_argument("--use-gpu", type=str, + action=common_lib.StrToBoolAction, + choices=["true", "false"], help="Use GPU for training", default=True) - self.parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - self.parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - self.parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + self.parser.add_argument("--cleanup", type=str, + action=common_lib.StrToBoolAction, + choices=["true", "false"], default=True, + help="Clean up models after training") + self.parser.add_argument("--cleanup.remove-egs", type=str, + dest='remove_egs', default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="If true, remove egs after experiment") + self.parser.add_argument("--cleanup.preserve-model-interval", + dest="preserve_model_interval", type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. " - "If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - self.parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - self.parser.add_argument("--reporting.interval", dest = "reporting_interval", + help="""Determines iterations for which models + will be preserved during cleanup. + If mod(iter,preserve_model_interval) == 0 + model will be preserved.""") + + self.parser.add_argument("--reporting.email", dest="email", + type=str, default=None, + action=common_lib.NullstrToNoneAction, + help=""" Email-id to report about the progress + of the experiment. NOTE: It assumes the + machine on which the script is being run can + send emails from command line via. mail + program. The Kaldi mailing list will not + support this feature. It might require local + expertise to setup. """) + self.parser.add_argument("--reporting.interval", + dest="reporting_interval", type=int, default=0.1, - help="Frequency with which reports have to be sent, " - "measured in terms of fraction of iterations. " - "If 0 and reporting mail has been specified then only failure notifications are sent") - + help="""Frequency with which reports have to + be sent, measured in terms of fraction of + iterations. 
+ If 0 and reporting mail has been specified + then only failure notifications are sent""") From f77a3ce736e4c1f6baa71b508f977bc1c11ee40a Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 15 Nov 2016 10:22:05 -0500 Subject: [PATCH 23/71] raw_python_script: Reorganized libraries --- egs/wsj/s5/steps/libs/__init__.py | 8 + egs/wsj/s5/steps/libs/common.py | 154 ++-- egs/wsj/s5/steps/libs/nnet3/__init__.py | 7 + .../s5/steps/libs/nnet3/report/__init__.py | 6 + .../s5/steps/libs/nnet3/report/log_parse.py | 320 ++++++++ .../libs/nnet3/train/chain_objf/__init__.py | 10 + .../nnet3/train/chain_objf/acoustic_model.py | 633 ++++++++++----- egs/wsj/s5/steps/libs/nnet3/train/common.py | 180 +---- .../nnet3/train/frame_level_objf/__init__.py | 2 + .../train/frame_level_objf/acoustic_model.py | 86 +- .../nnet3/train/frame_level_objf/common.py | 667 +++++++++------ .../nnet3/train/frame_level_objf/raw_model.py | 123 +-- egs/wsj/s5/steps/nnet3/chain/train.py | 760 +++++++----------- .../steps/nnet3/report/nnet3_log_parse_lib.py | 231 ------ egs/wsj/s5/steps/nnet3/train_dnn.py | 334 ++++---- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 396 +++++---- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 478 ++++++----- egs/wsj/s5/steps/nnet3/train_rnn.py | 388 +++++---- 18 files changed, 2644 insertions(+), 2139 deletions(-) create mode 100644 egs/wsj/s5/steps/libs/nnet3/report/__init__.py create mode 100755 egs/wsj/s5/steps/libs/nnet3/report/log_parse.py create mode 100644 egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py delete mode 100755 egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py index 37aa01b75c1..2a472386568 100644 --- a/egs/wsj/s5/steps/libs/__init__.py +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -1 +1,9 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +""" This package contains modules and subpackages used in kaldi scripts. +""" + __all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index dcc8d4a1fb6..c966b941ee1 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -8,12 +8,12 @@ commonly used in many kaldi python scripts. """ -import subprocess import argparse import logging +import math import os +import subprocess import threading -import math logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -27,14 +27,14 @@ def send_mail(message, subject, email_id): try: - subprocess.Popen('echo "{message}"| mail -s "{subject}" {email}'.format( - message=message, - subject=subject, - email=email_id), shell=True) + subprocess.Popen( + 'echo "{message}" | mail -s "{subject}" {email}'.format( + message=message, + subject=subject, + email=email_id), shell=True) except Exception as e: - logger.info( - " Unable to send mail due to error:\n {error}".format( - error=str(e))) + logger.info("Unable to send mail due to error:\n {error}".format( + error=str(e))) pass @@ -60,10 +60,9 @@ def __call__(self, parser, namespace, values, option_string=None): class NullstrToNoneAction(argparse.Action): - """ A custom action to convert empty strings passed by shell - to None in python. This is necessary as shell scripts print null strings - when a variable is not specified. We could use the more apt None - in python. """ + """ A custom action to convert empty strings passed by shell to None in + python. This is necessary as shell scripts print null strings when a + variable is not specified. We could use the more apt None in python. 
""" def __call__(self, parser, namespace, values, option_string=None): if values.strip() == "": @@ -72,7 +71,7 @@ def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, values) -def CheckIfCudaCompiled(): +def check_if_cuda_compiled(): p = subprocess.Popen("cuda-compiled") p.communicate() if p.returncode == 1: @@ -84,8 +83,9 @@ def CheckIfCudaCompiled(): class KaldiCommandException(Exception): def __init__(self, command, err): - Exception.__init__(self, "There was an error while running the command " - "{0}\n{1}\n{2}".format(command, "-"*10, err)) + Exception.__init__(self, + "There was an error while running the command " + "{0}\n{1}\n{2}".format(command, "-"*10, err)) class ListNode(): @@ -130,7 +130,7 @@ def __init__(self): def __iter__(self): return LinkedListIterator(self.__head) - def Push(self, node): + def push(self, node): """Pushes the node at the "front" of the linked list """ node.next_node = self.__head @@ -138,7 +138,7 @@ def Push(self, node): self.__head.previous_node = node self.__head = node - def Pop(self): + def pop(self): """Pops the last node out of the list""" old_last_node = self.__tail to_be_last = self.__tail.previous_node @@ -150,7 +150,7 @@ def Pop(self): return old_last_node - def Remove(self, node): + def remove(self, node): """Removes and returns node, and connects the previous and next nicely """ @@ -171,7 +171,7 @@ class BackgroundProcessHandler(): script waits until all the processes end before exiting A top-level script is expected to instantiate an object of this class - and pass it to all calls of RunKaldiCommand that are to be run in the + and pass it to all calls of run_kaldi_command that are to be run in the background. The background processes are queued and these are polled in a parallel thread at set interval to check for failures. The top-level script can ensure at the end ensure that all processes are @@ -185,37 +185,38 @@ class BackgroundProcessHandler(): def __init__(self, polling_time=600): self.__process_queue = LinkedList() self.__polling_time = polling_time - self.Poll() + self.poll() - def Poll(self): + def poll(self): for n in self.__process_queue: - if self.IsProcessDone(n.data): - self.EnsureProcessIsDone(n.data) - threading.Timer(self.__polling_time, self.Poll).start() + if self.is_process_done(n.data): + self.ensure_process_is_done(n.data) + self.__process_queue.remove(n) + threading.Timer(self.__polling_time, self.poll).start() - def AddProcess(self, t): + def add_process(self, t): """ Add a (process handle, command) tuple to the queue """ self.__process_queue.Push(ListNode(data=t)) - def IsProcessDone(self, t): + def is_process_done(self, t): p, command = t if p.poll() is None: return False return True - def EnsureProcessIsDone(self, t): + def ensure_process_is_done(self, t): p, command = t [stdout, stderr] = p.communicate() if p.returncode is not 0: raise KaldiCommandException(command, stderr) - def EnsureProcessesAreDone(self): + def ensure_processes_are_done(self): for n in self.__process_queue: - self.EnsureProcessIsDone(n.data) + self.ensure_process_is_done(n.data) -def RunKaldiCommand(command, wait=True, background_process_handler=None): +def run_kaldi_command(command, wait=True, background_process_handler=None): """ Runs commands frequently seen in Kaldi scripts. These are usually a sequence of commands connected by pipes, so we use shell=True. @@ -234,7 +235,7 @@ class that is instantiated by the top-level script. 
If this is if background_process_handler is not None: wait = False - background_process_handler.AddProcess((p, command)) + background_process_handler.add_process((p, command)) if wait: [stdout, stderr] = p.communicate() @@ -245,8 +246,8 @@ class that is instantiated by the top-level script. If this is return p -def GetNumberOfLeavesFromTree(alidir): - [stdout, stderr] = RunKaldiCommand( +def get_number_of_leaves_from_tree(alidir): + [stdout, stderr] = run_kaldi_command( "tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) parts = stdout.split() assert(parts[0] == "num-pdfs") @@ -256,8 +257,8 @@ def GetNumberOfLeavesFromTree(alidir): return num_leaves -def GetNumberOfLeavesFromModel(dir): - [stdout, stderr] = RunKaldiCommand( +def get_number_of_leaves_from_model(dir): + [stdout, stderr] = run_kaldi_command( "am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) parts = stdout.split() # number of pdfs 7115 @@ -268,31 +269,27 @@ def GetNumberOfLeavesFromModel(dir): return num_leaves -def GetNumberOfJobs(alidir): +def get_number_of_jobs(alidir): try: - num_jobs = int( - open( - '{0}/num_jobs'.format(alidir), - 'r').readline().strip()) + num_jobs = int(open('{0}/num_jobs'.format(alidir)).readline().strip()) except (IOError, ValueError) as e: - raise Exception( - 'Exception while reading the number of alignment jobs: {0}'.format( - e.str())) + raise Exception("Exception while reading the " + "number of alignment jobs: {0}".format(e.str())) return num_jobs -def GetIvectorDim(ivector_dir=None): +def get_ivector_dim(ivector_dir=None): if ivector_dir is None: return 0 - [stdout_val, stderr_val] = RunKaldiCommand( + [stdout_val, stderr_val] = run_kaldi_command( "feat-to-dim --print-args=false " "scp:{dir}/ivector_online.scp -".format(dir=ivector_dir)) ivector_dim = int(stdout_val) return ivector_dim -def GetFeatDim(feat_dir): - [stdout_val, stderr_val] = RunKaldiCommand( +def get_feat_dim(feat_dir): + [stdout_val, stderr_val] = run_kaldi_command( "feat-to-dim --print-args=false " "scp:{data}/feats.scp -".format(data=feat_dir)) feat_dim = int(stdout_val) @@ -300,7 +297,7 @@ def GetFeatDim(feat_dir): def get_feat_dim_from_scp(feat_scp): - [stdout_val, stderr_val] = RunKaldiCommand( + [stdout_val, stderr_val] = run_kaldi_command( "feat-to-dim --print-args=false " "scp:{feat_scp} -".format(feat_scp=feat_scp)) feat_dim = int(stdout_val) @@ -308,12 +305,12 @@ def get_feat_dim_from_scp(feat_scp): def split_data(data, num_jobs): - RunKaldiCommand("utils/split_data.sh {data} {num_jobs}".format( + run_kaldi_command("utils/split_data.sh {data} {num_jobs}".format( data=data, num_jobs=num_jobs)) -def ReadKaldiMatrix(matrix_file): +def read_kaldi_matrix(matrix_file): try: lines = map(lambda x: x.split(), open(matrix_file).readlines()) first_field = lines[0][0] @@ -328,31 +325,30 @@ def ReadKaldiMatrix(matrix_file): lines[i] = map(lambda x: int(float(x)), lines[i]) return lines except IOError: - raise Exception( - "Error while reading the kaldi matrix file {0}".format(matrix_file)) + raise Exception("Error while reading the kaldi matrix file " + "{0}".format(matrix_file)) -def WriteKaldiMatrix(output_file, matrix): +def write_kaldi_matrix(output_file, matrix): # matrix is a list of lists - file = open(output_file, 'w') - file.write("[ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception( - "All the rows of a matrix are expected to have 
the same length") - file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file.write("\n") - file.write(" ]") - file.close() - - -def ForceSymlink(file1, file2): + with open(output_file, 'w') as f: + f.write("[ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + f.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + f.write("\n") + f.write(" ]") + + +def force_symlink(file1, file2): import errno try: os.symlink(file1, file2) @@ -362,7 +358,7 @@ def ForceSymlink(file1, file2): os.symlink(file1, file2) -def ComputeLifterCoeffs(lifter, dim): +def compute_lifter_coeffs(lifter, dim): coeffs = [0] * dim for i in range(0, dim): coeffs[i] = 1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter)) @@ -370,7 +366,7 @@ def ComputeLifterCoeffs(lifter, dim): return coeffs -def ComputeIdctMatrix(K, N, cepstral_lifter=0): +def compute_idct_matrix(K, N, cepstral_lifter=0): matrix = [[0] * K for i in range(N)] # normalizer for X_0 normalizer = math.sqrt(1.0 / float(N)) @@ -384,7 +380,7 @@ def ComputeIdctMatrix(K, N, cepstral_lifter=0): k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k) if cepstral_lifter != 0: - lifter_coeffs = ComputeLifterCoeffs(cepstral_lifter, K) + lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, K) for k in range(0, K): for n in range(0, N): matrix[n][k] = matrix[n][k] / lifter_coeffs[k] @@ -392,11 +388,11 @@ def ComputeIdctMatrix(K, N, cepstral_lifter=0): return matrix -def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): +def write_idct_matrix(feat_dim, cepstral_lifter, file_path): # generate the IDCT matrix and write to the file - idct_matrix = ComputeIdctMatrix(feat_dim, feat_dim, cepstral_lifter) + idct_matrix = compute_idct_matrix(feat_dim, feat_dim, cepstral_lifter) # append a zero column to the matrix, this is the bias of the fixed affine # component for k in range(0, feat_dim): idct_matrix[k].append(0) - WriteKaldiMatrix(file_path, idct_matrix) + write_kaldi_matrix(file_path, idct_matrix) diff --git a/egs/wsj/s5/steps/libs/nnet3/__init__.py b/egs/wsj/s5/steps/libs/nnet3/__init__.py index e69de29bb2d..ff8146520db 100644 --- a/egs/wsj/s5/steps/libs/nnet3/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/__init__.py @@ -0,0 +1,7 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +""" This is package containing modules and subpackages for training +deep neural networks in nnet3 framework""" diff --git a/egs/wsj/s5/steps/libs/nnet3/report/__init__.py b/egs/wsj/s5/steps/libs/nnet3/report/__init__.py new file mode 100644 index 00000000000..2c94aa7e20b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/report/__init__.py @@ -0,0 +1,6 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +__all__ = ["log_parse"] diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py new file mode 100755 index 00000000000..794acd8a8d8 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -0,0 +1,320 @@ + + +# Copyright 2016 Vijayaditya Peddinti +# Vimal Manohar +# Apache 2.0. 
+ +from __future__ import division +import datetime +import re + +import libs.common as common_lib + + +def parse_progress_logs_for_nonlinearity_stats(exp_dir): + """ Parse progress logs for mean and std stats for non-linearities. + + e.g. for a line that is parsed from progress.*.log: + exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i + type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, + value-avg=[percentiles(0,1,2,5 10,20,50,80,90 + 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 + 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], + deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 + 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 + 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] + """ + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + stats_per_component_per_iter = {} + + progress_log_lines = common_lib.run_kaldi_command( + 'grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files))[0] + + parse_regex = re.compile( + ".*progress.([0-9]+).log:component name=(.+) " + "type=(.*)Component,.*" + "value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*" + "deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]") + + for line in progress_log_lines.split("\n"): + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', + # '0.134', '0.0397') + groups = mat_obj.groups() + iteration = int(groups[0]) + component_name = groups[1] + component_type = groups[2] + value_mean = float(groups[3]) + value_stddev = float(groups[4]) + deriv_mean = float(groups[5]) + deriv_stddev = float(groups[6]) + try: + stats_per_component_per_iter[component_name][ + 'stats'][iteration] = [value_mean, value_stddev, + deriv_mean, deriv_stddev] + except KeyError: + stats_per_component_per_iter[component_name] = {} + stats_per_component_per_iter[component_name][ + 'type'] = component_type + stats_per_component_per_iter[component_name]['stats'] = {} + stats_per_component_per_iter[component_name][ + 'stats'][iteration] = [value_mean, value_stddev, + deriv_mean, deriv_stddev] + + return stats_per_component_per_iter + + +def parse_difference_string(string): + dict = {} + for parts in string.split(): + sub_parts = parts.split(":") + dict[sub_parts[0]] = float(sub_parts[1]) + return dict + + +class MalformedClippedProportionLineException(Exception): + def __init__(self, line): + Exception.__init__(self, + "Malformed line encountered while trying to " + "extract clipped-proportions.\n{0}".format(line)) + + +def parse_progress_logs_for_clipped_proportion(exp_dir): + """ Parse progress logs for clipped proportion stats. + + e.g. 
for a line that is parsed from progress.*.log: + exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component + name=BLstm1_forward_c type=ClipGradientComponent, dim=512, + norm-based-clipping=true, clipping-threshold=30, + clipped-proportion=0.000565527, + self-repair-clipped-proportion-threshold=0.01, self-repair-target=0, + self-repair-scale=1 + """ + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + component_names = set([]) + progress_log_lines = common_lib.run_kaldi_command( + 'grep -e "{0}" {1}'.format( + "clipped-proportion", progress_log_files))[0] + parse_regex = re.compile(".*progress\.([0-9]+)\.log:component " + "name=(.*) type=.* " + "clipped-proportion=([0-9\.e\-]+)") + + cp_per_component_per_iter = {} + + max_iteration = 0 + component_names = set([]) + for line in progress_log_lines.split("\n"): + mat_obj = parse_regex.search(line) + if mat_obj is None: + if line.strip() == "": + continue + raise MalformedClippedProportionLineException(line) + groups = mat_obj.groups() + iteration = int(groups[0]) + max_iteration = max(max_iteration, iteration) + name = groups[1] + clipped_proportion = float(groups[2]) + if clipped_proportion > 1: + raise MalformedClippedProportionLineException(line) + if iteration not in cp_per_component_per_iter: + cp_per_component_per_iter[iteration] = {} + cp_per_component_per_iter[iteration][name] = clipped_proportion + component_names.add(name) + component_names = list(component_names) + component_names.sort() + + # re arranging the data into an array + # and into an cp_per_iter_per_component + cp_per_iter_per_component = {} + for component_name in component_names: + cp_per_iter_per_component[component_name] = [] + data = [] + data.append(["iteration"]+component_names) + for iter in range(max_iteration+1): + if iter not in cp_per_component_per_iter: + continue + comp_dict = cp_per_component_per_iter[iter] + row = [iter] + for component in component_names: + try: + row.append(comp_dict[component]) + cp_per_iter_per_component[component].append( + [iter, comp_dict[component]]) + except KeyError: + # if clipped proportion is not available for a particular + # component it is set to None + # this usually happens during layer-wise discriminative + # training + row.append(None) + data.append(row) + + return {'table': data, + 'cp_per_component_per_iter': cp_per_component_per_iter, + 'cp_per_iter_per_component': cp_per_iter_per_component} + + +def parse_progress_logs_for_param_diffP(exp_dir, pattern, logger=None): + """ Parse progress logs for per-component parameter differences. + + e.g. 
for a line that is parsed from progress.*.log: + exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG + (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter + differences per layer are [ Cwrnn1_T3_W_r:0.0171537 + Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 + Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 + Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 + Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 + Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754 + Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] + """ + + if pattern not in set(["Relative parameter differences", + "Parameter differences"]): + raise Exception("Unknown value for pattern : {0}".format(pattern)) + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + progress_per_iter = {} + component_names = set([]) + progress_log_lines = common_lib.run_kaldi_command( + 'grep -e "{0}" {1}'.format(pattern, progress_log_files))[0] + parse_regex = re.compile(".*progress\.([0-9]+)\.log:" + "LOG.*{0}.*\[(.*)\]".format(pattern)) + for line in progress_log_lines.split("\n"): + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + iteration = groups[0] + differences = parse_difference_string(groups[1]) + component_names = component_names.union(differences.keys()) + progress_per_iter[int(iteration)] = differences + + component_names = list(component_names) + component_names.sort() + # rearranging the parameter differences available per iter + # into parameter differences per component + progress_per_component = {} + for cn in component_names: + progress_per_component[cn] = {} + + max_iter = max(progress_per_iter.keys()) + total_missing_iterations = 0 + gave_user_warning = False + for iter in range(max_iter + 1): + try: + component_dict = progress_per_iter[iter] + except KeyError: + continue + + for component_name in component_names: + try: + progress_per_component[component_name][iter] = component_dict[ + component_name] + except KeyError: + total_missing_iterations += 1 + # the component was not found this iteration, may be because of + # layerwise discriminative training + pass + if (total_missing_iterations/len(component_names) > 20 + and not gave_user_warning and logger is not None): + logger.warning("There are more than {0} missing iterations per " + "component. 
Something might be wrong.".format( + total_missing_iterations/len(component_names))) + gave_user_warning = True + + return {'progress_per_component': progress_per_component, + 'component_names': component_names, + 'max_iter': max_iter} + + +def parse_train_logs(exp_dir): + train_log_files = "%s/log/train.*.log" % (exp_dir) + train_log_lines = common_lib.run_kaldi_command( + 'grep -e Accounting {0}'.format(train_log_files))[0] + parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# " + "Accounting: time=([0-9]+) thread.*") + + train_times = {} + for line in train_log_lines.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + try: + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + except KeyError: + train_times[int(groups[0])] = {} + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + iters = train_times.keys() + for iter in iters: + values = train_times[iter].values() + train_times[iter] = max(values) + return train_times + + +def parse_prob_logs(exp_dir, key='accuracy'): + train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) + valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) + train_prob_strings = common_lib.run_kaldi_command( + 'grep -e {0} {1}'.format(key, train_prob_files), wait=True)[0] + valid_prob_strings = common_lib.run_kaldi_command( + 'grep -e {0} {1}'.format(key, valid_prob_files))[0] + + # LOG + # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) + # Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 + # per frame, over 20000 fra + + # LOG + # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) + # Overall log-probability for 'output' is -0.307255 per frame, over 20000 + # frames. + + parse_regex = re.compile( + ".*compute_prob_.*\.([0-9]+).log:LOG " + ".nnet3.*compute-prob:PrintTotalStats..:" + "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " + "'output'.*is ([0-9.\-e]+) .*per frame") + + train_loss = {} + valid_loss = {} + + for line in train_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + train_loss[int(groups[0])] = groups[2] + for line in valid_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + valid_loss[int(groups[0])] = groups[2] + iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) + iters.sort() + return map(lambda x: (int(x), float(train_loss[x]), + float(valid_loss[x])), iters) + + +def generate_accuracy_report(exp_dir, key="accuracy"): + times = parse_train_logs(exp_dir) + data = parse_prob_logs(exp_dir, key) + report = [] + report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") + for x in data: + try: + report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), + x[1], x[2], x[2]-x[1])) + except KeyError: + continue + + total_time = 0 + for iter in times.keys(): + total_time += times[iter] + report.append("Total training time is {0}\n".format( + str(datetime.timedelta(seconds=total_time)))) + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py new file mode 100644 index 00000000000..b2010518d2a --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py @@ -0,0 +1,10 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. 
+ +""" This is a subpackage containing modules for training of +deep neural network acoustic model with chain objective. +""" + +__all__ = ["acoustic_model"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 092e9c66ff3..dfbe46aaa55 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -1,7 +1,7 @@ -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. """ This is a module with methods which will be used by scripts for training of @@ -9,173 +9,177 @@ """ import logging -import math -import imp import os import sys -import libs.nnet3.train.common as common_train_lib + import libs.common as common_lib logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -#handler = logging.StreamHandler() -#handler.setLevel(logging.INFO) -#formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') -#handler.setFormatter(formatter) -#logger.addHandler(handler) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) -def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts=None): +def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): """Create a phone LM for chain training This method trains a phone LM for chain training using the alignments in "tree_dir" """ - common_lib.RunKaldiCommand(""" -{command} {dir}/log/make_phone_lm.log \ - chain-est-phone-lm {lm_opts} \ - "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ - {dir}/phone_lm.fst""".format(command=run_opts.command, - dir=dir, - lm_opts=lm_opts if lm_opts is not None else '', - tree_dir=tree_dir)) - - -def CreateDenominatorFst(dir, tree_dir, run_opts): - common_lib.RunKaldiCommand(""" -copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl -{command} {dir}/log/make_den_fst.log \ - chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ - {dir}/den.fst {dir}/normalization.fst - """.format(tree_dir=tree_dir, dir=dir, command=run_opts.command)) - - -def GenerateChainEgs(dir, data, lat_dir, egs_dir, - left_context, right_context, - run_opts, stage=0, - valid_left_context=None, valid_right_context=None, - left_tolerance=None, right_tolerance=None, - frame_subsampling_factor=3, - alignment_subsampling_factor=3, - feat_type='raw', online_ivector_dir=None, - frames_per_iter=20000, frames_per_eg=20, srand=0, - egs_opts=None, cmvn_opts=None, transform_dir=None): + common_lib.run_kaldi_command( + """{command} {dir}/log/make_phone_lm.log \ + chain-est-phone-lm {lm_opts} \ + "ark:gunzip -c {tree_dir}/ali.*.gz | """ + """ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + {dir}/phone_lm.fst""".format( + command=run_opts.command, dir=dir, + lm_opts=lm_opts if lm_opts is not None else '', + tree_dir=tree_dir)) + + +def create_denominator_fst(dir, tree_dir, run_opts): + common_lib.run_kaldi_command( + """copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl + {command} {dir}/log/make_den_fst.log \ + chain-make-den-fst {dir}/tree {dir}/0.trans_mdl \ + {dir}/phone_lm.fst \ + {dir}/den.fst {dir}/normalization.fst""".format( + tree_dir=tree_dir, dir=dir, command=run_opts.command)) + 
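For orientation, the two helpers above are intended to be driven from a top-level chain training script (e.g. steps/nnet3/chain/train.py, which this patch also reworks). The snippet below is only a minimal sketch of that wiring; the RunOpts stand-in, the directory paths and the lm_opts value are illustrative assumptions, not part of this patch.

    # Hypothetical driver snippet; the paths and the RunOpts stand-in are assumptions.
    import libs.nnet3.train.chain_objf.acoustic_model as chain_lib

    class RunOpts(object):
        command = "run.pl"   # job launcher, normally taken from the --cmd option

    run_opts = RunOpts()
    dir = "exp/chain/tdnn1a"         # experiment directory (assumed)
    tree_dir = "exp/chain/tree_a"    # tree/alignment directory (assumed)

    # Estimate a phone LM from the alignments in tree_dir, then build the
    # denominator and normalization FSTs used by the chain objective.
    chain_lib.create_phone_lm(dir, tree_dir, run_opts,
                              lm_opts="--num-extra-lm-states=2000")
    chain_lib.create_denominator_fst(dir, tree_dir, run_opts)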
+ +def generate_chain_egs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage=0, + valid_left_context=None, valid_right_context=None, + left_tolerance=None, right_tolerance=None, + frame_subsampling_factor=3, + alignment_subsampling_factor=3, + feat_type='raw', online_ivector_dir=None, + frames_per_iter=20000, frames_per_eg=20, srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): """Wrapper for steps/nnet3/chain/get_egs.sh See options in that script. """ - common_lib.RunKaldiCommand(""" -steps/nnet3/chain/get_egs.sh {egs_opts} \ - --cmd "{command}" \ - --cmvn-opts "{cmvn_opts}" \ - --feat-type {feat_type} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ - --valid-left-context '{valid_left_context}' \ - --valid-right-context '{valid_right_context}' \ - --left-tolerance '{left_tolerance}' \ - --right-tolerance '{right_tolerance}' \ - --frame-subsampling-factor {frame_subsampling_factor} \ - --alignment-subsampling-factor {alignment_subsampling_factor} \ - --stage {stage} \ - --frames-per-iter {frames_per_iter} \ - --frames-per-eg {frames_per_eg} \ - --srand {srand} \ - {data} {dir} {lat_dir} {egs_dir} - """.format(command=run_opts.command, - cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - feat_type=feat_type, - transform_dir=transform_dir - if transform_dir is not None - else '', - ivector_dir=online_ivector_dir - if online_ivector_dir is not None - else '', - left_context=left_context, right_context=right_context, - valid_left_context=valid_left_context - if valid_left_context is not None - else '', - valid_right_context=valid_right_context - if valid_right_context is not None - else '', - left_tolerance=left_tolerance - if left_tolerance is not None - else '', - right_tolerance=right_tolerance - if right_tolerance is not None - else '', - frame_subsampling_factor=frame_subsampling_factor, - alignment_subsampling_factor=alignment_subsampling_factor, - stage=stage, frames_per_iter=frames_per_iter, - frames_per_eg=frames_per_eg, srand=srand, - data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir, - egs_opts=egs_opts if egs_opts is not None else '')) - - -def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, - max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): - """ Function for calling binaries to estimate and write LDA matrix from cegs - - This function is exactly similar to the version in - libs/nnet3/train/common.py - except it uses egs files in place of cegs files. 
- + common_lib.run_kaldi_command( + """steps/nnet3/chain/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context '{valid_left_context}' \ + --valid-right-context '{valid_right_context}' \ + --left-tolerance '{left_tolerance}' \ + --right-tolerance '{right_tolerance}' \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --alignment-subsampling-factor {alignment_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + {data} {dir} {lat_dir} {egs_dir}""".format( + command=run_opts.command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + feat_type=feat_type, + transform_dir=(transform_dir + if transform_dir is not None + else ''), + ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ''), + left_context=left_context, right_context=right_context, + valid_left_context=(valid_left_context + if valid_left_context is not None + else ''), + valid_right_context=(valid_right_context + if valid_right_context is not None + else ''), + left_tolerance=(left_tolerance + if left_tolerance is not None + else ''), + right_tolerance=(right_tolerance + if right_tolerance is not None + else ''), + frame_subsampling_factor=frame_subsampling_factor, + alignment_subsampling_factor=alignment_subsampling_factor, + stage=stage, frames_per_iter=frames_per_iter, + frames_per_eg=frames_per_eg, srand=srand, + data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) + + +def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs=None, rand_prune=4.0, + lda_opts=None): + """ Function to estimate and write LDA matrix from cegs + + This function is exactly similar to the version in module + libs.nnet3.train.frame_level_objf.common except this uses cegs instead of + egs files. """ - if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs - # Write stats with the same format as stats for LDA. - common_lib.RunKaldiCommand(""" -{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ - nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( - command=run_opts.command, - num_lda_jobs=num_lda_jobs, - dir=dir, - egs_dir=egs_dir, - rand_prune=rand_prune)) + # Write stats with the same format as stats for LDA. 
+ common_lib.run_kaldi_command( + """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" \ + {dir}/JOB.lda_stats""".format( + command=run_opts.command, + num_lda_jobs=num_lda_jobs, + dir=dir, + egs_dir=egs_dir, + rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files=map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/sum_transform_stats.log \ - sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( - command=run_opts.command, - dir=dir, lda_stat_files=" ".join(lda_stat_files))) + common_lib.run_kaldi_command( + """{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command=run_opts.command, + dir=dir, lda_stat_files=" ".join(lda_stat_files))) for file in lda_stat_files: try: os.remove(file) except OSError: - raise Exception("There was error while trying to remove lda stat files.") - # this computes a fixed affine transform computed in the way we described in - # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant - # of an LDA transform but without dimensionality reduction. + raise Exception("There was error while trying to remove " + "lda stat files.") + # this computes a fixed affine transform computed in the way we described + # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled + # variant of an LDA transform but without dimensionality reduction. - common_lib.RunKaldiCommand(""" -{command} {dir}/log/get_transform.log \ - nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats - """.format(command=run_opts.command, dir=dir, - lda_opts=lda_opts if lda_opts is not None else "")) + common_lib.run_kaldi_command( + """{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat \ + {dir}/lda_stats""".format( + command=run_opts.command, dir=dir, + lda_opts=lda_opts if lda_opts is not None else "")) - common_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def PrepareInitialAcousticModel(dir, run_opts): + +def prepare_initial_acoustic_model(dir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - common_lib.RunKaldiCommand(""" -{command} {dir}/log/add_first_layer.log \ - nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw - """.format(command=run_opts.command, dir=dir)) + common_lib.run_kaldi_command( + """{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-1 {dir}/init.raw \ + {dir}/configs/layer1.config {dir}/0.raw + """.format(command=run_opts.command, dir=dir)) # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as @@ -183,15 +187,15 @@ def PrepareInitialAcousticModel(dir, run_opts): # We ensure that they have the same mode (even if someone changed the # script to make one or both of them text mode) by copying them both # before concatenating them. 
- common_lib.RunKaldiCommand(""" -{command} {dir}/log/init_mdl.log \ - nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( - command=run_opts.command, dir=dir)) + common_lib.run_kaldi_command( + """{command} {dir}/log/init_mdl.log \ + nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \ + {dir}/0.mdl""".format(command=run_opts.command, dir=dir)) -def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, - xent_regularize, run_opts): +def combine_models(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts, background_process_handler=None): """ Function to do model combination In the nnet3 setup, the logic @@ -202,81 +206,308 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, raw_model_strings = [] for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - model_file = '{0}/{1}.mdl'.format(dir, iter) - if os.path.exists(model_file): - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) - else: - print('{0}: warning: model file {1} does not exist (final combination)'.format( - sys.argv[0], model_file)) - common_lib.RunKaldiCommand(""" -{command} {combine_queue_opt} {dir}/log/combine.log \ -nnet3-chain-combine --num-iters=40 \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} """ - """ "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ - "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" - """.format(command=run_opts.command, - combine_queue_opt=run_opts.combine_queue_opt, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - dir=dir, raw_models=" ".join(raw_model_strings), - num_chunk_per_minibatch=num_chunk_per_minibatch, - num_iters=num_iters, - egs_dir=egs_dir)) + model_file = '{0}/{1}.mdl'.format(dir, iter) + if os.path.exists(model_file): + raw_model_strings.append( + '"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + print("{0}: warning: model file {1} does not exist " + "(final combination)".format(sys.argv[0], model_file)) + + common_lib.run_kaldi_command( + """{command} {combine_queue_opt} {dir}/log/combine.log \ + nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} """ + """ "ark,bg:nnet3-chain-merge-egs """ + """--minibatch-size={num_chunk_per_minibatch} """ + """ark:{egs_dir}/combine.cegs ark:-|" - \| \ + nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ + {dir}/final.mdl""".format( + command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + dir=dir, raw_models=" ".join(raw_model_strings), + num_chunk_per_minibatch=num_chunk_per_minibatch, + num_iters=num_iters, + egs_dir=egs_dir)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. 
- ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, - leaky_hmm_coefficient, run_opts, wait=False) + compute_train_cv_probabilities( + dir, 'final', egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait=False, + background_process_handler=background_process_handler) -def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, - leaky_hmm_coefficient, run_opts, wait=False): +def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, + xent_regularize, leaky_hmm_coefficient, + run_opts, wait=False, + background_process_handler=None): model = '{0}/{1}.mdl'.format(dir, iter) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --xent-regularize={xent_reg} \ - "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" - """.format(command=run_opts.command, - dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - xent_reg=xent_regularize, - egs_dir=egs_dir), wait=wait) - - common_lib.RunKaldiCommand(""" -{command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --xent-regularize={xent_reg} \ - "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - xent_reg=xent_regularize, - egs_dir=egs_dir), wait=wait) - - -def ComputeProgress(dir, iter, run_opts, wait=False): + common_lib.run_kaldi_command( + """{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} \ + --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs """ + """ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + """.format(command=run_opts.command, + dir=dir, iter=iter, model=model, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, + egs_dir=egs_dir), wait=wait, + background_process_handler=background_process_handler) + + common_lib.run_kaldi_command( + """{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} \ + --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs """ + """ark:{egs_dir}/train_diagnostic.cegs ark:- |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, + egs_dir=egs_dir), wait=wait, + background_process_handler=background_process_handler) + + +def compute_progress(dir, iter, run_opts, wait=False, + background_process_handler=None): prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/progress.{iter}.log \ - nnet3-am-info {model} '&&' \ - nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" \ - "nnet3-am-copy --raw=true {model} - |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - prev_model=prev_model), wait=wait) + common_lib.run_kaldi_command( + """{command} {dir}/log/progress.{iter}.log 
\ + nnet3-am-info {model} '&&' \ + nnet3-show-progress --use-gpu=no \ + "nnet3-am-copy --raw=true {prev_model} - |" \ + "nnet3-am-copy --raw=true {model} - |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model), wait=wait, + background_process_handler=background_process_handler) + +# Called from TrainOneIteration, this model does one iteration of training +# with 'num_jobs' jobs, and +# writes files like exp/tdnn_a/24.{1,2,3,..}.raw +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
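        # Note (an assumption about intent, not stated in the patch): making the
        # frame shift depend on both archive_index and the epoch (k/num_archives)
        # means a given archive is trained with a different frame offset on each
        # epoch, which matters when frame_subsampling_factor > 1 since only every
        # n-th output frame carries supervision.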
+ frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = common_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + + + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir = dir, iter = iter, srand = srand, num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + apply_deriv_weights = apply_deriv_weights, + left_deriv_truncate = left_deriv_truncate, + right_deriv_truncate = right_deriv_truncate, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + momentum = momentum, + max_param_change = cur_max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + frame_subsampling_factor = frame_subsampling_factor, + truncate_deriv_weights = truncate_deriv_weights, + cache_io_opts = cache_io_opts, run_opts = run_opts) + + [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
+ common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + common_train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index fb8eb3e8cd7..c85efd72ca2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -1,23 +1,23 @@ -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0 """This module contains classes and methods common to training of nnet3 neural networks. """ +import argparse +import glob import logging +import os import math import re -import time import shutil -import glob -import os -import argparse +import time -import common as common_lib +import libs.common as common_lib logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -309,100 +309,6 @@ def verify_iterations(num_iters, num_epochs, num_hidden_layers, return models_to_combine -def get_realign_iters(realign_times, num_iters, - num_jobs_initial, num_jobs_final): - """ Takes the realign_times string and identifies the approximate - iterations at which realignments have to be done. 
- - realign_times is a space seperated string of values between 0 and 1 - """ - - realign_iters = [] - for realign_time in realign_times.split(): - realign_time = float(realign_time) - assert(realign_time > 0 and realign_time < 1) - if num_jobs_initial == num_jobs_final: - realign_iter = int(0.5 + num_iters * realign_time) - else: - realign_iter = math.sqrt((1 - realign_time) - * math.pow(num_jobs_initial, 2) - + realign_time * math.pow(num_jobs_final, - 2)) - realign_iter = realign_iter - num_jobs_initial - realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) - realign_iter = realign_iter * num_iters - realign_iters.append(int(realign_iter)) - - return realign_iters - - -def align(dir, data, lang, run_opts, iter=None, transform_dir=None, - online_ivector_dir=None): - - alidir = '{dir}/ali{ali_suffix}'.format( - dir=dir, - ali_suffix="_iter_{0}".format(iter) if iter is not None else "") - - logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( - gpu=" using gpu " if run_opts.realign_use_gpu else " ", - num_jobs=run_opts.realign_num_jobs)) - common_lib.run_kaldi_command( - """steps/nnet3/align.sh --nj {num_jobs_align} \ - --cmd "{align_cmd} {align_queue_opt}" \ - --use-gpu {align_use_gpu} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{online_ivector_dir}" \ - --iter "{iter}" {data} {lang} {dir} {alidir}""".format( - dir=dir, align_use_gpu=("yes" - if run_opts.realign_use_gpu - else "no"), - align_cmd=run_opts.realign_command, - align_queue_opt=run_opts.realign_queue_opt, - num_jobs_align=run_opts.realign_num_jobs, - transform_dir=(transform_dir - if transform_dir is not None - else ""), - online_ivector_dir=(online_ivector_dir - if online_ivector_dir is not None - else ""), - iter=iter if iter is not None else "", - alidir=alidir, - lang=lang, data=data)) - return alidir - - -def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, - prior_subset_size, num_archives, run_opts, - transform_dir=None, online_ivector_dir=None): - raise Exception("Realignment stage has not been implemented in nnet3") - logger.info("Getting average posterior for purposes of adjusting " - "the priors.") - # Note: this just uses CPUs, using a smallish subset of data. - # always use the first egs archive, which makes the script simpler; - # we're using different random subsets of it. - - avg_post_vec_file = compute_average_posterior( - dir, iter, prev_egs_dir, - num_archives, prior_subset_size, run_opts) - - avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) - logger.info("Re-adjusting priors based on computed posteriors") - model = '{0}/{1}.mdl'.format(dir, iter) - adjust_am_priors(dir, model, avg_post_vec_file, model, run_opts) - - alidir = align(dir, feat_dir, lang, run_opts, iter, - transform_dir, online_ivector_dir) - common_lib.run_kaldi_command( - """steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} \ - {alidir} {prev_egs_dir} {cur_egs_dir}""".format( - command=run_opts.command, - iter=iter, - dir=dir, - alidir=alidir, - prev_egs_dir=prev_egs_dir, - cur_egs_dir=cur_egs_dir)) - - def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, num_archives_to_process, initial_effective_lrate, final_effective_lrate): @@ -438,7 +344,13 @@ def do_shrinkage(iter, model_file, non_linearity, shrink_threshold, non_linearity=non_linearity, model_file=model_file)) output = output.strip().split("\n") # eg. 
- # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + # component name=Lstm1_f type=SigmoidComponent, dim=1280, + # count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 + # 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 + # 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], + # deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 + # 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 + # 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") total_mean_deriv = 0 @@ -459,70 +371,6 @@ def do_shrinkage(iter, model_file, non_linearity, shrink_threshold, return False -def compute_average_posterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts, - get_raw_nnet_from_am=True): - """ Computes the average posterior of the network - Note: this just uses CPUs, using a smallish subset of data. - """ - for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): - os.remove(file) - - if run_opts.num_jobs_compute_prior > num_archives: - egs_part = 1 - else: - egs_part = 'JOB' - - if get_raw_nnet_from_am: - model = "nnet3-am-copy --raw=true {0}/combined.mdl -|".format(dir) - else: - model = "{dir}/final.raw".format(dir=dir) - - common_lib.run_kaldi_command( - """{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} \ - {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} \ - ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ - nnet3-merge-egs --measure-output-frames=true \ - --minibatch-size=128 ark:- ark:- \| \ - nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "{model}" ark:- ark:- \| \ - matrix-sum-rows ark:- ark:- \| vector-sum ark:- \ - {dir}/post.{iter}.JOB.vec""".format( - command=run_opts.command, - dir=dir, model=model, - num_jobs_compute_prior=run_opts.num_jobs_compute_prior, - prior_queue_opt=run_opts.prior_queue_opt, - iter=iter, prior_subset_size=prior_subset_size, - egs_dir=egs_dir, egs_part=egs_part, - prior_gpu_opt=run_opts.prior_gpu_opt)) - - # make sure there is time for $dir/post.{iter}.*.vec to appear. 
- time.sleep(5) - avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) - common_lib.run_kaldi_command(""" -{command} {dir}/log/vector_sum.{iter}.log \ - vector-sum {dir}/post.{iter}.*.vec {output_file} - """.format(command=run_opts.command, - dir=dir, iter=iter, output_file=avg_post_vec_file)) - - for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): - os.remove(file) - return avg_post_vec_file - - -def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, - run_opts): - common_lib.run_kaldi_command( - """{command} {dir}/log/adjust_priors.final.log \ - nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} \ - "{output_model}" """.format( - command=run_opts.command, - dir=dir, input_model=input_model, - avg_posterior_vector=avg_posterior_vector, - output_model=output_model)) - - def remove_egs(egs_dir): common_lib.run_kaldi_command("steps/nnet2/remove_egs.sh {egs_dir}".format( egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py index 172030d8297..d5148f3c396 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py @@ -6,3 +6,5 @@ """ This library has classes and methods commonly used for training nnet3 neural networks with frame-level objectives. """ + +__all__ = ["common", "raw_model", "acoustic_model"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 40e224b0672..3ce8c8033fc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -1,7 +1,7 @@ -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. """ This is a module with method which will be used by scripts for @@ -9,30 +9,26 @@ """ import logging -import math -import imp -import os -import sys -sys.path.append("steps/libs") -import common as common_lib +import libs.common as common_lib logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) -def GenerateEgs(data, alidir, egs_dir, - left_context, right_context, - valid_left_context, valid_right_context, - run_opts, stage=0, - feat_type='raw', online_ivector_dir=None, - samples_per_iter=20000, frames_per_eg=20, srand=0, - egs_opts=None, cmvn_opts=None, transform_dir=None): +def generate_egs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage=0, + feat_type='raw', online_ivector_dir=None, + samples_per_iter=20000, frames_per_eg=20, srand=0, + egs_opts=None, cmvn_opts=None, transform_dir=None): """ Wrapper for calling steps/nnet3/get_egs.sh @@ -40,30 +36,34 @@ def GenerateEgs(data, alidir, egs_dir, the model final.mdl and alignments. 
""" - common_lib.RunKaldiCommand(""" -steps/nnet3/get_egs.sh {egs_opts} \ - --cmd "{command}" \ - --cmvn-opts "{cmvn_opts}" \ - --feat-type {feat_type} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ - --valid-left-context {valid_left_context} \ - --valid-right-context {valid_right_context} \ - --stage {stage} \ - --samples-per-iter {samples_per_iter} \ - --frames-per-eg {frames_per_eg} \ - --srand {srand} \ - {data} {alidir} {egs_dir} - """.format(command=run_opts.command, - cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - feat_type=feat_type, - transform_dir=transform_dir if transform_dir is not None else '', - ivector_dir=online_ivector_dir if online_ivector_dir is not None else '', - left_context=left_context, right_context=right_context, - valid_left_context=valid_left_context, - valid_right_context=valid_right_context, - stage=stage, samples_per_iter=samples_per_iter, - frames_per_eg=frames_per_eg, srand=srand, data=data, alidir=alidir, - egs_dir=egs_dir, - egs_opts=egs_opts if egs_opts is not None else '')) + common_lib.run_kaldi_command( + """steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + {data} {alidir} {egs_dir} + """.format(command=run_opts.command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + feat_type=feat_type, + transform_dir=(transform_dir + if transform_dir is not None else + ''), + ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ''), + left_context=left_context, right_context=right_context, + valid_left_context=valid_left_context, + valid_right_context=valid_right_context, + stage=stage, samples_per_iter=samples_per_iter, + frames_per_eg=frames_per_eg, srand=srand, data=data, + alidir=alidir, egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 952a797c14f..b49a59f5a3b 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -1,7 +1,7 @@ -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. 
""" This is a module with methods which will be used by scripts for training of @@ -11,34 +11,33 @@ import logging import math -import imp import os -import sys -sys.path.append("steps/libs") -import nnet3.train.common as common_train_lib -import common as common_lib +import libs.common as common_lib +import libs.nnet3.train.common as common_train_lib logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) -def TrainNewModels(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, minibatch_size, - cache_read_opt, run_opts, - frames_per_eg=-1, min_deriv_time=None): - """ Called from TrainOneIteration, this model does one iteration of training - with 'num_jobs' jobs, and - writes files like exp/tdnn_a/24.{1,2,3,..}.raw +def train_new_models(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts, + frames_per_eg=-1, min_deriv_time=None, + background_process_handler=None): + """ Called from train_one_iteration(), this model does one iteration of + training with 'num_jobs' jobs, and writes files like + exp/tdnn_a/24.{1,2,3,..}.raw We cannot easily use a single parallel SGE job to do the main training, because the computation of which archive and which --frame option @@ -53,60 +52,72 @@ def TrainNewModels(dir, iter, srand, num_jobs, If it is > 0, then each parallel SGE job created, a different frame numbered 0..frames_per_eg-1 is used. min_deriv_time: Applicable for RNN training. A default value of None - implies a min_deriv_time of 0 is used. During RNN training, its value - is set to chunk_width - num_bptt_steps in the training script. + implies a min_deriv_time of 0 is used. During RNN training, its + value is set to chunk_width - num_bptt_steps in the training + script. """ chunk_level_training = False if frames_per_eg > 0 else True deriv_time_opts = ("" if min_deriv_time is None - else "--optimization.min-deriv-time={0}".format(min_deriv_time) + else "--optimization.min-deriv-time={0}".format( + min_deriv_time) ) context_opts = "--left-context={0} --right-context={1}".format( left_context, right_context) processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + for job in range(1, num_jobs+1): + # k is a zero-based index that we will derive the other indexes from. + k = num_archives_processed + job - 1 + + # work out the 1-based archive index. + archive_index = (k % num_archives) + 1 if not chunk_level_training: frame = (k / num_archives) % frames_per_eg cache_write_opt = "" if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. 
- cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = common_lib.RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - {deriv_time_opts} "{raw_model}" \ - "ark,bg:nnet3-copy-egs {frame_opts} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- |""" - """nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| """ - """nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false """ - """--discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command=run_opts.command, - train_queue_opt=run_opts.train_queue_opt, - dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, - job=job, parallel_train_opts=run_opts.parallel_train_opts, - cache_read_opt=cache_read_opt, cache_write_opt=cache_write_opt, - frame_opts="" - if chunk_level_training - else "--frame={0}".format(frame), - momentum=momentum, max_param_change=max_param_change, - deriv_time_opts=deriv_time_opts, - raw_model=raw_model_string, context_opts=context_opts, - egs_dir=egs_dir, archive_index=archive_index, - shuffle_buffer_size=shuffle_buffer_size, - minibatch_size=minibatch_size), + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_write_opt = "--write-cache={dir}/cache.{iter}".format( + dir=dir, iter=iter+1) + + process_handle = common_lib.run_kaldi_command( + """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} \ + {cache_write_opt} --print-interval=10 \ + --momentum={momentum} \ + --max-param-change={max_param_change} \ + {deriv_time_opts} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {frame_opts} {context_opts} """ + """ark:{egs_dir}/egs.{archive_index}.ark ark:- |""" + """nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} """ + """--srand={srand} ark:- ark:- | """ + """nnet3-merge-egs --minibatch-size={minibatch_size} """ + """--measure-output-frames=false """ + """--discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw""".format( + command=run_opts.command, + train_queue_opt=run_opts.train_queue_opt, + dir=dir, iter=iter, srand=iter + srand, + next_iter=iter + 1, + job=job, + parallel_train_opts=run_opts.parallel_train_opts, + cache_read_opt=cache_read_opt, + cache_write_opt=cache_write_opt, + frame_opts=("" + if chunk_level_training + else "--frame={0}".format(frame)), + momentum=momentum, max_param_change=max_param_change, + deriv_time_opts=deriv_time_opts, + raw_model=raw_model_string, context_opts=context_opts, + egs_dir=egs_dir, archive_index=archive_index, + shuffle_buffer_size=shuffle_buffer_size, + minibatch_size=minibatch_size), wait=False, background_process_handler=background_process_handler) processes.append(process_handle) @@ -121,20 +132,23 @@ def TrainNewModels(dir, iter, srand, num_jobs, if not all_success: open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size, - num_hidden_layers, add_layers_period, - left_context, right_context, - momentum, max_param_change, shuffle_buffer_size, - run_opts, - cv_minibatch_size=256, 
frames_per_eg=-1, - min_deriv_time=None, shrinkage_value=1.0, - get_raw_nnet_from_am=True): - """ Called from steps/nnet3/train_*.py scripts for one iteration of training + raise Exception("There was error during training " + "iteration {0}".format(iter)) + + +def train_one_iteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts, + cv_minibatch_size=256, frames_per_eg=-1, + min_deriv_time=None, shrinkage_value=1.0, + get_raw_nnet_from_am=True, + background_process_handler=None): + """ Called from steps/nnet3/train_*.py scripts for one iteration of neural + network training Args: frames_per_eg: The default value -1 implies chunk_level_training, which @@ -143,8 +157,9 @@ def TrainOneIteration(dir, iter, srand, egs_dir, If it is > 0, then each parallel SGE job created, a different frame numbered 0..frames_per_eg-1 is used. min_deriv_time: Applicable for RNN training. A default value of None - implies a min_deriv_time of 0 is used. During RNN training, its value - is set to chunk_width - num_bptt_steps in the training script. + implies a min_deriv_time of 0 is used. During RNN training, its + value is set to chunk_width - num_bptt_steps in the training + script. shrinkage_value: If value is 1.0, no shrinkage is done; otherwise parameter values are scaled by this value. get_raw_nnet_from_am: If True, then the network is read and stored as @@ -152,7 +167,6 @@ def TrainOneIteration(dir, iter, srand, egs_dir, as against a raw network e.g. 10.raw when the value is False. """ - # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics logger.info("Training neural net (pass {0})".format(iter)) @@ -160,11 +174,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # check if different iterations use the same random seed if os.path.exists('{0}/srand'.format(dir)): try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') + saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) + except (IOError, ValueError) as e: + raise Exception("Exception while reading the random seed " + "for training: {0}".format(e.str())) if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + logger.warning("The random seed provided to this iteration " + "(srand={0}) is different from the one saved last " + "time (srand={1}). 
Using srand={0}.".format( + srand, saved_srand)) else: f = open('{0}/srand'.format(dir), 'w') f.write(str(srand)) @@ -172,49 +190,73 @@ def TrainOneIteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, - mb_size=cv_minibatch_size, - get_raw_nnet_from_am=get_raw_nnet_from_am) + compute_train_cv_probabilities( + dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False, + background_process_handler=background_process_handler) if iter > 0: # Runs in the background - ComputeProgress(dir, iter, egs_dir, run_opts, - mb_size=cv_minibatch_size, - get_raw_nnet_from_am=get_raw_nnet_from_am) + compute_progress(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, wait=False, + get_raw_nnet_from_am=get_raw_nnet_from_am, + background_process_handler=background_process_handler) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + if (iter > 0 and iter <= (num_hidden_layers-1) * add_layers_period + and iter % add_layers_period == 0): + + # if we've just added new hiden layer, don't do averaging but take the + # best. + do_average = False - do_average = False # if we've just added new hiden layer, don't do - # averaging but take the best. cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + config_file = "{0}/configs/layer{1}.config".format( + dir, + cur_num_hidden_layers) if get_raw_nnet_from_am: - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + raw_model_string = ("nnet3-am-copy --raw=true " + "--learning-rate={lr} {dir}/{iter}.mdl - | " + "nnet3-init --srand={srand} - " + "{config} - |".format( + lr=learning_rate, dir=dir, iter=iter, + srand=iter + srand, config=config_file)) else: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + raw_model_string = ("nnet3-copy --learning-rate={lr} " + "{dir}/{iter}.raw - | " + "nnet3-init --srand={srand} - " + "{config} - |".format( + lr=learning_rate, dir=dir, iter=iter, + srand=iter + srand, config=config_file)) else: do_average = True if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. + # on iteration 0, pick the best, don't average. 
+ do_average = False else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + cache_read_opt = "--read-cache={dir}/cache.{iter}".format( + dir=dir, iter=iter) if get_raw_nnet_from_am: - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " + "{1}/{2}.mdl - |".format(learning_rate, + dir, iter)) else: - raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr=learning_rate, dir=dir, iter=iter) + raw_model_string = ("nnet3-copy --learning-rate={lr} " + "{dir}/{iter}.raw - |".format( + lr=learning_rate, dir=dir, iter=iter)) if do_average: cur_minibatch_size = minibatch_size cur_max_param_change = max_param_change else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. + # on iteration zero or when we just added a layer, use a smaller + # minibatch size (and we will later choose the output of just one of + # the jobs): the model-averaging isn't always helpful when the model is + # changing too fast (i.e. it can worsen the objective function), and + # the smaller minibatch size will help to keep the update stable. cur_minibatch_size = minibatch_size / 2 cur_max_param_change = float(max_param_change) / math.sqrt(2) @@ -223,25 +265,26 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: pass - TrainNewModels(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - cache_read_opt, run_opts, - frames_per_eg=frames_per_eg, - min_deriv_time=min_deriv_time) - - [models_to_average, best_model] = common_train_lib.GetSuccessfulModels( - num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + train_new_models(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts, + frames_per_eg=frames_per_eg, + min_deriv_time=min_deriv_time, + background_process_handler=background_process_handler) + + [models_to_average, best_model] = common_train_lib.get_successful_models( + num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) if do_average: # average the output of the different jobs. 
- common_train_lib.GetAverageNnetModel( + common_train_lib.get_average_nnet_model( dir=dir, iter=iter, nnets_list=" ".join(nnets_list), run_opts=run_opts, @@ -250,7 +293,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, else: # choose the best model from different jobs - common_train_lib.GetBestNnetModel( + common_train_lib.get_best_nnet_model( dir=dir, iter=iter, best_model_index=best_model, run_opts=run_opts, @@ -269,137 +312,147 @@ def TrainOneIteration(dir, iter, srand, egs_dir, new_model = "{0}/{1}.raw".format(dir, iter + 1) if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + raise Exception("Could not find {0}, at the end of " + "iteration {1}".format(new_model, iter)) elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + raise Exception("{0} has size 0. Something went wrong in " + "iteration {1}".format(new_model, iter)) if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): os.remove("{0}/cache.{1}".format(dir, iter)) -def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, - max_lda_jobs=None, rand_prune=4.0, - lda_opts=None): +def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs=None, rand_prune=4.0, + lda_opts=None): if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs - common_lib.RunKaldiCommand(""" -{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ -nnet3-acc-lda-stats --rand-prune={rand_prune} \ - {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format( - command=run_opts.command, - num_lda_jobs=num_lda_jobs, - dir=dir, - egs_dir=egs_dir, - rand_prune=rand_prune)) + common_lib.run_kaldi_command( + """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" \ + {dir}/JOB.lda_stats""".format( + command=run_opts.command, + num_lda_jobs=num_lda_jobs, + dir=dir, + egs_dir=egs_dir, + rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), range(1, num_lda_jobs + 1)) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/sum_transform_stats.log \ - sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( - command=run_opts.command, - dir=dir, lda_stat_files=" ".join(lda_stat_files))) + common_lib.run_kaldi_command( + """{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command=run_opts.command, + dir=dir, lda_stat_files=" ".join(lda_stat_files))) for file in lda_stat_files: try: os.remove(file) except OSError: - raise Exception("There was error while trying to remove lda stat files.") - # this computes a fixed affine transform computed in the way we described in - # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant - # of an LDA transform but without dimensionality reduction. + raise Exception("There was error while trying to remove " + "lda stat files.") + # this computes a fixed affine transform computed in the way we described + # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled + # variant of an LDA transform but without dimensionality reduction. 
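For intuition about the comment above: a textbook full-dimension LDA keeps every direction but decorrelates the input, which is roughly what the transform computed here does before its extra scaling. The sketch below is only that textbook variant; the nnet-get-feature-transform binary invoked next applies the particular scaling described in the referenced paper, so treat this purely as an illustrative stand-in:

import numpy as np

def full_dim_lda(feats, labels):
    # feats: (num_frames x dim) array; labels: per-frame integer class ids.
    # Whitens w.r.t. the within-class covariance, then rotates to
    # diagonalise the between-class covariance; unlike classical LDA no
    # rows are dropped, so the output dimension equals the input dimension.
    feats = np.asarray(feats, dtype=float)
    labels = np.asarray(labels)
    dim = feats.shape[1]
    global_mean = feats.mean(axis=0)
    within = np.zeros((dim, dim))
    between = np.zeros((dim, dim))
    for c in np.unique(labels):
        cls = feats[labels == c]
        centered = cls - cls.mean(axis=0)
        within += centered.T.dot(centered)
        diff = (cls.mean(axis=0) - global_mean)[:, None]
        between += len(cls) * diff.dot(diff.T)
    within /= len(feats)
    between /= len(feats)
    evals, evecs = np.linalg.eigh(within)
    whiten = np.diag(1.0 / np.sqrt(np.maximum(evals, 1e-10))).dot(evecs.T)
    _, b_evecs = np.linalg.eigh(whiten.dot(between).dot(whiten.T))
    return b_evecs.T.dot(whiten)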
- common_lib.RunKaldiCommand(""" -{command} {dir}/log/get_transform.log \ - nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats - """.format(command=run_opts.command,dir=dir, - lda_opts=lda_opts if lda_opts is not None else "")) + common_lib.run_kaldi_command( + """{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform \ + {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command=run_opts.command, dir=dir, + lda_opts=lda_opts if lda_opts is not None else "")) - common_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def PrepareInitialAcousticModel(dir, alidir, run_opts): +def prepare_initial_acoustic_model(dir, alidir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - common_lib.PrepareInitialNetwork(dir, run_opts) + common_lib.prepare_initial_network(dir, run_opts) - # Convert to .mdl, train the transitions, set the priors. - common_lib.RunKaldiCommand(""" -{command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ - nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl - """.format(command=run_opts.command, - dir=dir, alidir=alidir)) + # Convert to .mdl, train the transitions, set the priors. + common_lib.run_kaldi_command( + """{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - \ + "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command=run_opts.command, + dir=dir, alidir=alidir)) -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, - mb_size=256, background_process_handler=None, - get_raw_nnet_from_am=True): +def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, mb_size=256, + wait=False, background_process_handler=None, + get_raw_nnet_from_am=True): if get_raw_nnet_from_am: - model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir=dir, iter=iter) + model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format( + dir=dir, iter=iter) else: model = "{dir}/{iter}.raw".format(dir=dir, iter=iter) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/compute_prob_valid.{iter}.log \ - nnet3-compute-prob "{model}" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - mb_size=mb_size, - model=model, - egs_dir=egs_dir), - background_process_handler=background_process_handler) - - common_lib.RunKaldiCommand(""" -{command} {dir}/log/compute_prob_train.{iter}.log \ - nnet3-compute-prob "{model}" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - mb_size=mb_size, - model=model, - egs_dir=egs_dir), - background_process_handler=background_process_handler) - - -def ComputeProgress(dir, iter, egs_dir, run_opts, - mb_size=256, background_process_handler=None, - get_raw_nnet_from_am=True): + common_lib.run_kaldi_command( + """ {command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob "{model}" \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} """ + """ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + mb_size=mb_size, + model=model, + egs_dir=egs_dir), wait=wait, + 
background_process_handler=background_process_handler) + + common_lib.run_kaldi_command( + """{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob "{model}" \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} """ + """ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + mb_size=mb_size, + model=model, + egs_dir=egs_dir), wait=wait, + background_process_handler=background_process_handler) + + +def compute_progress(dir, iter, egs_dir, run_opts, mb_size=256, + background_process_handler=None, wait=False, + get_raw_nnet_from_am=True): if get_raw_nnet_from_am: - prev_model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format(dir, iter - 1) + prev_model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format( + dir, iter - 1) model = "nnet3-am-copy --raw=true {0}/{1}.mdl - |".format(dir, iter) else: prev_model = '{0}/{1}.raw'.format(dir, iter - 1) model = '{0}/{1}.raw'.format(dir, iter) - common_lib.RunKaldiCommand(""" -{command} {dir}/log/progress.{iter}.log \ - nnet3-info {model} '&&' \ - nnet3-show-progress --use-gpu=no {prev_model} {model} \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - mb_size=mb_size, - prev_model=prev_model, - egs_dir=egs_dir), + common_lib.run_kaldi_command( + """{command} {dir}/log/progress.{iter}.log \ + nnet3-info {model} '&&' \ + nnet3-show-progress --use-gpu=no {prev_model} {model} \ + "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} """ + """ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + mb_size=mb_size, + prev_model=prev_model, + egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) -def CombineModels(dir, num_iters, models_to_combine, egs_dir, - run_opts, background_process_handler=None, - chunk_width=None, - get_raw_nnet_from_am=True): +def combine_models(dir, num_iters, models_to_combine, egs_dir, + run_opts, background_process_handler=None, + chunk_width=None, get_raw_nnet_from_am=True): """ Now do combination. 
In the nnet3 setup, the logic for doing averaging of subsets of the models in the case where @@ -412,16 +465,17 @@ def CombineModels(dir, num_iters, models_to_combine, egs_dir, models_to_combine.add(num_iters) for iter in models_to_combine: - if get_raw_nnet_from_am: - model_file = '{0}/{1}.mdl'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) - else: - model_file = '{0}/{1}.raw'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append(model_file) + if get_raw_nnet_from_am: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append( + '"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + model_file = '{0}/{1}.raw'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append(model_file) if chunk_width is not None: # this is an RNN model @@ -430,32 +484,193 @@ def CombineModels(dir, num_iters, models_to_combine, egs_dir, mbsize = 1024 if get_raw_nnet_from_am: - out_model = "| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir=dir, num_iters=num_iters) + out_model = ("| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl " + "{dir}/combined.mdl".format(dir=dir, num_iters=num_iters)) else: out_model = '{dir}/final.raw'.format(dir=dir) - common_lib.RunKaldiCommand(""" -{command} {combine_queue_opt} {dir}/log/combine.log \ -nnet3-combine --num-iters=40 \ - --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ - "{out_model}" - """.format(command=run_opts.command, - combine_queue_opt=run_opts.combine_queue_opt, - dir=dir, raw_models=" ".join(raw_model_strings), - mbsize=mbsize, - out_model=out_model, - egs_dir=egs_dir)) + common_lib.run_kaldi_command( + """{command} {combine_queue_opt} {dir}/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} \ + "ark,bg:nnet3-merge-egs --measure-output-frames=false """ + """--minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + "{out_model}" + """.format(command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + dir=dir, raw_models=" ".join(raw_model_strings), + mbsize=mbsize, + out_model=out_model, + egs_dir=egs_dir)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. 
if get_raw_nnet_from_am: - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, - background_process_handler=background_process_handler) + compute_train_cv_probabilities( + dir, 'combined', egs_dir, run_opts, wait=False, + background_process_handler=background_process_handler) else: - ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, - background_process_handler=background_process_handler, - get_raw_nnet_from_am=False) + compute_train_cv_probabilities( + dir, 'final', egs_dir, run_opts, wait=False, + background_process_handler=background_process_handler, + get_raw_nnet_from_am=False) + + +def get_realign_iters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): + """ Takes the realign_times string and identifies the approximate + iterations at which realignments have to be done. + realign_times is a space seperated string of values between 0 and 1 + """ + realign_iters = [] + for realign_time in realign_times.split(): + realign_time = float(realign_time) + assert(realign_time > 0 and realign_time < 1) + if num_jobs_initial == num_jobs_final: + realign_iter = int(0.5 + num_iters * realign_time) + else: + realign_iter = math.sqrt((1 - realign_time) + * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, + 2)) + realign_iter = realign_iter - num_jobs_initial + realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter * num_iters + realign_iters.append(int(realign_iter)) + + return realign_iters + + +def align(dir, data, lang, run_opts, iter=None, transform_dir=None, + online_ivector_dir=None): + + alidir = '{dir}/ali{ali_suffix}'.format( + dir=dir, + ali_suffix="_iter_{0}".format(iter) if iter is not None else "") + + logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( + gpu=" using gpu " if run_opts.realign_use_gpu else " ", + num_jobs=run_opts.realign_num_jobs)) + common_lib.run_kaldi_command( + """steps/nnet3/align.sh --nj {num_jobs_align} \ + --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir}""".format( + dir=dir, align_use_gpu=("yes" + if run_opts.realign_use_gpu + else "no"), + align_cmd=run_opts.realign_command, + align_queue_opt=run_opts.realign_queue_opt, + num_jobs_align=run_opts.realign_num_jobs, + transform_dir=(transform_dir + if transform_dir is not None + else ""), + online_ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ""), + iter=iter if iter is not None else "", + alidir=alidir, + lang=lang, data=data)) + return alidir + + +def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, + prior_subset_size, num_archives, run_opts, + transform_dir=None, online_ivector_dir=None): + raise Exception("Realignment stage has not been implemented in nnet3") + logger.info("Getting average posterior for purposes of adjusting " + "the priors.") + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
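The prior-adjustment flow that follows boils down to averaging the network's softmax outputs over a sample of training frames and handing that vector to nnet3-am-adjust-priors. A minimal sketch of the averaging itself, with a generic `posteriors` iterable standing in for the nnet3-compute-from-egs pipeline; all names here are illustrative:

import numpy as np

def average_posterior(posteriors):
    # posteriors: iterable of 1-D arrays, one softmax output vector per
    # frame. The result corresponds, up to an overall scale, to the
    # post.{iter}.vec accumulator produced by the commands below.
    total = None
    count = 0
    for post in posteriors:
        post = np.asarray(post, dtype=float)
        total = post.copy() if total is None else total + post
        count += 1
    return total / count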
+ + avg_post_vec_file = compute_average_posterior( + dir, iter, prev_egs_dir, + num_archives, prior_subset_size, run_opts) + + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + logger.info("Re-adjusting priors based on computed posteriors") + model = '{0}/{1}.mdl'.format(dir, iter) + adjust_am_priors(dir, model, avg_post_vec_file, model, run_opts) + + alidir = align(dir, feat_dir, lang, run_opts, iter, + transform_dir, online_ivector_dir) + common_lib.run_kaldi_command( + """steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} \ + {alidir} {prev_egs_dir} {cur_egs_dir}""".format( + command=run_opts.command, + iter=iter, + dir=dir, + alidir=alidir, + prev_egs_dir=prev_egs_dir, + cur_egs_dir=cur_egs_dir)) + + +def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, + run_opts): + common_lib.run_kaldi_command( + """{command} {dir}/log/adjust_priors.final.log \ + nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} \ + "{output_model}" """.format( + command=run_opts.command, + dir=dir, input_model=input_model, + avg_posterior_vector=avg_posterior_vector, + output_model=output_model)) + + +def compute_average_posterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts, + get_raw_nnet_from_am=True): + """ Computes the average posterior of the network + Note: this just uses CPUs, using a smallish subset of data. + """ + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + + if run_opts.num_jobs_compute_prior > num_archives: + egs_part = 1 + else: + egs_part = 'JOB' + + if get_raw_nnet_from_am: + model = "nnet3-am-copy --raw=true {0}/combined.mdl -|".format(dir) + else: + model = "{dir}/final.raw".format(dir=dir) + + common_lib.run_kaldi_command( + """{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} \ + {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} \ + ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true \ + --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "{model}" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- \ + {dir}/post.{iter}.JOB.vec""".format( + command=run_opts.command, + dir=dir, model=model, + num_jobs_compute_prior=run_opts.num_jobs_compute_prior, + prior_queue_opt=run_opts.prior_queue_opt, + iter=iter, prior_subset_size=prior_subset_size, + egs_dir=egs_dir, egs_part=egs_part, + prior_gpu_opt=run_opts.prior_gpu_opt)) + + # make sure there is time for $dir/post.{iter}.*.vec to appear. + time.sleep(5) + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + common_lib.run_kaldi_command(""" +{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} + """.format(command=run_opts.command, + dir=dir, iter=iter, output_file=avg_post_vec_file)) + + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + return avg_post_vec_file diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 244672692e9..aa74520fc55 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -1,7 +1,7 @@ -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. 
""" This is a module with method which will be used by scripts for @@ -10,76 +10,83 @@ """ import logging -import math -import imp -import os -import sys -sys.path.append("steps/libs") -import common as common_lib +import libs.common as common_lib logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) -def GenerateEgsUsingTargets(data, targets_scp, egs_dir, - left_context, right_context, - valid_left_context, valid_right_context, - run_opts, stage=0, - feat_type='raw', online_ivector_dir=None, - target_type='dense', num_targets=-1, - samples_per_iter=20000, frames_per_eg=20, srand=0, - egs_opts=None, cmvn_opts=None, transform_dir=None): - """ +def generate_egs_using_targets(data, targets_scp, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage=0, + feat_type='raw', online_ivector_dir=None, + target_type='dense', num_targets=-1, + samples_per_iter=20000, frames_per_eg=20, + srand=0, egs_opts=None, cmvn_opts=None, + transform_dir=None): + """ Wrapper for calling steps/nnet3/get_egs_targets.sh + This method generates egs directly from an scp file of targets, instead of - getting them from the alignments (as with the method GenerateEgs). - The targets are in matrix format for target_type="dense" and in posterior - format for target_type="sparse". - If using sparse targets, num_targets must be explicity specified. - If using dense targets, num_targets is computed by reading the feature matrix dimension. + getting them from the alignments (as with the method generate_egs() in + module nnet3.train.frame_level_objf.acoustic_model). + + Args: + target_type: "dense" if the targets are in matrix format + "sparse" if the targets are in posterior format + num_targets: must be explicitly specified for "sparse" targets. 
+ For "dense" targets, this option is ignored and the target dim + is computed from the target matrix dimension + For other options, see the file steps/nnet3/get_egs_targets.sh """ if target_type == 'dense': - num_targets = common_lib.GetFeatDimFromScp(targets_scp) + num_targets = common_lib.get_feat_dim_from_scp(targets_scp) else: if num_targets == -1: - raise Exception("--num-targets is required if target-type is dense") - - common_lib.RunKaldiCommand(""" -steps/nnet3/get_egs_targets.sh {egs_opts} \ - --cmd "{command}" \ - --cmvn-opts "{cmvn_opts}" \ - --feat-type {feat_type} \ - --transform-dir "{transform_dir}" \ - --online-ivector-dir "{ivector_dir}" \ - --left-context {left_context} --right-context {right_context} \ - --valid-left-context {valid_left_context} \ - --valid-right-context {valid_right_context} \ - --stage {stage} \ - --samples-per-iter {samples_per_iter} \ - --frames-per-eg {frames_per_eg} \ - --srand {srand} \ - --target-type {target_type} \ - --num-targets {num_targets} \ - {data} {targets_scp} {egs_dir} - """.format(command=run_opts.egs_command, - cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - feat_type=feat_type, - transform_dir=transform_dir if transform_dir is not None else '', - ivector_dir=online_ivector_dir if online_ivector_dir is not None else '', - left_context=left_context, right_context=right_context, - valid_left_context=valid_left_context, - valid_right_context=valid_right_context, - stage=stage, samples_per_iter=samples_per_iter, - frames_per_eg=frames_per_eg, srand=srand, - num_targets=num_targets, - data=data, - targets_scp=targets_scp, target_type=target_type, - egs_dir=egs_dir, - egs_opts=egs_opts if egs_opts is not None else '' )) + raise Exception("--num-targets is required if " + "target-type is sparse") + common_lib.run_kaldi_command( + """steps/nnet3/get_egs_targets.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + --target-type {target_type} \ + --num-targets {num_targets} \ + {data} {targets_scp} {egs_dir} + """.format(command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + feat_type=feat_type, + transform_dir=(transform_dir + if transform_dir is not None + else ''), + ivector_dir=(online_ivector_dir + if online_ivector_dir is not None + else ''), + left_context=left_context, right_context=right_context, + valid_left_context=valid_left_context, + valid_right_context=valid_right_context, + stage=stage, samples_per_iter=samples_per_iter, + frames_per_eg=frames_per_eg, srand=srand, + num_targets=num_targets, + data=data, + targets_scp=targets_scp, target_type=target_type, + egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else '')) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index f341eddfad3..afeb8084e9e 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -1,162 +1,183 @@ #!/usr/bin/env python - -# Copyright 2016 Vijayaditya Peddinti. +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. 
+""" This script is based on steps/nnet3/chain/train.sh +""" -# this script is based on steps/nnet3/chain/train.sh - -import os -import subprocess import argparse -import sys -import pprint import logging -import imp -import traceback +import os +import pprint import shutil -import math +import sys +import traceback + +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.chain_objf.acoustic_model as chain_lib +import libs.nnet3.report.log_parse as nnet3_log_parse -common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') -chain_lib = imp.load_source('ncl', 'steps/nnet3/libs/chain_train_lib.py') -nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.info('Starting chain model trainer (train.py)') -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description=""" - Trains RNN and DNN acoustic models using the 'chain' objective function. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve', - parents=[common_train_lib.common_parser]) - # For common options defined in common_train_lib.common_parser, - # see steps/nnet3/libs/common_train_lib.py +def get_args(): + """ Get args from stdin. + + We add compulsary arguments as named arguments for readability + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. + See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains RNN and DNN acoustic models using the 'chain' + objective function.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser.parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', - default = 150, - help="Number of output labels in each example. Caution: if you double this you should halve --trainer.samples-per-iter.") + default=150, + help="""Number of output labels in each example. + Caution: if you double this you should halve + --trainer.samples-per-iter.""") # chain options parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', - default = None, action = common_train_lib.NullstrToNoneAction, + default=None, action=common_lib.NullstrToNoneAction, help="options to be be passed to chain-est-phone-lm") - parser.add_argument("--chain.l2-regularize", type=float, dest='l2_regularize', - default = 0.0, - help="Weight of regularization function which is the" - " l2-norm of the output of the network. It should be" - " used without the log-softmax layer for the outputs." 
- " As l2-norm of the log-softmax outputs can dominate" - " the objective function.") - parser.add_argument("--chain.xent-regularize", type=float, dest='xent_regularize', - default = 0.0, - help="Weight of regularization function which is the" - " cross-entropy cost the outputs.") - parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', - default = 5, help="") - parser.add_argument("--chain.left-tolerance", type=int, dest='left_tolerance', - default = 5, help="") - parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', - default = 0.00001, help="") - parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', - default=True, action=common_train_lib.StrToBoolAction, - choices = ["true", "false"], + parser.add_argument("--chain.l2-regularize", type=float, + dest='l2_regularize', default=0.0, + help="""Weight of regularization function which is the + l2-norm of the output of the network. It should be used + without the log-softmax layer for the outputs. As + l2-norm of the log-softmax outputs can dominate the + objective function.""") + parser.add_argument("--chain.xent-regularize", type=float, + dest='xent_regularize', default=0.0, + help="Weight of regularization function which is the " + "cross-entropy cost the outputs.") + parser.add_argument("--chain.right-tolerance", type=int, + dest='right_tolerance', default=5, help="") + parser.add_argument("--chain.left-tolerance", type=int, + dest='left_tolerance', default=5, help="") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, + dest='leaky_hmm_coefficient', default=0.00001, + help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, + dest='apply_deriv_weights', default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], help="") - parser.add_argument("--chain.truncate-deriv-weights", type=float, dest='truncate_deriv_weights', - default =0, - help="Can be used to set to zero the weights of derivs" - " from frames near the edges. (counts subsampled frames)") + parser.add_argument("--chain.truncate-deriv-weights", type=float, + dest='truncate_deriv_weights', default=0, + help="""Can be used to set to zero the weights of + derivs from frames near the edges. 
(counts subsampled + frames)""") parser.add_argument("--chain.frame-subsampling-factor", type=int, - dest='frame_subsampling_factor', - default = 3, - help="ratio of frames-per-second of features we train" - " on, to chain model's output") + dest='frame_subsampling_factor', default=3, + help="ratio of frames-per-second of features we " + "train on, to chain model's output") parser.add_argument("--chain.alignment-subsampling-factor", type=int, dest='alignment_subsampling_factor', - default = 3, - help="ratio of frames-per-second of input alignments to" - " chain model's output") + default=3, + help="ratio of frames-per-second of input " + "alignments to chain model's output") parser.add_argument("--chain.left-deriv-truncate", type=int, dest='left_deriv_truncate', - default = None, help="") + default=None, help="") parser.add_argument("--chain.right-deriv-truncate", type=int, dest='right_deriv_truncate', - default = None, help="") - + default=None, help="") # trainer options parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 10, + default=10, help="Number of epochs to train the model") - parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', - default=800000, - help ="Each iteration of training, see this many [input]" - " frames per job. This option is passed to get_egs.sh." - " Aim for about a minute of training time") + parser.add_argument("--trainer.frames-per-iter", type=int, + dest='frames_per_iter', default=800000, + help="""Each iteration of training, see this many + [input] frames per job. This option is passed to + get_egs.sh. Aim for about a minute of training + time""") # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0002, + parser.add_argument("--trainer.optimization.initial-effective-lrate", + type=float, dest='initial_effective_lrate', + default=0.0002, help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00002, + parser.add_argument("--trainer.optimization.final-effective-lrate", + type=float, dest='final_effective_lrate', + default=0.00002, help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', - default = 1.0, - help="Scaling factor used for scaling the parameter" - " matrices when the derivative averages are below the" - " shrink-threshold at the non-linearities") - parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', - default = 0.15, - help="If the derivative averages are below this" - " threshold we scale the parameter matrices with the" - " shrink-value. It is less than 0.25 for sigmoid non-linearities.") - parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, dest='shrink_nonlinearity', - default = "SigmoidComponent", choices = ["TanhComponent", "SigmoidComponent"], - help="The non-linear component from which the" - " deriv-avg values are going to used to compute" - " mean-deriv-avg. The mean-deriv-avg is going to be" - " compared with shrink-threshold. 
Be careful to specify" - " a shrink-threshold which is dependent on the" - " shrink-nonlinearity type") + parser.add_argument("--trainer.optimization.shrink-value", type=float, + dest='shrink_value', default=0.99, + help="""Scaling factor used for scaling the parameter + matrices when the derivative averages are below the + shrink-threshold at the non-linearities""") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, + dest='shrink_threshold', default=0.15, + help="""If the derivative averages are below this + threshold we scale the parameter matrices with the + shrink-value. It is less than 0.25 for sigmoid + non-linearities.""") + parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, + dest='shrink_nonlinearity', default="SigmoidComponent", + choices=["TanhComponent", "SigmoidComponent"], + help="""The non-linear component from which the + deriv-avg values are going to used to compute + mean-deriv-avg. The mean-deriv-avg is going to be + compared with shrink-threshold. Be careful to specify a + shrink-threshold which is dependent on the + shrink-nonlinearity type""") # RNN specific trainer options - parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', - default=512, - help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, + dest='num_chunk_per_minibatch', default=512, + help="Number of sequences to be processed in " + "parallel every minibatch") # General options - parser.add_argument("--feat-dir", type=str, required = True, - help="Directory with features used for training the neural network.") - parser.add_argument("--tree-dir", type=str, required = True, + parser.add_argument("--feat-dir", type=str, required=True, + help="Directory with features used for training " + "the neural network.") + parser.add_argument("--tree-dir", type=str, required=True, help="Tree directory") - parser.add_argument("--lat-dir", type=str, required = True, - help="Directory with alignments used for training the neural network.") - parser.add_argument("--dir", type=str, required = True, - help="Directory to store the models and all other files.") + parser.add_argument("--lat-dir", type=str, required=True, + help="Directory with numerator lattices " + "used for training the neural network.") + parser.add_argument("--dir", type=str, required=True, + help="Directory to store the models and " + "all other files.") print(' '.join(sys.argv)) print(sys.argv) args = parser.parse_args() - [args, run_opts] = ProcessArgs(args) + [args, run_opts] = process_args(args) return [args, run_opts] -def ProcessArgs(args): - # process the options + +def process_args(args): + """ Process the options got from get_args() + """ + if args.chunk_width < 1: raise Exception("--egs.chunk-width should have a minimum value of 1") @@ -166,282 +187,87 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): - raise Exception("""This scripts expects {0} to exist and have a configs - directory which is the output of make_configs.py script""") + if (not os.path.exists(args.dir) + or not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs " + "directory which is the output of " + "make_configs.py script") if args.transform_dir is None: args.transform_dir = 
args.lat_dir # set the options corresponding to args.use_gpu - run_opts = common_train_lib.RunOpts() + run_opts = common_lib.RunOpts() if args.use_gpu: - if not common_train_lib.CheckIfCudaCompiled(): - logger.warning(""" - You are running with one thread but you have not compiled - for CUDA. You may be running a setup optimized for GPUs. If you have - GPUs and have nvcc installed, go to src/ and do ./configure; make""") + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "" run_opts.combine_queue_opt = "--gpu 1" else: - logger.warning(""" - Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + logger.warning("Without using a GPU this will be very slow. " + "nnet3 does not yet support multiple threads.") run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" run_opts.combine_queue_opt = "" run_opts.command = args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) return [args, run_opts] -# Called from TrainOneIteration, this model does one iteration of training -# with 'num_jobs' jobs, and -# writes files like exp/tdnn_a/24.{1,2,3,..}.raw -def TrainNewModels(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) - - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
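# A small worked example of the per-job archive-cycling arithmetic shown just
# above; the numbers below are illustrative only (the real values come from
# the egs directory and the number of parallel jobs on this iteration).
def pick_archive(num_archives_processed, num_archives, job):
    # k is the zero-based global example index, exactly as in the loop above
    k = num_archives_processed + job - 1
    return (k % num_archives) + 1      # 1-based archive index

# with 5 archives and 3 already processed, jobs 1..4 read archives 4, 5, 1, 2
assert [pick_archive(3, 5, j) for j in range(1, 5)] == [4, 5, 1, 2]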
- frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor - # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - if job == 1: - cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) - else: - cur_cache_io_opts = cache_io_opts - - process_handle = common_train_lib.RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-chain-train {parallel_train_opts} \ - --apply-deriv-weights={app_deriv_wts} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - deriv_time_opts = deriv_time_opts, - trunc_deriv = truncate_deriv_weights, - app_deriv_wts = apply_deriv_weights, - fr_shft = frame_shift, l2 = l2_regularize, - xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, - parallel_train_opts = run_opts.parallel_train_opts, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - cache_io_opts = cur_cache_io_opts, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - if stderr_value.strip() != '': - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, truncate_deriv_weights, - run_opts): - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) - - if iter > 0: - chain_lib.ComputeProgress(dir, iter, run_opts) - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): +def train(args, run_opts, background_process_handler): + """ The main function for training. - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - cache_io_opts = "" - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - TrainNewModels(dir = dir, iter = iter, srand = srand, num_jobs = num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - raw_model_string = raw_model_string, - egs_dir = egs_dir, - apply_deriv_weights = apply_deriv_weights, - left_deriv_truncate = left_deriv_truncate, - right_deriv_truncate = right_deriv_truncate, - l2_regularize = l2_regularize, - xent_regularize = xent_regularize, - leaky_hmm_coefficient = leaky_hmm_coefficient, - momentum = momentum, - max_param_change = cur_max_param_change, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = cur_num_chunk_per_minibatch, - frame_subsampling_factor = frame_subsampling_factor, - truncate_deriv_weights = truncate_deriv_weights, - cache_io_opts = cache_io_opts, run_opts = run_opts) - - [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. 
- common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - if os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - -def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): - for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), - '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: - if not os.path.isfile(file): - raise Exception('Expected {0} to exist.'.format(file)) - - -# args is a Namespace with the required parameters -def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Check files - CheckForRequiredFiles(args.feat_dir, args.tree_dir, args.lat_dir) + chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, + args.lat_dir) # Set some variables. - num_jobs = common_train_lib.GetNumberOfJobs(args.tree_dir) - feat_dim = common_train_lib.GetFeatDim(args.feat_dir) - ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) + num_jobs = common_lib.get_number_of_leaves_from_tree(args.tree_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - common_train_lib.SplitData(args.feat_dir, num_jobs) + common_lib.split_data(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) - f = open('{0}/num_jobs'.format(args.dir), 'w') - f.write(str(num_jobs)) - f.close() + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) # Set some variables. 
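# The configs/vars file read above is a plain key=value file written by
# make_configs.py; a minimal reader along these lines would suffice (the real
# parser is parse_generic_config_vars_file in steps/libs/nnet3/train/common.py
# and may handle more cases).
def read_vars_file(path):
    variables = {}
    with open(path) as f:
        for line in f:
            if '=' not in line:
                continue
            key, value = line.strip().split('=', 1)
            variables[key.strip()] = value.strip()
    return variables

# e.g. a vars file containing
#   model_left_context=16
#   model_right_context=12
#   num_hidden_layers=5
# yields string values that the caller converts to int where needed.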
- try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] + # this is really the number of times we add layers to the network for + # discriminative pretraining num_hidden_layers = variables['num_hidden_layers'] except KeyError as e: - raise Exception("KeyError {0}: Variables need to be defined in {1}".format( - str(e), '{0}/configs'.format(args.dir))) + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context @@ -452,40 +278,49 @@ def Train(args, run_opts): # transform. if (args.stage <= -6): logger.info("Creating phone language-model") - chain_lib.CreatePhoneLm(args.dir, args.tree_dir, run_opts, lm_opts = args.lm_opts) + chain_lib.create_phone_lm(args.dir, args.tree_dir, run_opts, + lm_opts=args.lm_opts) if (args.stage <= -5): logger.info("Creating denominator FST") - chain_lib.CreateDenominatorFst(args.dir, args.tree_dir, run_opts) + chain_lib.create_denominator_fst(args.dir, args.tree_dir, run_opts) if (args.stage <= -4): - logger.info("Initializing a basic network for estimating preconditioning matrix") - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command = run_opts.command, - dir = args.dir)) + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.run_kaldi_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") # this is where get_egs.sh is called. 
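# Rough sketch of the context arithmetic fed to the egs generation call below,
# using made-up example values (the real numbers come from configs/vars and
# the command line); the chain setup additionally pads each side by half the
# frame-subsampling factor.
chunk_left_context = 0            # e.g. a TDNN; an RNN setup would use more
model_left_context = 16           # from configs/vars
frame_subsampling_factor = 3

left_context = chunk_left_context + model_left_context            # 16
egs_left_context = left_context + frame_subsampling_factor // 2   # 17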
- chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, - left_context + args.frame_subsampling_factor/2, - right_context + args.frame_subsampling_factor/2, - run_opts, - left_tolerance = args.left_tolerance, - right_tolerance = args.right_tolerance, - frame_subsampling_factor = args.frame_subsampling_factor, - alignment_subsampling_factor = args.alignment_subsampling_factor, - frames_per_eg = args.chunk_width, - egs_opts = args.egs_opts, - cmvn_opts = args.cmvn_opts, - online_ivector_dir = args.online_ivector_dir, - frames_per_iter = args.frames_per_iter, - srand = args.srand, - transform_dir = args.transform_dir, - stage = args.egs_stage) + chain_lib.generate_chain_egs( + dir=args.dir, data=args.feat_dir, + latdir=args.lat_dir, egs_dir=default_egs_dir, + left_context=left_context + args.frame_subsampling_factor/2, + right_context=right_context + args.frame_subsampling_factor/2, + valid_left_context=(left_context + args.frame_subsampling_factor/2 + + args.chunk_width), + valid_right_context=(right_context + + args.frame_subsampling_factor/2 + + args.chunk_width), + run_opts=run_opts, + left_tolerance=args.left_tolerance, + right_tolerance=args.right_tolerance, + frame_subsampling_factor=args.frame_subsampling_factor, + alignment_subsampling_factor=args.alignment_subsampling_factor, + frames_per_eg=args.chunk_width, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + frames_per_iter=args.frames_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir @@ -494,154 +329,183 @@ def Train(args, run_opts): [egs_left_context, egs_right_context, frames_per_eg, num_archives] = ( - common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, - left_context, right_context) ) + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + left_context, right_context)) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor if (args.num_jobs_final > num_archives_expanded): - raise Exception('num_jobs_final cannot exceed the expanded number of archives') + raise Exception('num_jobs_final cannot exceed the ' + 'expanded number of archives') # copy the properties of the egs to dir for # use during decoding - common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if (args.stage <= -2): logger.info('Computing the preconditioning matrix for input features') - chain_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs = args.max_lda_jobs, - rand_prune = args.rand_prune) + chain_lib.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - chain_lib.PrepareInitialAcousticModel(args.dir, run_opts) + chain_lib.prepare_initial_acoustic_model(args.dir, run_opts) - file_handle = open("{0}/frame_subsampling_factor".format(args.dir),"w") - file_handle.write(str(args.frame_subsampling_factor)) - file_handle.close() + with open("{0}/frame_subsampling_factor".format(args.dir), "w") as f: + f.write(str(args.frame_subsampling_factor)) - # set num_iters so that as close as possible, we process the data $num_epochs - # times, i.e. 
$num_iters*$avg_num_jobs) == $num_epochs*$num_archives,
-    # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+    # set num_iters so that as close as possible, we process the data
+    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
+    # $num_epochs*$num_archives, where
+    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
     num_archives_to_process = args.num_epochs * num_archives_expanded
     num_archives_processed = 0
-    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
-
-    num_iters_combine = common_train_lib.VerifyIterations(
-                        num_iters, args.num_epochs,
-                        num_hidden_layers, num_archives_expanded,
-                        args.max_models_combine, args.add_layers_period,
-                        args.num_jobs_final)
-
-    LearningRate = (lambda iter, current_num_jobs, num_archives_processed:
-                    common_train_lib.GetLearningRate(
-                        iter, current_num_jobs, num_iters,
-                        num_archives_processed,
-                        num_archives_to_process,
-                        args.initial_effective_lrate,
-                        args.final_effective_lrate)
-                    )
-
-    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
+    num_iters = ((num_archives_to_process * 2)
+                 / (args.num_jobs_initial + args.num_jobs_final))
+
+    models_to_combine = common_train_lib.verify_iterations(
+        num_iters, args.num_epochs,
+        num_hidden_layers, num_archives_expanded,
+        args.max_models_combine, args.add_layers_period,
+        args.num_jobs_final)
+
+    def learning_rate(iter, current_num_jobs, num_archives_processed):
+        return common_train_lib.get_learning_rate(iter, current_num_jobs,
+                                                   num_iters,
+                                                   num_archives_processed,
+                                                   num_archives_to_process,
+                                                   args.initial_effective_lrate,
+                                                   args.final_effective_lrate)
+
+    logger.info("Training will run for {0} epochs = "
+                "{1} iterations".format(args.num_epochs, num_iters))
+
     for iter in range(num_iters):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

-        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+        current_num_jobs = int(0.5 + args.num_jobs_initial
+                               + (args.num_jobs_final - args.num_jobs_initial)
+                               * float(iter) / num_iters)

         if args.stage <= iter:
+            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)
             shrinkage_value = 1.0
             if args.shrink_value != 1.0:
-                model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter)
                 shrinkage_value = (args.shrink_value
-                                   if common_train_lib.DoShrinkage(iter, model_file,
-                                                                   args.shrink_nonlinearity,
-                                                                   args.shrink_threshold)
+                                   if common_train_lib.do_shrinkage(
+                                        iter, model_file,
+                                        args.shrink_nonlinearity,
+                                        args.shrink_threshold)
                                    else 1
                                    )

-            logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value))
-
-            TrainOneIteration(dir = args.dir, iter = iter, srand = args.srand,
-                              egs_dir = egs_dir,
-                              num_jobs = current_num_jobs,
-                              num_archives_processed = num_archives_processed,
-                              num_archives = num_archives,
-                              learning_rate = LearningRate(iter, current_num_jobs, num_archives_processed),
-                              shrinkage_value = shrinkage_value,
-                              num_chunk_per_minibatch = args.num_chunk_per_minibatch,
-                              num_hidden_layers = num_hidden_layers,
-                              add_layers_period = args.add_layers_period,
-                              apply_deriv_weights = args.apply_deriv_weights,
-                              left_deriv_truncate = args.left_deriv_truncate,
-                              right_deriv_truncate = args.right_deriv_truncate,
-                              l2_regularize = args.l2_regularize,
-                              xent_regularize = args.xent_regularize,
-
leaky_hmm_coefficient = args.leaky_hmm_coefficient, - momentum = args.momentum, - max_param_change = args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - frame_subsampling_factor = args.frame_subsampling_factor, - truncate_deriv_weights = args.truncate_deriv_weights, - run_opts = run_opts) + logger.info("On iteration {0}, learning rate is {1} and " + "shrink value is {2}.".format( + iter, learning_rate(iter, current_num_jobs, + num_archives_processed), + shrinkage_value)) + + chain_lib.train_one_iteration( + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=learning_rate(iter, current_num_jobs, + num_archives_processed), + shrinkage_value=shrinkage_value, + num_chunk_per_minibatch=args.num_chunk_per_minibatch, + num_hidden_layers=num_hidden_layers, + add_layers_period=args.add_layers_period, + apply_deriv_weights=args.apply_deriv_weights, + left_deriv_truncate=args.left_deriv_truncate, + right_deriv_truncate=args.right_deriv_truncate, + l2_regularize=args.l2_regularize, + xent_regularize=args.xent_regularize, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + frame_subsampling_factor=args.frame_subsampling_factor, + truncate_deriv_weights=args.truncate_deriv_weights, + run_opts=run_opts, + background_process_handler=background_process_handler) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain conditions - common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval) + # do a clean up everythin but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval if iter % reporting_iter_interval == 0: - # lets do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, key="log-probability") + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - common_train_lib.SendMail(message, subject, args.email) + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts) + chain_lib.combine_models( + args.dir, num_iters, models_to_combine, + args.num_chunk_per_minibatch, egs_dir, + args.leaky_hmm_coefficient, args.l2_regularize, + args.xent_regularize, run_opts, + background_process_handler=background_process_handler) if args.cleanup: - logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) remove_egs = args.remove_egs if args.egs_dir is not None: # this egs_dir was not created by this experiment so we will not # delete it remove_egs = False - 
common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval = args.preserve_model_interval, - remove_egs = remove_egs) + common_train_lib.clean_nnet_dir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs) # do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: - common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) - report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") - report_handle.write(report) - report_handle.close() + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) - os.system("steps/info/chain_dir_info.pl " + args.dir) + common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) -def Main(): - [args, run_opts] = GetArgs() + +def main(): + [args, run_opts] = get_args() try: - Train(args, run_opts) + background_process_handler = common_lib.BackgroundProcessHandler( + polling_time=args.background_polling_time) + train(args, run_opts, background_process_handler) + background_process_handler.ensure_processes_are_done() except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - common_train_lib.SendMail(message, message, args.email) + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) traceback.print_exc() raise e + if __name__ == "__main__": - Main() + main() diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py deleted file mode 100755 index dd76edc5b33..00000000000 --- a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright 2016 Vijayaditya Peddinti. -# Apache 2.0. 
- -from __future__ import division -import sys, glob, re, math, datetime, argparse -import imp - -ntl = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') - -#exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] -def ParseProgressLogsForNonlinearityStats(exp_dir): - progress_log_files = "%s/log/progress.*.log" % (exp_dir) - stats_per_component_per_iter = {} - - progress_log_lines = ntl.RunKaldiCommand('grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files))[0] - - parse_regex = re.compile(".*progress.([0-9]+).log:component name=(.+) type=(.*)Component,.*value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]") - for line in progress_log_lines.split("\n") : - mat_obj = parse_regex.search(line) - if mat_obj is None: - continue - groups = mat_obj.groups() - # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', '0.134', '0.0397') - iteration = int(groups[0]) - component_name = groups[1] - component_type = groups[2] - value_mean = float(groups[3]) - value_stddev = float(groups[4]) - deriv_mean = float(groups[5]) - deriv_stddev = float(groups[6]) - try: - stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] - except KeyError: - stats_per_component_per_iter[component_name] = {} - stats_per_component_per_iter[component_name]['type'] = component_type - stats_per_component_per_iter[component_name]['stats'] = {} - stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] - - return stats_per_component_per_iter - -def ParseDifferenceString(string): - dict = {} - for parts in string.split(): - sub_parts = parts.split(":") - dict[sub_parts[0]] = float(sub_parts[1]) - return dict - -#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component name=BLstm1_forward_c type=ClipGradientComponent, dim=512, norm-based-clipping=true, clipping-threshold=30, clipped-proportion=0.000565527, self-repair-clipped-proportion-threshold=0.01, self-repair-target=0, self-repair-scale=1 - -class MalformedClippedProportionLineException(Exception): - def __init__(self, line): - Exception.__init__(self, "Malformed line encountered while trying to" - " extract clipped-proportions.\n"+line) - -def ParseProgressLogsForClippedProportion(exp_dir): - - progress_log_files = "%s/log/progress.*.log" % (exp_dir) - component_names = set([]) - progress_log_lines = ntl.RunKaldiCommand('grep -e "{0}" {1}'.format("clipped-proportion", progress_log_files))[0] - parse_regex = re.compile(".*progress\.([0-9]+)\.log:component name=(.*) type=.* clipped-proportion=([0-9\.e\-]+)") - - cp_per_component_per_iter = {} - - max_iteration = 0 - component_names = set([]) - for line in progress_log_lines.split("\n") : - mat_obj = parse_regex.search(line) - if mat_obj is None: - if line.strip() == "": - continue - raise MalformedClippedProportionLineException(line) - groups = mat_obj.groups() - iteration = int(groups[0]) - max_iteration = max(max_iteration, iteration) - name = groups[1] - clipped_proportion = float(groups[2]) - if clipped_proportion > 1: 
- raise MalformedClippedProportionLineException(line) - if not cp_per_component_per_iter.has_key(iteration): - cp_per_component_per_iter[iteration] = {} - cp_per_component_per_iter[iteration][name] = clipped_proportion - component_names.add(name) - component_names = list(component_names) - component_names.sort() - - # re arranging the data into an array - # and into an cp_per_iter_per_component - cp_per_iter_per_component = {} - for component_name in component_names: - cp_per_iter_per_component[component_name] = [] - data = [] - data.append(["iteration"]+component_names) - for iter in range(max_iteration+1): - if not cp_per_component_per_iter.has_key(iter): - continue - comp_dict = cp_per_component_per_iter[iter] - row = [iter] - for component in component_names: - try: - row.append(comp_dict[component]) - cp_per_iter_per_component[component].append([iter, comp_dict[component]]) - except KeyError: - # if clipped proportion is not available for a particular - # component it is set to None - # this usually happens during layer-wise discriminative training - row.append(None) - data.append(row) - - - return {'table' : data, - 'cp_per_component_per_iter' : cp_per_component_per_iter, - 'cp_per_iter_per_component' : cp_per_iter_per_component} - -#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] -def ParseProgressLogsForParamDiff(exp_dir, pattern, logger = None): - if pattern not in set(["Relative parameter differences", "Parameter differences"]): - raise Exception("Unknown value for pattern : {0}".format(pattern)) - - progress_log_files = "%s/log/progress.*.log" % (exp_dir) - progress_per_iter = {} - component_names = set([]) - progress_log_lines = ntl.RunKaldiCommand('grep -e "{0}" {1}'.format(pattern, progress_log_files))[0] - parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern)) - for line in progress_log_lines.split("\n") : - mat_obj = parse_regex.search(line) - if mat_obj is None: - continue - groups = mat_obj.groups() - iteration = groups[0] - differences = ParseDifferenceString(groups[1]) - component_names = component_names.union(differences.keys()) - progress_per_iter[int(iteration)] = differences - - component_names = list(component_names) - component_names.sort() - # rearranging the parameter differences available per iter - # into parameter differences per component - progress_per_component = {} - for cn in component_names: - progress_per_component[cn] = {} - - max_iter = max(progress_per_iter.keys()) - total_missing_iterations = 0 - gave_user_warning = False - for iter in range(max_iter + 1): - try: - component_dict = progress_per_iter[iter] - except KeyError: - continue - - for component_name in component_names: - try: - progress_per_component[component_name][iter] = component_dict[component_name] - except KeyError: - total_missing_iterations += 1 - # the component was not found this iteration, may be because of layerwise discriminative training - pass - if 
(total_missing_iterations/len(component_names) > 20) and not gave_user_warning and logger is not None: - logger.warning("There are more than {0} missing iterations per component. Something might be wrong.".format(total_missing_iterations/len(component_names))) - gave_user_warning = True - - return {'progress_per_component' : progress_per_component, - 'component_names' : component_names, - 'max_iter' : max_iter} - -def ParseTrainLogs(exp_dir): - train_log_files = "%s/log/train.*.log" % (exp_dir) - train_log_lines = ntl.RunKaldiCommand('grep -e Accounting {0}'.format(train_log_files))[0] - parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*") - - train_times = {} - for line in train_log_lines.split('\n'): - mat_obj = parse_regex.search(line) - if mat_obj is not None: - groups = mat_obj.groups() - try: - train_times[int(groups[0])][int(groups[1])] = float(groups[2]) - except KeyError: - train_times[int(groups[0])] = {} - train_times[int(groups[0])][int(groups[1])] = float(groups[2]) - iters = train_times.keys() - for iter in iters: - values = train_times[iter].values() - train_times[iter] = max(values) - return train_times - -def ParseProbLogs(exp_dir, key = 'accuracy'): - train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) - valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) - train_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, train_prob_files), wait = True)[0] - valid_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, valid_prob_files))[0] - - #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra - #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames. - parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-e]+) .*per frame") - train_loss={} - valid_loss={} - - - for line in train_prob_strings.split('\n'): - mat_obj = parse_regex.search(line) - if mat_obj is not None: - groups = mat_obj.groups() - if groups[1] == key: - train_loss[int(groups[0])] = groups[2] - for line in valid_prob_strings.split('\n'): - mat_obj = parse_regex.search(line) - if mat_obj is not None: - groups = mat_obj.groups() - if groups[1] == key: - valid_loss[int(groups[0])] = groups[2] - iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) - iters.sort() - return map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters) - -def GenerateAccuracyReport(exp_dir, key = "accuracy"): - times = ParseTrainLogs(exp_dir) - data = ParseProbLogs(exp_dir, key) - report = [] - report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") - for x in data: - try: - report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError: - continue - - total_time = 0 - for iter in times.keys(): - total_time += times[iter] - report.append("Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time)))) - return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 2fd5f871f55..77a37d52fb2 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -1,108 +1,121 @@ #!/usr/bin/env python - -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. +""" This script is based on steps/nnet3/tdnn/train.sh +""" -# this script is based on steps/nnet3/tdnn/train.sh - - -import os -import subprocess import argparse -import sys -import pprint import logging -import imp -import traceback +import os +import pprint import shutil +import sys +import traceback + +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.frame_level_objf as train_lib +import libs.nnet3.report.log_parse as nnet3_log_parse -common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.info('Starting DNN trainer (train_dnn.py)') -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description=""" - Trains a feed forward DNN acoustic model using the cross-entropy objective. - DNNs include simple DNNs, TDNNs and CNNs. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler='resolve', - parents=[common_train_lib.common_parser]) - # For common options defined in common_train_lib.common_parser, - # see steps/nnet3/libs/common_train_lib.py +def get_args(): + """ Get args from stdin. 
+ + We add compulsary arguments as named arguments for readability + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. + See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains a feed forward DNN acoustic model using the + cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser.parser]) # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default=8, help="Number of output labels per example") - parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', - default=512, - help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.minibatch-size", + type=float, dest='minibatch_size', default=512, + help="Size of the minibatch used to compute the " + "gradient") # General options parser.add_argument("--feat-dir", type=str, required=True, - help="Directory with features used for training the neural network.") + help="Directory with features used for training " + "the neural network.") parser.add_argument("--lang", type=str, required=True, help="Language directory") parser.add_argument("--ali-dir", type=str, required=True, - help="Directory with alignments used for training the neural network.") + help="Directory with alignments used for training " + "the neural network.") parser.add_argument("--dir", type=str, required=True, - help="Directory to store the models and all other files.") + help="Directory to store the models and " + "all other files.") print(' '.join(sys.argv)) print(sys.argv) args = parser.parse_args() - [args, run_opts] = ProcessArgs(args) + [args, run_opts] = process_args(args) return [args, run_opts] -def ProcessArgs(args): - # process the options + +def process_args(args): + """ Process the options got from get_args() + """ + if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs" - " directory which is the output of make_configs.py script") + if (not os.path.exists(args.dir) + or not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs " + "directory which is the output of " + "make_configs.py script") if args.transform_dir is None: args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = common_train_lib.RunOpts() + run_opts = common_lib.RunOpts() if args.use_gpu: - if not common_train_lib.CheckIfCudaCompiled(): - logger.warning(""" - You are running with one thread but you have not compiled - for CUDA. You may be running a setup optimized for GPUs. If you have - GPUs and have nvcc installed, go to src/ and do ./configure; make""") + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. 
+ If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "" run_opts.combine_queue_opt = "--gpu 1" run_opts.prior_gpu_opt = "--use-gpu=yes" run_opts.prior_queue_opt = "--gpu 1" - else: - logger.warning(""" - Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + logger.warning("Without using a GPU this will be very slow. " + "nnet3 does not yet support multiple threads.") run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" @@ -111,44 +124,55 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "" run_opts.command = args.command - run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -# args is a Namespace with the required parameters -def Train(args, run_opts): + +def train(args, run_opts, background_process_handler): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. - num_leaves = common_train_lib.GetNumberOfLeavesFromTree(args.ali_dir) - num_jobs = common_train_lib.GetNumberOfJobs(args.ali_dir) - feat_dim = common_train_lib.GetFeatDim(args.feat_dir) - ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) + # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir) + num_jobs = common_lib.get_number_of_jobs(args.ali_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - common_train_lib.SplitData(args.feat_dir, num_jobs) + common_lib.split_data(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) - f = open('{0}/num_jobs'.format(args.dir), 'w') - f.write(str(num_jobs)) - f.close() + + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) # Set some variables. - try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining + # this is really the number of times we add layers to the network for + # discriminative pretraining + num_hidden_layers = variables['num_hidden_layers'] except KeyError as e: - raise Exception("KeyError {0}: Variables need to be defined in {1}".format( - str(e), '{0}/configs'.format(args.dir))) + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context @@ -159,28 +183,32 @@ def Train(args, run_opts): # transform. 
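# The learning_rate() helper defined further down in this script delegates to
# get_learning_rate() in steps/libs/nnet3/train/common.py.  That schedule is
# assumed to interpolate geometrically between the initial and final effective
# learning rates as training progresses; this is only a sketch of the idea,
# and the library code should be treated as authoritative.
import math

def effective_lrate(initial_lrate, final_lrate,
                    num_archives_processed, num_archives_to_process):
    frac = float(num_archives_processed) / num_archives_to_process
    return initial_lrate * math.exp(frac * math.log(final_lrate / initial_lrate))

# halfway through, the rate sits at the geometric mean of the two endpoints
assert abs(effective_lrate(2e-4, 2e-5, 50, 100) - math.sqrt(2e-4 * 2e-5)) < 1e-12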
if (args.stage <= -5): - logger.info("Initializing a basic network for estimating preconditioning matrix") - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command=run_opts.command, - dir=args.dir)) + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.run_kaldi_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") - train_lib.GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, - left_context, right_context, - left_context, right_context, run_opts, - frames_per_eg=args.frames_per_eg, - srand=args.srand, - egs_opts=args.egs_opts, - cmvn_opts=args.cmvn_opts, - online_ivector_dir=args.online_ivector_dir, - samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, - stage=args.egs_stage) + train_lib.acoustic_model.generate_egs( + data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, + left_context=left_context, right_context=right_context, + valid_left_context=left_context, + valid_right_context=right_context, + run_opts=run_opts, + frames_per_eg=args.frames_per_eg, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage) if args.egs_dir is None: egs_dir = default_egs_dir @@ -189,77 +217,81 @@ def Train(args, run_opts): [egs_left_context, egs_right_context, frames_per_eg, num_archives] = ( - common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, - left_context, right_context) ) + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + left_context, right_context)) assert(args.frames_per_eg == frames_per_eg) if (args.num_jobs_final > num_archives): - raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + raise Exception('num_jobs_final cannot exceed the number of archives ' + 'in the egs directory') # copy the properties of the egs to dir for # use during decoding - common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if (args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + train_lib.common.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -2): logger.info("Computing initial vector for FixedScaleComponent before" " softmax, using priors^{prior_scale} and rescaling to" - " average 1".format(prior_scale=args.presoftmax_prior_scale_power)) + " average 1".format( + prior_scale=args.presoftmax_prior_scale_power)) - common_train_lib.ComputePresoftmaxPriorScale( + common_train_lib.compute_presoftmax_prior_scale( args.dir, args.ali_dir, num_jobs, run_opts, presoftmax_prior_scale_power=args.presoftmax_prior_scale_power) - if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - train_lib.PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + 
train_lib.acoustic_model.prepare_initial_acoustic_model(
+            args.dir, args.ali_dir, run_opts)

-
-    # set num_iters so that as close as possible, we process the data $num_epochs
-    # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives,
-    # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+    # set num_iters so that as close as possible, we process the data
+    # $num_epochs times, i.e. $num_iters*$avg_num_jobs) ==
+    # $num_epochs*$num_archives, where
+    # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
     num_archives_expanded = num_archives * args.frames_per_eg
     num_archives_to_process = args.num_epochs * num_archives_expanded
     num_archives_processed = 0
-    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
+    num_iters = ((num_archives_to_process * 2)
+                 / (args.num_jobs_initial + args.num_jobs_final))

-    models_to_combine = common_train_lib.VerifyIterations(
+    models_to_combine = common_train_lib.verify_iterations(
        num_iters, args.num_epochs,
        num_hidden_layers, num_archives_expanded,
        args.max_models_combine, args.add_layers_period,
        args.num_jobs_final)

-    LearningRate = (lambda iter, current_num_jobs, num_archives_processed:
-                    common_train_lib.GetLearningRate(
-                        iter, current_num_jobs, num_iters,
-                        num_archives_processed,
-                        num_archives_to_process,
-                        args.initial_effective_lrate,
-                        args.final_effective_lrate)
-                    )
+    def learning_rate(iter, current_num_jobs, num_archives_processed):
+        return common_train_lib.get_learning_rate(iter, current_num_jobs,
+                                                   num_iters,
+                                                   num_archives_processed,
+                                                   num_archives_to_process,
+                                                   args.initial_effective_lrate,
+                                                   args.final_effective_lrate)
+
+    logger.info("Training will run for {0} epochs = "
+                "{1} iterations".format(args.num_epochs, num_iters))

-    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
     for iter in range(num_iters):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return

-        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+        current_num_jobs = int(0.5 + args.num_jobs_initial
+                               + (args.num_jobs_final - args.num_jobs_initial)
+                               * float(iter) / num_iters)

         if args.stage <= iter:
-            model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter)
-
             logger.info("On iteration {0}, learning rate is {1}.".format(
-                iter, LearningRate(iter, current_num_jobs,
-                                   num_archives_processed)))
+                iter, learning_rate(iter, current_num_jobs,
+                                    num_archives_processed)))

-            train_lib.TrainOneIteration(
+            train_lib.common.train_one_iteration(
                 dir=args.dir,
                 iter=iter,
                 srand=args.srand,
@@ -267,7 +299,8 @@ def Train(args, run_opts):
                 num_jobs=current_num_jobs,
                 num_archives_processed=num_archives_processed,
                 num_archives=num_archives,
-                learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed),
+                learning_rate=learning_rate(iter, current_num_jobs,
+                                            num_archives_processed),
                 minibatch_size=args.minibatch_size,
                 frames_per_eg=args.frames_per_eg,
                 num_hidden_layers=num_hidden_layers,
@@ -277,77 +310,92 @@ def Train(args, run_opts):
                 momentum=args.momentum,
                 max_param_change=args.max_param_change,
                 shuffle_buffer_size=args.shuffle_buffer_size,
-                run_opts=run_opts)
+                run_opts=run_opts,
+                background_process_handler=background_process_handler)

         if args.cleanup:
-            # do a clean up everythin but the last 2 models, under certain conditions
-            common_train_lib.RemoveModel(
+            # do a clean up of everything but the last 2 models, under certain
+            # 
conditions + common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval if iter % reporting_iter_interval == 0: - # lets do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) - common_train_lib.SendMail(message, subject, args.email) + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - train_lib.CombineModels( + train_lib.common.combine_models( args.dir, num_iters, models_to_combine, - num_iters_combine, egs_dir, run_opts) + egs_dir, run_opts, + background_process_handler=background_process_handler) if args.stage <= num_iters + 1: - logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = train_lib.ComputeAveragePosterior( - args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + logger.info("Getting average posterior for purposes of " + "adjusting the priors.") + avg_post_vec_file = train_lib.common.compute_average_posterior( + args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir=args.dir) final_model = "{dir}/final.mdl".format(dir=args.dir) - train_lib.AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, - final_model, run_opts) + train_lib.common.adjust_am_priors(args.dir, combined_model, + avg_post_vec_file, final_model, + run_opts) if args.cleanup: - logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) remove_egs = args.remove_egs if args.egs_dir is not None: # this egs_dir was not created by this experiment so we will not # delete it remove_egs = False - common_train_lib.CleanNnetDir( + common_train_lib.clean_nnet_dir( args.dir, num_iters, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) # do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: - common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) + + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) - report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") - report_handle.write(report) - report_handle.close() + common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) - os.system("steps/info/nnet3_dir_info.pl " + args.dir) -def Main(): - [args, run_opts] = GetArgs() +def main(): + [args, run_opts] = get_args() try: - Train(args, run_opts) + background_process_handler = common_lib.BackgroundProcessHandler( + polling_time=args.background_polling_time) + train(args, run_opts, background_process_handler) + 
background_process_handler.ensure_processes_are_done() except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) - common_train_lib.SendMail(message, message, args.email) + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) traceback.print_exc() raise e + if __name__ == "__main__": - Main() + main() diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 65d5b56bfc2..8eefe174f99 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -1,98 +1,112 @@ #!/usr/bin/env python - -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. +""" This script is similar to steps/nnet3/train_dnn.py but trains a +raw neural network instead of an acoustic model. +""" -# this script is based on steps/nnet3/tdnn/train_raw_nnet.sh - - +import argparse +import logging +import pprint import os import subprocess -import argparse import sys -import pprint -import logging -import imp import traceback -common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.frame_level_objf as train_lib +import libs.nnet3.report.log_parse as nnet3_log_parse + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.info('Starting raw DNN trainer (train_raw_dnn.py)') -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description=""" - Trains a feed forward raw DNN (without transition model) - using the cross-entropy objective. - DNNs include simple DNNs, TDNNs and CNNs. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler='resolve', - parents=[common_train_lib.common_parser]) - # For common options defined in common_train_lib.common_parser, - # see steps/nnet3/libs/common_train_lib.py +def get_args(): + """ Get args from stdin. + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. + See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains a feed forward raw DNN (without transition model) + using frame-level objectives like cross-entropy and mean-squared-error. 
+ DNNs include simple DNNs, TDNNs and CNNs.""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser.parser]) # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default=8, help="Number of output labels per example") - parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', - default=512, - help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.minibatch-size", + type=float, dest='minibatch_size', default=512, + help="Size of the minibatch used to compute the " + "gradient") # General options parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, - default=True, choices=["true", "false"], - help="Train neural network using dense targets") + parser.add_argument("--use-dense-targets", type=str, + action=common_lib.StrToBoolAction, + default=True, choices=["true", "false"], + help="Train neural network using dense targets") parser.add_argument("--feat-dir", type=str, required=True, - help="Directory with features used for training the neural network.") + help="Directory with features used for training " + "the neural network.") parser.add_argument("--targets-scp", type=str, required=True, help="Target for training neural network.") parser.add_argument("--dir", type=str, required=True, - help="Directory to store the models and all other files.") + help="Directory to store the models and " + "all other files.") print(' '.join(sys.argv)) print(sys.argv) args = parser.parse_args() - [args, run_opts] = ProcessArgs(args) + [args, run_opts] = process_args(args) return [args, run_opts] -def ProcessArgs(args): - # process the options + +def process_args(args): + """ Process the options got from get_args() + """ + if args.frames_per_eg < 1: raise Exception("--egs.frames-per-eg should have a minimum value of 1") - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): - raise Exception("""This scripts expects {0} to exist and have a configs - directory which is the output of make_configs.py script""") + if (not os.path.exists(args.dir) + or not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs " + "directory which is the output of " + "make_configs.py script") # set the options corresponding to args.use_gpu - run_opts = common_train_lib.RunOpts() + run_opts = common_lib.RunOpts() if args.use_gpu: - if not common_train_lib.CheckIfCudaCompiled(): - logger.warning(""" - You are running with one thread but you have not compiled - for CUDA. You may be running a setup optimized for GPUs. If you have - GPUs and have nvcc installed, go to src/ and do ./configure; make""") + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "" @@ -101,8 +115,8 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "--gpu 1" else: - logger.warning(""" - Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + logger.warning("Without using a GPU this will be very slow. 
" + "nnet3 does not yet support multiple threads.") run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" @@ -111,39 +125,53 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "" run_opts.command = args.command - run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -# args is a Namespace with the required parameters -def Train(args, run_opts): + +def train(args, run_opts, background_process_handler): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. - feat_dim = common_train_lib.GetFeatDim(args.feat_dir) - ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) # split the training data into parts for individual jobs - common_train_lib.SplitData(args.feat_dir, args.nj) + common_lib.split_data(args.feat_dir, num_jobs) + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) # Set some variables. - try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining - add_lda = common_train_lib.StrToBool(variables['add_lda']) - include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) + # this is really the number of times we add layers to the network for + # discriminative pretraining + num_hidden_layers = variables['num_hidden_layers'] + add_lda = common_lib.str_to_bool(variables['add_lda']) + include_log_softmax = common_lib.str_to_bool( + variables['include_log_softmax']) except KeyError as e: - raise Exception("KeyError {0}: Variables need to be defined in {1}".format( - str(e), '{0}/configs'.format(args.dir))) + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context @@ -155,11 +183,11 @@ def Train(args, run_opts): if (args.stage <= -5): logger.info("Initializing a basic network") - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command=run_opts.command, - dir=args.dir)) + common_lib.run_kaldi_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: @@ -170,29 +198,32 @@ def Train(args, run_opts): try: num_targets = int(variables['num_targets']) except KeyError as e: - 
raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + raise Exception("KeyError {0}: Variables need to be defined " + "in {1}".format( str(e), '{0}/configs'.format(args.dir))) - if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: + if common_lib.get_feat_dim_from_scp(targets_scp) != num_targets: raise Exception("Mismatch between num-targets provided to " "script vs configs") else: target_type = "sparse" - - train_lib.GenerateEgsUsingTargets( - args.feat_dir, args.targets_scp, default_egs_dir, - left_context, right_context, - left_context, right_context, run_opts, - frames_per_eg=args.frames_per_eg, - srand=args.srand, - egs_opts=args.egs_opts, - cmvn_opts=args.cmvn_opts, - online_ivector_dir=args.online_ivector_dir, - samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, - stage=args.egs_stage, - target_type=target_type, - num_targets=num_targets) + train_lib.raw_model.generate_egs_from_targets( + data=args.feat_dir, targets_scp=args.targets_scp, + egs_dir=default_egs_dir, + left_context=left_context, right_context=right_context, + valid_left_context=left_context, + valid_right_context=right_context, + run_opts=run_opts, + frames_per_eg=args.frames_per_eg, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage, + target_type=target_type, + num_targets=num_targets) if args.egs_dir is None: egs_dir = default_egs_dir @@ -201,148 +232,173 @@ def Train(args, run_opts): [egs_left_context, egs_right_context, frames_per_eg, num_archives] = ( - common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, - left_context, right_context) ) + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + left_context, right_context)) assert(args.frames_per_eg == frames_per_eg) if (args.num_jobs_final > num_archives): - raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + raise Exception('num_jobs_final cannot exceed the number of archives ' + 'in the egs directory') # copy the properties of the egs to dir for # use during decoding - common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if (add_lda and args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + train_lib.common.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -1): logger.info("Preparing the initial network.") - common_train_lib.PrepareInitialNetwork(args.dir, run_opts) - + common_train_lib.prepare_initial_network(args.dir, run_opts) - # set num_iters so that as close as possible, we process the data $num_epochs - # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, - # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + # set num_iters so that as close as possible, we process the data + # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == + # $num_epochs*$num_archives, where + # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
    num_archives_expanded = num_archives * args.frames_per_eg
    num_archives_to_process = args.num_epochs * num_archives_expanded
    num_archives_processed = 0
-    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
-
-    num_iters_combine = common_train_lib.VerifyIterations(
-                                 num_iters, args.num_epochs,
-                                 num_hidden_layers, num_archives_expanded,
-                                 args.max_models_combine, args.add_layers_period,
-                                 args.num_jobs_final)
-
-    LearningRate = (lambda iter, current_num_jobs, num_archives_processed:
-                        common_train_lib.GetLearningRate(
-                            iter, current_num_jobs, num_iters,
-                            num_archives_processed,
-                            num_archives_to_process,
-                            args.initial_effective_lrate,
-                            args.final_effective_lrate)
-                    )
-
-    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
+    num_iters = ((num_archives_to_process * 2)
+                 / (args.num_jobs_initial + args.num_jobs_final))
+
+    models_to_combine = common_train_lib.verify_iterations(
+        num_iters, args.num_epochs,
+        num_hidden_layers, num_archives_expanded,
+        args.max_models_combine, args.add_layers_period,
+        args.num_jobs_final)
+
+    def learning_rate(iter, current_num_jobs, num_archives_processed):
+        return common_train_lib.get_learning_rate(iter, current_num_jobs,
+                                                  num_iters,
+                                                  num_archives_processed,
+                                                  num_archives_to_process,
+                                                  args.initial_effective_lrate,
+                                                  args.final_effective_lrate)
+
+    logger.info("Training will run for {0} epochs = "
+                "{1} iterations".format(args.num_epochs, num_iters))
+
    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return
-        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+        current_num_jobs = int(0.5 + args.num_jobs_initial
+                               + (args.num_jobs_final - args.num_jobs_initial)
+                               * float(iter) / num_iters)
 
        if args.stage <= iter:
-            model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)
-
-            logger.info("On iteration {0}, learning rate is {1}.".format(iter, LearningRate(iter, current_num_jobs, num_archives_processed)))
-
-            train_lib.TrainOneIteration(dir=args.dir,
-                                        iter=iter,
-                                        srand=args.srand,
-                                        egs_dir=egs_dir,
-                                        num_jobs=current_num_jobs,
-                                        num_archives_processed=num_archives_processed,
-                                        num_archives=num_archives,
-                                        learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed),
-                                        minibatch_size=args.minibatch_size,
-                                        frames_per_eg=args.frames_per_eg,
-                                        num_hidden_layers=num_hidden_layers,
-                                        add_layers_period=args.add_layers_period,
-                                        left_context=left_context,
-                                        right_context=right_context,
-                                        momentum=args.momentum,
-                                        max_param_change=args.max_param_change,
-                                        shuffle_buffer_size=args.shuffle_buffer_size,
-                                        run_opts=run_opts,
-                                        get_raw_nnet_from_am=False)
+            logger.info("On iteration {0}, learning rate is {1}.".format(
+                iter, learning_rate(iter, current_num_jobs,
+                                    num_archives_processed)))
+
+            train_lib.common.train_one_iteration(
+                dir=args.dir,
+                iter=iter,
+                srand=args.srand,
+                egs_dir=egs_dir,
+                num_jobs=current_num_jobs,
+                num_archives_processed=num_archives_processed,
+                num_archives=num_archives,
+                learning_rate=learning_rate(iter, current_num_jobs,
+                                            num_archives_processed),
+                minibatch_size=args.minibatch_size,
+                frames_per_eg=args.frames_per_eg,
+                num_hidden_layers=num_hidden_layers,
+                add_layers_period=args.add_layers_period,
+                left_context=left_context,
+                right_context=right_context,
+                momentum=args.momentum,
+                max_param_change=args.max_param_change,
+                
shuffle_buffer_size=args.shuffle_buffer_size, + run_opts=run_opts, + get_raw_nnet_from_am=False, + background_process_handler=background_process_handler) + if args.cleanup: - # do a clean up everythin but the last 2 models, under certain conditions - common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, - get_raw_nnet_from_am=False) + # do a clean up everythin but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval, + get_raw_nnet_from_am=False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval if iter % reporting_iter_interval == 0: - # lets do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) - common_train_lib.SendMail(message, subject, args.email) + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") - train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, - run_opts, get_raw_nnet_from_am=False) + train_lib.common.combine_models( + args.dir, num_iters, models_to_combine, + egs_dir, run_opts, + background_process_handler=background_process_handler, + get_raw_nnet_from_am=False) if include_log_softmax and args.stage <= num_iters + 1: - logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") - avg_post_vec_file = train_lib.ComputeAveragePosterior( - args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, - get_raw_nnet_from_am=False) + logger.info("Getting average posterior for purposes of " + "adjusting the priors.") + avg_post_vec_file = train_lib.common.compute_average_posterior( + args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, + get_raw_nnet_from_am=False) if args.cleanup: - logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) remove_egs = args.remove_egs if args.egs_dir is not None: # this egs_dir was not created by this experiment so we will not # delete it remove_egs = False - common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - preserve_model_interval=args.preserve_model_interval, - remove_egs=remove_egs, - get_raw_nnet_from_am=False) + common_train_lib.clean_nnet_dir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs, + get_raw_nnet_from_am=False) # do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: - common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) + + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) - report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") - 
report_handle.write(report) - report_handle.close() + common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) - os.system("steps/info/nnet3_dir_info.pl " + args.dir) -def Main(): - [args, run_opts] = GetArgs() +def main(): + [args, run_opts] = get_args() try: - Train(args, run_opts) + background_process_handler = common_lib.BackgroundProcessHandler( + polling_time=args.background_polling_time) + train(args, run_opts, background_process_handler) + background_process_handler.ensure_processes_are_done() except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) - common_train_lib.SendMail(message, message, args.email) + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) traceback.print_exc() raise e + if __name__ == "__main__": - Main() + main() diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 7e0f06f95e3..33655c8390e 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -5,52 +5,57 @@ # 2016 Vimal Manohar # Apache 2.0. +""" This script is similar to steps/nnet3/train_rnn.py but trains a +raw neural network instead of an acoustic model. +""" -# this script is based on steps/nnet3/lstm/train.sh - - +import argparse +import logging +import pprint import os import subprocess -import argparse import sys -import pprint -import logging -import imp import traceback -common_train_lib = imp.load_source('', 'steps/nnet3/libs/common_train_lib.py') -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') -train_lib = imp.load_source('', 'steps/nnet3/libs/train_lib.py') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.frame_level_objf as train_lib +import libs.nnet3.report.log_parse as nnet3_log_parse logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.info('Starting RNN trainer (train_raw_rnn.py)') -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description=""" - Trains an RNN neural network using the cross-entropy objective. - RNNs include LSTMs, BLSTMs and GRUs. - RNN acoustic model training differs from feed-forward DNN training - in the following ways - 1. RNN acoustic models train on output chunks rather than individual - outputs - 2. The training includes additional stage of shrinkage, where - the parameters of the model are scaled when the derivative averages - at the non-linearities are below a threshold. - 3. RNNs can also be trained with state preservation training - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler='resolve', - parents=[common_train_lib.common_parser]) - # For common options defined in common_train_lib.common_parser, - # see steps/nnet3/libs/common_train_lib.py +def get_args(): + """ Get args from stdin. + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. 
+    See steps/libs/nnet3/train/common.py
+    """
+
+    parser = argparse.ArgumentParser(
+        description="""Trains a raw RNN (without transition model) using
+        frame-level objectives like cross-entropy and mean-squared-error.
+        RNNs include LSTMs, BLSTMs and GRUs.
+        RNN acoustic model training differs from feed-forward DNN training in
+        the following ways
+            1. RNN acoustic models train on output chunks rather than
+               individual outputs
+            2. The training includes an additional stage of shrinkage, where
+               the parameters of the model are scaled when the derivative
+               averages at the non-linearities are below a threshold.
+            3. RNNs can also be trained with state preservation training""",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        conflict_handler='resolve',
+        parents=[common_train_lib.CommonParser.parser])
 
     # egs extraction options
     parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width',
@@ -59,13 +64,13 @@ def GetArgs():
                        used to train an LSTM.
                        Caution: if you double this you should halve
                        --trainer.samples-per-iter.""")
 
-    parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context',
-                        default=40,
-                        help="""Number of left steps used in the estimation of LSTM
-                         state before prediction of the first label""")
+    parser.add_argument("--egs.chunk-left-context", type=int,
+                        dest='chunk_left_context', default=40,
+                        help="""Number of left steps used in the estimation of
+                        LSTM state before prediction of the first label""")
 
-    parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter',
-                        default=20000,
+    parser.add_argument("--trainer.samples-per-iter", type=int,
+                        dest='samples_per_iter', default=20000,
                        help="""This is really the number of egs in each archive.
                        Each eg has 'chunk_width' frames in it-- for chunk_width=20, this value (20k) is equivalent
@@ -73,59 +78,68 @@ def GetArgs():
                        regular DNN training.""")
 
     # Parameters for the optimization
-    parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum',
-                        default=0.5,
+    parser.add_argument("--trainer.optimization.momentum", type=float,
+                        dest='momentum', default=0.5,
                        help="""Momentum used in update computation.
                        Note: we implemented it in such a way that
                        it doesn't increase the effective learning rate.""")
-    parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value',
-                        default=0.99,
-                        help="Scaling factor used for scaling the parameter matrices "
-                        "when the derivative averages are below the "
-                        "shrink-threshold at the non-linearities")
-    parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold',
-                        default=0.15,
-                        help="If the derivative averages are below this "
-                        "threshold we scale the parameter matrices with the shrink-value. "
-                        "It is less than 0.25 for sigmoid non-linearities.")
-    parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size',
-                        default=256,
-                        help="Size of the minibatch to be used in diagnostic jobs "
-                        "(use smaller value for BLSTMs to control memory usage)")
+    parser.add_argument("--trainer.optimization.shrink-value", type=float,
+                        dest='shrink_value', default=0.99,
+                        help="""Scaling factor used for scaling the parameter
+                        matrices when the derivative averages are below the
+                        shrink-threshold at the non-linearities""")
+    parser.add_argument("--trainer.optimization.shrink-threshold", type=float,
+                        dest='shrink_threshold', default=0.15,
+                        help="""If the derivative averages are below this
+                        threshold we scale the parameter matrices with the
+                        shrink-value. It is less than 0.25 for sigmoid
+                        non-linearities.""")
+    parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int,
+                        dest='cv_minibatch_size', default=256,
+                        help="""Size of the minibatch to be used in diagnostic
+                        jobs (use smaller value for BLSTMs to control memory
+                        usage)""")
 
     # RNN specific trainer options
-    parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch',
-                        default=100,
-                        help="Number of sequences to be processed in parallel every minibatch" )
-    parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps',
-                        default=None,
-                        help="The number of time steps to back-propagate from the "
-                        "last label in the chunk. By default it is same as the chunk-width." )
+    parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int,
+                        dest='num_chunk_per_minibatch', default=100,
+                        help="Number of sequences to be processed in "
+                        "parallel every minibatch")
+    parser.add_argument("--trainer.rnn.num-bptt-steps", type=int,
+                        dest='num_bptt_steps', default=None,
+                        help="""The number of time steps to back-propagate from
+                        the last label in the chunk. 
By default it is same as + the chunk-width.""") # General options parser.add_argument("--nj", type=int, default=4, help="Number of parallel jobs") - parser.add_argument("--use-dense-targets", type=str, action=common_train_lib.StrToBoolAction, - default=True, choices=["true", "false"], - help="Train neural network using dense targets") + parser.add_argument("--use-dense-targets", type=str, + action=common_lib.StrToBoolAction, + default=True, choices=["true", "false"], + help="Train neural network using dense targets") parser.add_argument("--feat-dir", type=str, required=True, - help="Directory with features used for training the neural network.") + help="Directory with features used for training " + "the neural network.") parser.add_argument("--targets-scp", type=str, required=True, help="Target for training neural network.") parser.add_argument("--dir", type=str, required=True, - help="Directory to store the models and all other files.") + help="Directory to store the models and " + "all other files.") print(' '.join(sys.argv)) print(sys.argv) args = parser.parse_args() - [args, run_opts] = ProcessArgs(args) + [args, run_opts] = process_args(args) return [args, run_opts] -def ProcessArgs(args): - # process the options +def process_args(args): + """ Process the options got from get_args() + """ + if args.chunk_width < 1: raise Exception("--egs.chunk-width should have a minimum value of 1") @@ -135,18 +149,21 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): - raise Exception("""This scripts expects {0} to exist and have a configs - directory which is the output of make_configs.py script""") + if (not os.path.exists(args.dir) + or not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs " + "directory which is the output of " + "make_configs.py script") # set the options corresponding to args.use_gpu - run_opts = common_train_lib.RunOpts() + run_opts = common_lib.RunOpts() if args.use_gpu: - if not common_train_lib.CheckIfCudaCompiled(): - logger.warning(""" - You are running with one thread but you have not compiled - for CUDA. You may be running a setup optimized for GPUs. If you have - GPUs and have nvcc installed, go to src/ and do ./configure; make""") + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "" @@ -155,8 +172,8 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "--gpu 1" else: - logger.warning(""" - Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + logger.warning("Without using a GPU this will be very slow. 
" + "nnet3 does not yet support multiple threads.") run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" @@ -165,39 +182,53 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "" run_opts.command = args.command - run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -# args is a Namespace with the required parameters -def Train(args, run_opts): + +def train(args, run_opts, background_process_handler): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. - feat_dim = common_train_lib.GetFeatDim(args.feat_dir) - ivector_dim = common_train_lib.GetIvectorDim(args.online_ivector_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) # split the training data into parts for individual jobs - common_train_lib.SplitData(args.feat_dir, args.nj) + common_lib.split_data(args.feat_dir, num_jobs) + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) # Set some variables. - try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining - add_lda = common_train_lib.StrToBool(variables['add_lda']) - include_log_softmax = common_train_lib.StrToBool(variables['include_log_softmax']) + # this is really the number of times we add layers to the network for + # discriminative pretraining + num_hidden_layers = variables['num_hidden_layers'] + add_lda = common_lib.str_to_bool(variables['add_lda']) + include_log_softmax = common_lib.str_to_bool( + variables['include_log_softmax']) except KeyError as e: - raise Exception("KeyError {0}: Variables need to be defined in {1}".format( - str(e), '{0}/configs'.format(args.dir))) + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context @@ -209,11 +240,11 @@ def Train(args, run_opts): if (args.stage <= -4): logger.info("Initializing a basic network") - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command=run_opts.command, - dir=args.dir)) + common_lib.run_kaldi_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: @@ -224,29 +255,32 @@ def Train(args, run_opts): try: num_targets = int(variables['num_targets']) except KeyError as e: - 
raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + raise Exception("KeyError {0}: Variables need to be defined " + "in {1}".format( str(e), '{0}/configs'.format(args.dir))) - if common_train_lib.GetFeatDimFromScp(targets_scp) != num_targets: + if common_lib.get_feat_dim_from_scp(targets_scp) != num_targets: raise Exception("Mismatch between num-targets provided to " "script vs configs") else: target_type = "sparse" - train_lib.GenerateEgsUsingTargets( - args.feat_dir, args.targets_scp, default_egs_dir, - left_context, right_context, - args.chunk_width + left_context, - args.chunk_width + right_context, run_opts, - frames_per_eg=args.chunk_width, - srand=args.srand, - egs_opts=args.egs_opts, - cmvn_opts=args.cmvn_opts, - online_ivector_dir=args.online_ivector_dir, - samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, - stage=args.egs_stage, - target_type=target_type, - num_targets=num_targets) + train_lib.raw_model.generate_egs_from_targets( + data=args.feat_dir, targets_scp=args.targets_scp, + egs_dir=default_egs_dir, + left_context=left_context, right_context=right_context, + valid_left_context=args.chunk_width + left_context, + valid_right_context=args.chunk_width + right_context, + run_opts=run_opts, + frames_per_eg=args.chunk_width, + srand=args.srand, + egs_opts=args.egs_opts, + cmvn_opts=args.cmvn_opts, + online_ivector_dir=args.online_ivector_dir, + samples_per_iter=args.samples_per_iter, + transform_dir=args.transform_dir, + stage=args.egs_stage, + target_type=target_type, + num_targets=num_targets) if args.egs_dir is None: egs_dir = default_egs_dir @@ -255,51 +289,52 @@ def Train(args, run_opts): [egs_left_context, egs_right_context, frames_per_eg, num_archives] = ( - common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, - left_context, right_context) ) + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + left_context, right_context)) assert(args.chunk_width == frames_per_eg) if (args.num_jobs_final > num_archives): - raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + raise Exception('num_jobs_final cannot exceed the number of archives ' + 'in the egs directory') # copy the properties of the egs to dir for # use during decoding - common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if (add_lda and args.stage <= -2): logger.info('Computing the preconditioning matrix for input features') - train_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, - max_lda_jobs=args.max_lda_jobs, - rand_prune=args.rand_prune) + train_lib.common.compute_preconditioning_matrix( + args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs=args.max_lda_jobs, + rand_prune=args.rand_prune) if (args.stage <= -1): logger.info("Preparing the initial network.") - common_train_lib.PrepareInitialNetwork(args.dir, run_opts) + common_train_lib.prepare_initial_network(args.dir, run_opts) - - # set num_iters so that as close as possible, we process the data $num_epochs - # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, - # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + # set num_iters so that as close as possible, we process the data + # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == + # $num_epochs*$num_archives, where + # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
    num_archives_to_process = args.num_epochs * num_archives
    num_archives_processed = 0
-    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
-
-    num_iters_combine = common_train_lib.VerifyIterations(
-                                 num_iters, args.num_epochs,
-                                 num_hidden_layers, num_archives,
-                                 args.max_models_combine, args.add_layers_period,
-                                 args.num_jobs_final)
-
-    LearningRate = (lambda iter, current_num_jobs, num_archives_processed:
-                        common_train_lib.GetLearningRate(
-                            iter, current_num_jobs, num_iters,
-                            num_archives_processed,
-                            num_archives_to_process,
-                            args.initial_effective_lrate,
-                            args.final_effective_lrate)
-                    )
+    num_iters = ((num_archives_to_process * 2)
+                 / (args.num_jobs_initial + args.num_jobs_final))
+
+    models_to_combine = common_train_lib.verify_iterations(
+        num_iters, args.num_epochs,
+        num_hidden_layers, num_archives,
+        args.max_models_combine, args.add_layers_period,
+        args.num_jobs_final)
+
+    def learning_rate(iter, current_num_jobs, num_archives_processed):
+        return common_train_lib.get_learning_rate(iter, current_num_jobs,
+                                                  num_iters,
+                                                  num_archives_processed,
+                                                  num_archives_to_process,
+                                                  args.initial_effective_lrate,
+                                                  args.final_effective_lrate)
 
    if args.num_bptt_steps is None:
        num_bptt_steps = args.chunk_width
@@ -308,114 +343,139 @@ def Train(args, run_opts):
 
     min_deriv_time = args.chunk_width - num_bptt_steps
 
+    logger.info("Training will run for {0} epochs = "
+                "{1} iterations".format(args.num_epochs, num_iters))
 
-    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return
-        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+        current_num_jobs = int(0.5 + args.num_jobs_initial
+                               + (args.num_jobs_final - args.num_jobs_initial)
+                               * float(iter) / num_iters)
 
        if args.stage <= iter:
            model_file = "{dir}/{iter}.raw".format(dir=args.dir, iter=iter)
-            shrinkage_value = (args.shrink_value
-                               if common_train_lib.DoShrinkage(iter, model_file,
-                                                               "SigmoidComponent",
-                                                               args.shrink_threshold,
-                                                               get_raw_nnet_from_am=False)
-                               else 1
-                               )
-            logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(
-                iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value))
-
-            train_lib.TrainOneIteration(
-                dir=args.dir,
-                iter=iter,
-                srand=args.srand,
-                egs_dir=egs_dir,
-                num_jobs=current_num_jobs,
-                num_archives_processed=num_archives_processed,
-                num_archives=num_archives,
-                learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed),
-                shrinkage_value=shrinkage_value,
-                num_chunk_per_minibatch=args.num_chunk_per_minibatch,
-                num_hidden_layers=num_hidden_layers,
-                add_layers_period=args.add_layers_period,
-                left_context=left_context,
-                right_context=right_context,
-                min_deriv_time=min_deriv_time,
-                momentum=args.momentum,
-                max_param_change=args.max_param_change,
-                shuffle_buffer_size=args.shuffle_buffer_size,
-                cv_minibatch_size=args.cv_minibatch_size,
-                run_opts=run_opts,
-                get_raw_nnet_from_am=False)
+
+            shrinkage_value = 1.0
+            if args.shrink_value != 1.0:
+                shrinkage_value = (args.shrink_value
+                                   if common_train_lib.do_shrinkage(
+                                        iter, model_file, "SigmoidComponent",
+                                        args.shrink_threshold,
+                                        get_raw_nnet_from_am=False)
+                                   else 1
+                                   )
+            logger.info("On iteration {0}, learning rate is {1} and "
+                        "shrink value is {2}.".format(
+                            iter, 
learning_rate(iter, current_num_jobs, + num_archives_processed), shrinkage_value)) + + train_lib.common.train_one_iteration( + dir=args.dir, + iter=iter, + srand=args.srand, + egs_dir=egs_dir, + num_jobs=current_num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + learning_rate=learning_rate(iter, current_num_jobs, + num_archives_processed), + shrinkage_value=shrinkage_value, + num_chunk_per_minibatch=args.num_chunk_per_minibatch, + num_hidden_layers=num_hidden_layers, + add_layers_period=args.add_layers_period, + left_context=left_context, + right_context=right_context, + min_deriv_time=min_deriv_time, + momentum=args.momentum, + max_param_change=args.max_param_change, + shuffle_buffer_size=args.shuffle_buffer_size, + cv_minibatch_size=args.cv_minibatch_size, + run_opts=run_opts, + get_raw_nnet_from_am=False, + background_process_handler=background_process_handler) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain conditions - common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval, - get_raw_nnet_from_am=False) + # do a clean up everythin but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval, + get_raw_nnet_from_am=False) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval if iter % reporting_iter_interval == 0: - # lets do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) - common_train_lib.SendMail(message, subject, args.email) + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") - train_lib.CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width=args.chunk_width, - get_raw_nnet_from_am=False) + train_lib.common.combine_models( + args.dir, num_iters, models_to_combine, + egs_dir, run_opts, chunk_width=args.chunk_width, + background_process_handler=background_process_handler, + get_raw_nnet_from_am=False) if include_log_softmax and args.stage <= num_iters + 1: - logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.") - avg_post_vec_file = train_lib.ComputeAveragePosterior( - args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, - get_raw_nnet_from_am=False) + logger.info("Getting average posterior for purposes of " + "adjusting the priors.") + avg_post_vec_file = train_lib.common.compute_average_posterior( + args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, + get_raw_nnet_from_am=False) if args.cleanup: - logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) remove_egs = args.remove_egs if args.egs_dir is not None: # this egs_dir was not created by this experiment so we will not # delete it remove_egs = False - common_train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, - 
preserve_model_interval=args.preserve_model_interval, - remove_egs=remove_egs, - get_raw_nnet_from_am=False) + common_train_lib.clean_nnet_dir( + args.dir, num_iters, egs_dir, + preserve_model_interval=args.preserve_model_interval, + remove_egs=remove_egs, + get_raw_nnet_from_am=False) # do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: - common_train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) - report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") - report_handle.write(report) - report_handle.close() + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) - os.system("steps/info/nnet3_dir_info.pl " + args.dir) + common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) -def Main(): - [args, run_opts] = GetArgs() + +def main(): + [args, run_opts] = get_args() try: - Train(args, run_opts) + background_process_handler = common_lib.BackgroundProcessHandler( + polling_time=args.background_polling_time) + train(args, run_opts, background_process_handler) + background_process_handler.ensure_processes_are_done() except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) - common_train_lib.SendMail(message, message, args.email) + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) traceback.print_exc() raise e + if __name__ == "__main__": - Main() + main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index e845154dfdd..216f38f0ae8 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -1,58 +1,61 @@ #!/usr/bin/env python -# Copyright 2016 Vijayaditya Peddinti. -# 2016 Vimal Manohar +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. 
-# this script is based on steps/nnet3/lstm/train.sh +""" This script is based on steps/nnet3/lstm/train.sh +""" -import os -import subprocess import argparse -import sys -import pprint import logging -import imp -import traceback +import os +import pprint import shutil +import sys +import traceback -sys.path.append("steps/libs") - -import common as common_lib -import nnet3.train.common as common_train_lib -import nnet3.train.frame_level_objf as train_lib +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib +import libs.nnet3.train.frame_level_objf as train_lib +import libs.nnet3.report.log_parse as nnet3_log_parse -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) -formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.info('Starting RNN trainer (train_rnn.py)') -def GetArgs(): - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description=""" - Trains an RNN acoustic model using the cross-entropy objective. - RNNs include LSTMs, BLSTMs and GRUs. - RNN acoustic model training differs from feed-forward DNN training - in the following ways - 1. RNN acoustic models train on output chunks rather than individual - outputs - 2. The training includes additional stage of shrinkage, where - the parameters of the model are scaled when the derivative averages - at the non-linearities are below a threshold. - 3. RNNs can also be trained with state preservation training - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - conflict_handler = 'resolve', - parents=[common_train_lib.CommonParser.parser]) - # For common options defined in common_train_lib.common_parser, - # see steps/nnet3/libs/common_train_lib.py +def get_args(): + """ Get args from stdin. + + We add compulsary arguments as named arguments for readability + + The common options are defined in the object + libs.nnet3.train.common.CommonParser.parser. + See steps/libs/nnet3/train/common.py + """ + + parser = argparse.ArgumentParser( + description="""Trains an RNN acoustic model using the cross-entropy + objective. RNNs include LSTMs, BLSTMs and GRUs. + RNN acoustic model training differs from feed-forward DNN training in + the following ways + 1. RNN acoustic models train on output chunks rather than + individual outputs + 2. The training includes additional stage of shrinkage, where + the parameters of the model are scaled when the derivative + averages at the non-linearities are below a threshold. + 3. RNNs can also be trained with state preservation training""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler='resolve', + parents=[common_train_lib.CommonParser.parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -61,13 +64,13 @@ def GetArgs(): used to train an LSTM. 
Caution: if you double this you should halve --trainer.samples-per-iter.""") - parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', - default=40, - help="""Number of left steps used in the estimation of LSTM - state before prediction of the first label""") + parser.add_argument("--egs.chunk-left-context", type=int, + dest='chunk_left_context', default=40, + help="""Number of left steps used in the estimation of + LSTM state before prediction of the first label""") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=20000, + parser.add_argument("--trainer.samples-per-iter", type=int, + dest='samples_per_iter', default=20000, help="""This is really the number of egs in each archive. Each eg has 'chunk_width' frames in it-- for chunk_width=20, this value (20k) is equivalent @@ -75,50 +78,66 @@ def GetArgs(): regular DNN training.""") # Parameters for the optimization - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default=0.5, + parser.add_argument("--trainer.optimization.momentum", type=float, + dest='momentum', default=0.5, help="""Momentum used in update computation. Note: we implemented it in such a way that it doesn't increase the effective learning rate.""") - parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', - default=0.99, - help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") - parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', - default=0.15, - help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") - parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', - default=256, - help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") + parser.add_argument("--trainer.optimization.shrink-value", type=float, + dest='shrink_value', default=0.99, + help="""Scaling factor used for scaling the parameter + matrices when the derivative averages are below the + shrink-threshold at the non-linearities""") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, + dest='shrink_threshold', default=0.15, + help="""If the derivative averages are below this + threshold we scale the parameter matrices with the + shrink-value. It is less than 0.25 for sigmoid + non-linearities.""") + parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, + dest='cv_minibatch_size', default=256, + help="""Size of the minibatch to be used in diagnostic + jobs (use smaller value for BLSTMs to control memory + usage)""") # RNN specific trainer options - parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', - default=100, - help="Number of sequences to be processed in parallel every minibatch" ) - parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', - default=None, - help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." 
) + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, + dest='num_chunk_per_minibatch', default=100, + help="Number of sequences to be processed in " + "parallel every minibatch") + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, + dest='num_bptt_steps', default=None, + help="""The number of time steps to back-propagate from + the last label in the chunk. By default it is same as + the chunk-width.""") # General options parser.add_argument("--feat-dir", type=str, required=True, - help="Directory with features used for training the neural network.") + help="Directory with features used for training " + "the neural network.") parser.add_argument("--lang", type=str, required=True, help="Language directory") parser.add_argument("--ali-dir", type=str, required=True, - help="Directory with alignments used for training the neural network.") + help="Directory with alignments used for training " + "the neural network.") parser.add_argument("--dir", type=str, required=True, - help="Directory to store the models and all other files.") + help="Directory to store the models and " + "all other files.") print(' '.join(sys.argv)) print(sys.argv) args = parser.parse_args() - [args, run_opts] = ProcessArgs(args) + [args, run_opts] = process_args(args) return [args, run_opts] -def ProcessArgs(args): - # process the options + +def process_args(args): + """ Process the options got from get_args() + """ + if args.chunk_width < 1: raise Exception("--egs.chunk-width should have a minimum value of 1") @@ -128,21 +147,24 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): - raise Exception("""This scripts expects {0} to exist and have a configs - directory which is the output of make_configs.py script""") + if (not os.path.exists(args.dir) + or not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs " + "directory which is the output of " + "make_configs.py script") if args.transform_dir is None: args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = common_train_lib.RunOpts() + run_opts = common_lib.RunOpts() if args.use_gpu: - if not common_lib.CheckIfCudaCompiled(): - logger.warning(""" - You are running with one thread but you have not compiled - for CUDA. You may be running a setup optimized for GPUs. If you have - GPUs and have nvcc installed, go to src/ and do ./configure; make""") + if not common_lib.check_if_cuda_compiled(): + logger.warning( + """You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. + If you have GPUs and have nvcc installed, go to src/ and do + ./configure; make""") run_opts.train_queue_opt = "--gpu 1" run_opts.parallel_train_opts = "" @@ -151,8 +173,8 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "--gpu 1" else: - logger.warning(""" - Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + logger.warning("Without using a GPU this will be very slow. 
" + "nnet3 does not yet support multiple threads.") run_opts.train_queue_opt = "" run_opts.parallel_train_opts = "--use-gpu=no" @@ -161,43 +183,55 @@ def ProcessArgs(args): run_opts.prior_queue_opt = "" run_opts.command = args.command - run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command + run_opts.egs_command = (args.egs_command + if args.egs_command is not None else + args.command) run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -# args is a Namespace with the required parameters -def Train(args, run_opts): + +def train(args, run_opts, background_process_handler): + """ The main function for training. + + Args: + args: a Namespace object with the required parameters + obtained from the function process_args() + run_opts: RunOpts object obtained from the process_args() + """ + arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Set some variables. - num_leaves = common_lib.GetNumberOfLeavesFromTree(args.ali_dir) - num_jobs = common_lib.GetNumberOfJobs(args.ali_dir) - feat_dim = common_lib.GetFeatDim(args.feat_dir) - ivector_dim = common_lib.GetIvectorDim(args.online_ivector_dir) + # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir) + num_jobs = common_lib.get_number_of_jobs(args.ali_dir) + feat_dim = common_lib.get_feat_dim(args.feat_dir) + ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) # split the training data into parts for individual jobs # we will use the same number of jobs as that used for alignment - common_lib.SplitData(args.feat_dir, num_jobs) + common_lib.split_data(args.feat_dir, num_jobs) shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) - f = open('{0}/num_jobs'.format(args.dir), 'w') - f.write(str(num_jobs)) - f.close() + + with open('{0}/num_jobs'.format(args.dir), 'w') as f: + f.write(str(num_jobs)) config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.ParseGenericConfigVarsFile(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) # Set some variables. try: model_left_context = variables['model_left_context'] model_right_context = variables['model_right_context'] - num_hidden_layers = variables['num_hidden_layers'] # this is really the number of times we add layers to the network for discriminative pretraining + # this is really the number of times we add layers to the network for + # discriminative pretraining + num_hidden_layers = variables['num_hidden_layers'] except KeyError as e: - raise Exception("KeyError {0}: Variables need to be defined in {1}".format( - str(e), '{0}/configs'.format(args.dir))) + raise Exception("KeyError {0}: Variables need to be defined in " + "{1}".format(str(e), '{0}/configs'.format(args.dir))) left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context @@ -208,22 +242,24 @@ def Train(args, run_opts): # transform. 
if (args.stage <= -5): - logger.info("Initializing a basic network for estimating preconditioning matrix") - common_lib.RunKaldiCommand(""" -{command} {dir}/log/nnet_init.log \ - nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw - """.format(command=run_opts.command, - dir=args.dir)) + logger.info("Initializing a basic network for estimating " + "preconditioning matrix") + common_lib.run_kaldi_command( + """{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config \ + {dir}/init.raw""".format(command=run_opts.command, + dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -4) and args.egs_dir is None: logger.info("Generating egs") - train_lib.acoustic_model.GenerateEgs( - args.feat_dir, args.ali_dir, default_egs_dir, - left_context, right_context, - args.chunk_width + left_context, - args.chunk_width + right_context, run_opts, + train_lib.acoustic_model.generate_egs( + data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, + left_context=left_context, right_context=right_context, + valid_left_context=args.chunk_width + left_context, + valid_right_context=args.chunk_width + right_context, + run_opts=run_opts, frames_per_eg=args.chunk_width, srand=args.srand, egs_opts=args.egs_opts, @@ -240,21 +276,22 @@ def Train(args, run_opts): [egs_left_context, egs_right_context, frames_per_eg, num_archives] = ( - common_train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, - left_context, right_context) ) + common_train_lib.verify_egs_dir(egs_dir, feat_dim, ivector_dim, + left_context, right_context)) assert(args.chunk_width == frames_per_eg) if (args.num_jobs_final > num_archives): - raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + raise Exception('num_jobs_final cannot exceed the number of archives ' + 'in the egs directory') # copy the properties of the egs to dir for # use during decoding - common_train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) if (args.stage <= -3): logger.info('Computing the preconditioning matrix for input features') - train_lib.common.ComputePreconditioningMatrix( + train_lib.common.compute_preconditioning_matrix( args.dir, egs_dir, num_archives, run_opts, max_lda_jobs=args.max_lda_jobs, rand_prune=args.rand_prune) @@ -262,39 +299,39 @@ def Train(args, run_opts): if (args.stage <= -2): logger.info("Computing initial vector for FixedScaleComponent before" " softmax, using priors^{prior_scale} and rescaling to" - " average 1".format(prior_scale=args.presoftmax_prior_scale_power)) - - common_train_lib.ComputePresoftmaxPriorScale( - args.dir, args.ali_dir, num_jobs, run_opts, - presoftmax_prior_scale_power=args.presoftmax_prior_scale_power) + " average 1".format( + prior_scale=args.presoftmax_prior_scale_power)) + common_train_lib.compute_presoftmax_prior_scale( + args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power=args.presoftmax_prior_scale_power) if (args.stage <= -1): logger.info("Preparing the initial acoustic model.") - train_lib.PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) - + train_lib.acoustic_model.prepare_initial_acoustic_model( + args.dir, args.ali_dir, run_opts) - # set num_iters so that as close as possible, we process the data $num_epochs - # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, - # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
+ # set num_iters so that as close as possible, we process the data + # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == + # $num_epochs*$num_archives, where + # avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. num_archives_to_process = args.num_epochs * num_archives num_archives_processed = 0 - num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + num_iters = ((num_archives_to_process * 2) + / (args.num_jobs_initial + args.num_jobs_final)) - num_iters_combine = common_train_lib.VerifyIterations( + models_to_combine = common_train_lib.verify_iterations( num_iters, args.num_epochs, num_hidden_layers, num_archives, args.max_models_combine, args.add_layers_period, args.num_jobs_final) - LearningRate = (lambda iter, current_num_jobs, num_archives_processed: - common_train_lib.GetLearningRate( - iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) - ) + def learning_rate(iter, current_num_jobs, num_archives_processed): + common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -303,14 +340,16 @@ def Train(args, run_opts): min_deriv_time = args.chunk_width - num_bptt_steps - - logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + logger.info("Training will run for {0} epochs = " + "{1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): logger.info("Exiting early due to --exit-stage {0}".format(iter)) return - current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) if args.stage <= iter: model_file = "{dir}/{iter}.mdl".format(dir=args.dir, iter=iter) @@ -318,15 +357,18 @@ def Train(args, run_opts): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.DoShrinkage( + if common_train_lib.do_shrinkage( iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 ) - logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format( - iter, LearningRate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + logger.info("On iteration {0}, learning rate is {1} and " + "shrink value is {2}.".format( + iter, learning_rate(iter, current_num_jobs, + num_archives_processed), + shrinkage_value)) - train_lib.common.TrainOneIteration( + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, srand=args.srand, @@ -334,7 +376,8 @@ def Train(args, run_opts): num_jobs=current_num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, - learning_rate=LearningRate(iter, current_num_jobs, num_archives_processed), + learning_rate=learning_rate(iter, current_num_jobs, + num_archives_processed), shrinkage_value=shrinkage_value, minibatch_size=args.num_chunk_per_minibatch, num_hidden_layers=num_hidden_layers, @@ -346,78 +389,93 @@ def Train(args, run_opts): max_param_change=args.max_param_change, shuffle_buffer_size=args.shuffle_buffer_size, cv_minibatch_size=args.cv_minibatch_size, - run_opts=run_opts) + run_opts=run_opts, + 
background_process_handler=background_process_handler) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain conditions - common_train_lib.RemoveModel( - args.dir, iter-2, num_iters, num_iters_combine, - args.preserve_model_interval) + # do a clean up everythin but the last 2 models, under certain + # conditions + common_train_lib.remove_model( + args.dir, iter-2, num_iters, models_to_combine, + args.preserve_model_interval) if args.email is not None: reporting_iter_interval = num_iters * args.reporting_interval if iter % reporting_iter_interval == 0: - # lets do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + # lets do some reporting + [report, times, data] = ( + nnet3_log_parse.generate_accuracy_report(args.dir)) message = report - subject = "Update : Expt {dir} : Iter {iter}".format(dir=args.dir, iter=iter) - common_train_lib.SendMail(message, subject, args.email) + subject = ("Update : Expt {dir} : " + "Iter {iter}".format(dir=args.dir, iter=iter)) + common_lib.send_mail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - train_lib.common.CombineModels(args.dir, num_iters, num_iters_combine, - egs_dir, - run_opts, chunk_width=args.chunk_width) + train_lib.common.combine_models( + args.dir, num_iters, models_to_combine, + egs_dir, run_opts, + background_process_handler=background_process_handler, + chunk_width=args.chunk_width) if args.stage <= num_iters + 1: - logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = train_lib.ComputeAveragePosterior( + logger.info("Getting average posterior for purposes of " + "adjusting the priors.") + avg_post_vec_file = train_lib.common.compute_average_posterior( args.dir, 'combined', egs_dir, num_archives, args.prior_subset_size, run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir=args.dir) final_model = "{dir}/final.mdl".format(dir=args.dir) - train_lib.AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, - final_model, run_opts) + train_lib.common.adjust_am_priors(args.dir, combined_model, + avg_post_vec_file, final_model, + run_opts) if args.cleanup: - logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + logger.info("Cleaning up the experiment directory " + "{0}".format(args.dir)) remove_egs = args.remove_egs if args.egs_dir is not None: # this egs_dir was not created by this experiment so we will not # delete it remove_egs = False - common_train_lib.CleanNnetDir( + common_train_lib.clean_nnet_dir( args.dir, num_iters, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) # do some reporting - [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) if args.email is not None: - common_lib.SendMail(report, "Update : Expt {0} : complete".format( - args.dir), args.email) + common_lib.send_mail(report, "Update : Expt {0} : " + "complete".format(args.dir), args.email) - report_handle = open("{dir}/accuracy.report".format(dir=args.dir), "w") - report_handle.write(report) - report_handle.close() + with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: + f.write(report) - os.system("steps/info/nnet3_dir_info.pl " + args.dir) + 
common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) -def Main(): - [args, run_opts] = GetArgs() + +def main(): + [args, run_opts] = get_args() try: - Train(args, run_opts) + background_process_handler = common_lib.BackgroundProcessHandler( + polling_time=args.background_polling_time) + train(args, run_opts, background_process_handler) + background_process_handler.ensure_processes_are_done() except Exception as e: if args.email is not None: - message = "Training session for experiment {dir} died due to an error.".format(dir=args.dir) - common_lib.SendMail(message, message, args.email) + message = ("Training session for experiment {dir} " + "died due to an error.".format(dir=args.dir)) + common_lib.send_mail(message, message, args.email) traceback.print_exc() raise e + if __name__ == "__main__": - Main() + main() From bd791523164a1add2b723cff0685a0d558dc988b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 15 Nov 2016 14:47:12 -0500 Subject: [PATCH 24/71] raw_python_script: Fixed errors in standards --- .../nnet3/train/chain_objf/acoustic_model.py | 578 ++++++++++-------- egs/wsj/s5/steps/libs/nnet3/train/common.py | 27 +- .../nnet3/train/frame_level_objf/common.py | 32 +- egs/wsj/s5/steps/nnet3/chain/train.py | 5 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 10 + egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 24 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 41 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 13 +- 8 files changed, 401 insertions(+), 329 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index dfbe46aaa55..60df647ff83 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -9,10 +9,12 @@ """ import logging +import math import os import sys import libs.common as common_lib +import libs.nnet3.train.common as common_train_lib logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -43,12 +45,14 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): def create_denominator_fst(dir, tree_dir, run_opts): common_lib.run_kaldi_command( - """copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl - {command} {dir}/log/make_den_fst.log \ + """copy-transition-model {tree_dir}/final.mdl \ + {dir}/0.trans_mdl""".format(dir=dir, tree_dir=tree_dir)) + common_lib.run_kaldi_command( + """{command} {dir}/log/make_den_fst.log \ chain-make-den-fst {dir}/tree {dir}/0.trans_mdl \ {dir}/phone_lm.fst \ {dir}/den.fst {dir}/normalization.fst""".format( - tree_dir=tree_dir, dir=dir, command=run_opts.command)) + dir=dir, command=run_opts.command)) def generate_chain_egs(dir, data, lat_dir, egs_dir, @@ -115,6 +119,265 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, egs_opts=egs_opts if egs_opts is not None else '')) +def train_new_models(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, chunk_width, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts, + background_process_handler=None): + """ + Called from train_one_iteration(), this method trains new models + with 'num_jobs' jobs, and + writes files like exp/tdnn_a/24.{1,2,3,..}.raw + + We cannot easily use a single parallel SGE job to 
do the main training, + because the computation of which archive and which --frame option + to use for each job is a little complex, so we spawn each one separately. + this is no longer true for RNNs as we use do not use the --frame option + but we use the same script for consistency with FF-DNN code + """ + + deriv_time_opts = [] + if left_deriv_truncate is not None: + deriv_time_opts.append("--optimization.min-deriv-time={0}".format( + left_deriv_truncate)) + if right_deriv_truncate is not None: + deriv_time_opts.append("--optimization.max-deriv-time={0}".format( + int(chunk_width-right_deriv_truncate))) + + processes = [] + for job in range(1, num_jobs+1): + # k is a zero-based index that we will derive the other indexes from. + k = num_archives_processed + job - 1 + # work out the 1-based archive index. + archive_index = (k % num_archives) + 1 + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + frame_shift = ((archive_index + k/num_archives) + % frame_subsampling_factor) + if job == 1: + cur_cache_io_opts = "{0} --write-cache={1}/cache.{2}".format( + cache_io_opts, dir, iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = common_lib.run_kaldi_command( + """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} \ + {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark,bg:nnet3-chain-copy-egs """ + """--truncate-deriv-weights={trunc_deriv} """ + """--frame-shift={fr_shft} """ + """ark:{egs_dir}/cegs.{archive_index}.ark ark:- | """ + """nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} """ + """--srand={srand} ark:- ark:- | nnet3-chain-merge-egs """ + """--minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw""".format( + command=run_opts.command, + train_queue_opt=run_opts.train_queue_opt, + dir=dir, iter=iter, srand=iter + srand, + next_iter=iter + 1, job=job, + deriv_time_opts=" ".join(deriv_time_opts), + trunc_deriv=truncate_deriv_weights, + app_deriv_wts=apply_deriv_weights, + fr_shft=frame_shift, l2=l2_regularize, + xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, + parallel_train_opts=run_opts.parallel_train_opts, + momentum=momentum, max_param_change=max_param_change, + raw_model=raw_model_string, + egs_dir=egs_dir, archive_index=archive_index, + shuffle_buffer_size=shuffle_buffer_size, + cache_io_opts=cur_cache_io_opts, + num_chunk_per_minibatch=num_chunk_per_minibatch), + wait=False, + background_process_handler=background_process_handler) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training " + "iteration {0}".format(iter)) + + +def train_one_iteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, + num_chunk_per_minibatch, chunk_width, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, + right_deriv_truncate, + l2_regularize, xent_regularize, + leaky_hmm_coefficient, + 
momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts, background_process_handler=None): + """ Called from steps/nnet3/chain/train.py for one iteration for + neural network training with LF-MMI objective + + """ + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) + except (IOError, ValueError) as e: + raise Exception("Exception while reading the random seed " + "for training: {0}".format(e.str())) + if srand != saved_srand: + logger.warning("The random seed provided to this iteration " + "(srand={0}) is different from the one saved last " + "time (srand={1}). Using srand={0}.".format( + srand, saved_srand)) + else: + with open('{0}/srand'.format(dir), 'w') as f: + f.write(str(srand)) + + # Sets off some background jobs to compute train and + # validation set objectives + compute_train_cv_probabilities( + dir, iter, egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, + background_process_handler=background_process_handler) + + if iter > 0: + # Runs in the background + compute_progress(dir, iter, run_opts, + background_process_handler=background_process_handler) + + if (iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) + and iter % add_layers_period == 0): + + # if we've just added new hiden layer, don't do averaging but take the + # best. + do_average = False + + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format( + dir, cur_num_hidden_layers) + raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={lr} " + "{dir}/{iter}.mdl - | nnet3-init --srand={srand} " + "- {config} - |".format(lr=learning_rate, dir=dir, + iter=iter, + srand=iter + srand, + config=config_file)) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + # on iteration 0, pick the best, don't average. + do_average = False + raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " + "{1}/{2}.mdl - |".format(learning_rate, dir, iter)) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir=dir, + iter=iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller + # minibatch size (and we will later choose the output of just one of + # the jobs): the model-averaging isn't always helpful when the model is + # changing too fast (i.e. it can worsen the objective function), and + # the smaller minibatch size will help to keep the update stable. 
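An editorial aside, not part of the patch: the adjustment described in the comment above, and applied by the lines that follow, condensed into a stand-alone sketch. The helper name is hypothetical; math.sqrt is from the standard library.

    import math

    def single_job_settings(num_chunk_per_minibatch, max_param_change):
        # When one job's output is kept instead of averaging (iteration 0,
        # or right after a layer is added), halve the minibatch and scale
        # the parameter-change limit down by sqrt(2) to keep updates stable.
        return (num_chunk_per_minibatch // 2,
                float(max_param_change) / math.sqrt(2))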
+ cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + raw_model_string=raw_model_string, + egs_dir=egs_dir, + apply_deriv_weights=apply_deriv_weights, + chunk_width=chunk_width, + left_deriv_truncate=left_deriv_truncate, + right_deriv_truncate=right_deriv_truncate, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + leaky_hmm_coefficient=leaky_hmm_coefficient, + momentum=momentum, + max_param_change=cur_max_param_change, + shuffle_buffer_size=shuffle_buffer_size, + num_chunk_per_minibatch=cur_num_chunk_per_minibatch, + frame_subsampling_factor=frame_subsampling_factor, + truncate_deriv_weights=truncate_deriv_weights, + cache_io_opts=cache_io_opts, run_opts=run_opts, + background_process_handler=background_process_handler) + + [models_to_average, best_model] = common_train_lib.get_successful_models( + num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + common_train_lib.get_average_nnet_model( + dir=dir, iter=iter, + nnets_list=" ".join(nnets_list), + run_opts=run_opts, + shrink=shrinkage_value) + + else: + # choose the best model from different jobs + common_train_lib.get_best_nnet_model( + dir=dir, iter=iter, + best_model_index=best_model, + run_opts=run_opts, + shrink=shrinkage_value) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of " + "iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in " + "iteration {1}".format(new_model, iter)) + if os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + +def check_for_required_file(feat_dir, tree_dir, lat_dir): + files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)] + for file in files: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) + + def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, max_lda_jobs=None, rand_prune=4.0, lda_opts=None): @@ -170,16 +433,13 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def prepare_initial_acoustic_model(dir, run_opts): +def prepare_initial_acoustic_model(dir, run_opts, srand=-1): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. 
It will also prepare the acoustic model with the transition model.""" - common_lib.run_kaldi_command( - """{command} {dir}/log/add_first_layer.log \ - nnet3-init --srand=-1 {dir}/init.raw \ - {dir}/configs/layer1.config {dir}/0.raw - """.format(command=run_opts.command, dir=dir)) + common_train_lib.prepare_initial_network(dir, run_opts, + srand=srand) # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as @@ -193,55 +453,6 @@ def prepare_initial_acoustic_model(dir, run_opts): {dir}/0.mdl""".format(command=run_opts.command, dir=dir)) -def combine_models(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, - xent_regularize, run_opts, background_process_handler=None): - """ Function to do model combination - - In the nnet3 setup, the logic - for doing averaging of subsets of the models in the case where - there are too many models to reliably esetimate interpolation - factors (max_models_combine) is moved into the nnet3-combine - """ - - raw_model_strings = [] - for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - model_file = '{0}/{1}.mdl'.format(dir, iter) - if os.path.exists(model_file): - raw_model_strings.append( - '"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) - else: - print("{0}: warning: model file {1} does not exist " - "(final combination)".format(sys.argv[0], model_file)) - - common_lib.run_kaldi_command( - """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters=40 \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} """ - """ "ark,bg:nnet3-chain-merge-egs """ - """--minibatch-size={num_chunk_per_minibatch} """ - """ark:{egs_dir}/combine.cegs ark:-|" - \| \ - nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ - {dir}/final.mdl""".format( - command=run_opts.command, - combine_queue_opt=run_opts.combine_queue_opt, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - dir=dir, raw_models=" ".join(raw_model_strings), - num_chunk_per_minibatch=num_chunk_per_minibatch, - num_iters=num_iters, - egs_dir=egs_dir)) - - # Compute the probability of the final, combined model with - # the same subset we used for the previous compute_probs, as the - # different subsets will lead to different probs. 
- compute_train_cv_probabilities( - dir, 'final', egs_dir, l2_regularize, xent_regularize, - leaky_hmm_coefficient, run_opts, wait=False, - background_process_handler=background_process_handler) - - def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait=False, @@ -299,215 +510,54 @@ def compute_progress(dir, iter, run_opts, wait=False, prev_model=prev_model), wait=wait, background_process_handler=background_process_handler) -# Called from TrainOneIteration, this model does one iteration of training -# with 'num_jobs' jobs, and -# writes files like exp/tdnn_a/24.{1,2,3,..}.raw -def TrainNewModels(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) - - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
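An editorial aside, not part of the patch: both the old TrainNewModels and the new train_new_models pick each job's archive and frame shift from a single running counter, as sketched below. The helper name is hypothetical, and floor division stands in for the Python 2 integer `/` used in the patch.

    def job_to_archive(num_archives_processed, job, num_archives,
                       frame_subsampling_factor):
        k = num_archives_processed + job - 1        # zero-based global index
        archive_index = (k % num_archives) + 1      # 1-based archive to read
        frame_shift = ((archive_index + k // num_archives)
                       % frame_subsampling_factor)
        return archive_index, frame_shift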
- frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor - # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - if job == 1: - cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) - else: - cur_cache_io_opts = cache_io_opts - - process_handle = common_train_lib.RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-chain-train {parallel_train_opts} \ - --apply-deriv-weights={app_deriv_wts} \ - --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - deriv_time_opts = deriv_time_opts, - trunc_deriv = truncate_deriv_weights, - app_deriv_wts = apply_deriv_weights, - fr_shft = frame_shift, l2 = l2_regularize, - xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, - parallel_train_opts = run_opts.parallel_train_opts, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - cache_io_opts = cur_cache_io_opts, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - if stderr_value.strip() != '': - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - - - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, truncate_deriv_weights, - run_opts): - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) - - if iter > 0: - chain_lib.ComputeProgress(dir, iter, run_opts) - - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - cache_io_opts = "" - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - TrainNewModels(dir = dir, iter = iter, srand = srand, num_jobs = num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - raw_model_string = raw_model_string, - egs_dir = egs_dir, - apply_deriv_weights = apply_deriv_weights, - left_deriv_truncate = left_deriv_truncate, - right_deriv_truncate = right_deriv_truncate, - l2_regularize = l2_regularize, - xent_regularize = xent_regularize, - leaky_hmm_coefficient = leaky_hmm_coefficient, - momentum = momentum, - max_param_change = cur_max_param_change, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = cur_num_chunk_per_minibatch, - frame_subsampling_factor = frame_subsampling_factor, - truncate_deriv_weights = truncate_deriv_weights, - cache_io_opts = cache_io_opts, run_opts = run_opts) - - [models_to_average, best_model] = common_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. 
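An editorial aside, not part of the patch: after the parallel jobs finish, the iteration's model is produced either by averaging the successful jobs' outputs or, when averaging is disabled, by keeping the single best job's output, as the commands below do. A sketch with hypothetical names:

    def next_model_inputs(exp_dir, iteration, models_to_average, best_model,
                          do_average):
        # per-job outputs follow the <iter+1>.<job>.raw naming used above
        nnets = ["{0}/{1}.{2}.raw".format(exp_dir, iteration + 1, n)
                 for n in models_to_average]
        if do_average:
            return nnets    # these get averaged with nnet3-average
        return ["{0}/{1}.{2}.raw".format(exp_dir, iteration + 1, best_model)]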
- common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - common_train_lib.RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") +def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts, background_process_handler=None): + """ Function to do model combination - new_model = "{0}/{1}.mdl".format(dir, iter + 1) + In the nnet3 setup, the logic + for doing averaging of subsets of the models in the case where + there are too many models to reliably esetimate interpolation + factors (max_models_combine) is moved into the nnet3-combine. + """ + raw_model_strings = [] + print len(models_to_combine) - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - if os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) + models_to_combine.add(num_iters) -def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): - for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), - '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: - if not os.path.isfile(file): - raise Exception('Expected {0} to exist.'.format(file)) + for iter in models_to_combine: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if os.path.exists(model_file): + raw_model_strings.append( + '"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + print("{0}: warning: model file {1} does not exist " + "(final combination)".format(sys.argv[0], model_file)) + common_lib.run_kaldi_command( + """{command} {combine_queue_opt} {dir}/log/combine.log \ + nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} """ + """ "ark,bg:nnet3-chain-merge-egs """ + """--minibatch-size={num_chunk_per_minibatch} """ + """ark:{egs_dir}/combine.cegs ark:-|" - \| \ + nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ + {dir}/final.mdl""".format( + command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + l2=l2_regularize, leaky=leaky_hmm_coefficient, + dir=dir, raw_models=" ".join(raw_model_strings), + num_chunk_per_minibatch=num_chunk_per_minibatch, + num_iters=num_iters, + egs_dir=egs_dir)) + # Compute the probability of the final, combined model with + # the same subset we 
used for the previous compute_probs, as the + # different subsets will lead to different probs. + compute_train_cv_probabilities( + dir, 'final', egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait=False, + background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index c85efd72ca2..efcbaca8bdc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -15,7 +15,6 @@ import math import re import shutil -import time import libs.common as common_lib @@ -256,12 +255,12 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, return scaled_counts -def prepare_initial_network(dir, run_opts): +def prepare_initial_network(dir, run_opts, srand=-3): common_lib.run_kaldi_command( """{command} {dir}/log/add_first_layer.log \ - nnet3-init --srand=-3 {dir}/init.raw \ + nnet3-init --srand={srand} {dir}/init.raw \ {dir}/configs/layer1.config {dir}/0.raw""".format( - command=run_opts.command, + command=run_opts.command, srand=srand, dir=dir)) @@ -483,19 +482,6 @@ def __init__(self): self.parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', default=8, help="Number of epochs to train the model") - self.parser.add_argument("--trainer.prior-subset-size", type=int, - dest='prior_subset_size', default=20000, - help="Number of samples for computing priors") - self.parser.add_argument("--trainer.num-jobs-compute-prior", type=int, - dest='num_jobs_compute_prior', default=10, - help="The prior computation jobs are single " - "threaded and run on the CPU") - self.parser.add_argument("--trainer.max-models-combine", type=int, - dest='max_models_combine', default=20, - help="""The maximum number of models used in - the final model combination stage. These - models will themselves be averages of - iteration-number ranges""") self.parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', default=5000, help=""" Controls randomization of the samples @@ -556,12 +542,13 @@ def __init__(self): help="Number of neural net jobs to run in " "parallel at the end of training") self.parser.add_argument("--trainer.optimization.max-models-combine", + "--trainer.max-models-combine", type=int, dest='max_models_combine', default=20, - help="""The is the maximum number of models we - give to the final 'combine' stage, but these + help="""The maximum number of models used in + the final model combination stage. These models will themselves be averages of - iteration-number ranges.""") + iteration-number ranges""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index b49a59f5a3b..7400bd6f880 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -184,9 +184,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, "time (srand={1}). 
Using srand={0}.".format( srand, saved_srand)) else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() + with open('{0}/srand'.format(dir), 'w') as f: + f.write(str(srand)) # Sets off some background jobs to compute train and # validation set objectives @@ -206,7 +205,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. cache_read_opt = "" - if (iter > 0 and iter <= (num_hidden_layers-1) * add_layers_period + if (iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and iter % add_layers_period == 0): # if we've just added new hiden layer, don't do averaging but take the @@ -215,8 +214,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, cur_num_hidden_layers = 1 + iter / add_layers_period config_file = "{0}/configs/layer{1}.config".format( - dir, - cur_num_hidden_layers) + dir, cur_num_hidden_layers) if get_raw_nnet_from_am: raw_model_string = ("nnet3-am-copy --raw=true " "--learning-rate={lr} {dir}/{iter}.mdl - | " @@ -277,7 +275,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, background_process_handler=background_process_handler) [models_to_average, best_model] = common_train_lib.get_successful_models( - num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) + num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) @@ -328,6 +326,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs + # Write stats with the same format as stats for LDA. common_lib.run_kaldi_command( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-acc-lda-stats --rand-prune={rand_prune} \ @@ -361,20 +360,22 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, common_lib.run_kaldi_command( """{command} {dir}/log/get_transform.log \ - nnet-get-feature-transform \ - {lda_opts} {dir}/lda.mat {dir}/lda_stats - """.format(command=run_opts.command, dir=dir, - lda_opts=lda_opts if lda_opts is not None else "")) + nnet-get-feature-transform {lda_opts} {dir}/lda.mat \ + {dir}/lda_stats""".format( + command=run_opts.command, dir=dir, + lda_opts=lda_opts if lda_opts is not None else "")) common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def prepare_initial_acoustic_model(dir, alidir, run_opts): +def prepare_initial_acoustic_model(dir, alidir, run_opts, + srand=-3): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - common_lib.prepare_initial_network(dir, run_opts) + common_train_lib.prepare_initial_network(dir, run_opts, + srand=srand) # Convert to .mdl, train the transitions, set the priors. common_lib.run_kaldi_command( @@ -453,8 +454,9 @@ def compute_progress(dir, iter, egs_dir, run_opts, mb_size=256, def combine_models(dir, num_iters, models_to_combine, egs_dir, run_opts, background_process_handler=None, chunk_width=None, get_raw_nnet_from_am=True): - """ - Now do combination. In the nnet3 setup, the logic + """ Function to do model combination + + In the nnet3 setup, the logic for doing averaging of subsets of the models in the case where there are too many models to reliably esetimate interpolation factors (max_models_combine) is moved into the nnet3-combine. 
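An editorial aside, not part of the patch: in both the chain and frame-level combine_models() functions above, the combination stage amounts to handing nnet3-combine the models chosen earlier plus the last iteration's model. A sketch with an illustrative experiment-directory layout and a hypothetical helper name:

    import os

    def combine_input_strings(exp_dir, num_iters, models_to_combine):
        iters = set(models_to_combine)
        iters.add(num_iters)            # always include the final iteration
        raw_model_strings = []
        for it in sorted(iters):
            model_file = '{0}/{1}.mdl'.format(exp_dir, it)
            if os.path.exists(model_file):
                raw_model_strings.append(
                    '"nnet3-am-copy --raw=true {0} -|"'.format(model_file))
        return raw_model_strings        # spliced into the nnet3-combine command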
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index afeb8084e9e..7b2fdbeb5f0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -146,7 +146,9 @@ def get_args(): shrink-nonlinearity type""") # RNN specific trainer options - parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, + parser.add_argument("--trainer.num-chunk-per-minibatch", + "--trainer.rnn.num-chunk-per-minibatch", + type=int, dest='num_chunk_per_minibatch', default=512, help="Number of sequences to be processed in " "parallel every minibatch") @@ -419,6 +421,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): num_archives_processed), shrinkage_value=shrinkage_value, num_chunk_per_minibatch=args.num_chunk_per_minibatch, + chunk_width=args.chunk_width, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, apply_deriv_weights=args.apply_deriv_weights, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 77a37d52fb2..8206a0bef42 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -54,6 +54,16 @@ def get_args(): default=8, help="Number of output labels per example") + # trainer options + parser.add_argument("--trainer.prior-subset-size", type=int, + dest='prior_subset_size', default=20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, + dest='num_jobs_compute_prior', default=10, + help="The prior computation jobs are single " + "threaded and run on the CPU") + + # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default=512, help="Size of the minibatch used to compute the " diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 8eefe174f99..ab7740a9292 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -12,7 +12,6 @@ import logging import pprint import os -import subprocess import sys import traceback @@ -54,6 +53,16 @@ def get_args(): default=8, help="Number of output labels per example") + # trainer options + parser.add_argument("--trainer.prior-subset-size", type=int, + dest='prior_subset_size', default=20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, + dest='num_jobs_compute_prior', default=10, + help="The prior computation jobs are single " + "threaded and run on the CPU") + + # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default=512, help="Size of the minibatch used to compute the " @@ -149,11 +158,6 @@ def train(args, run_opts, background_process_handler): feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) - # split the training data into parts for individual jobs - common_lib.split_data(args.feat_dir, num_jobs) - with open('{0}/num_jobs'.format(args.dir), 'w') as f: - f.write(str(num_jobs)) - config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) @@ -200,8 +204,9 @@ def train(args, run_opts, background_process_handler): except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined " "in {1}".format( - str(e), '{0}/configs'.format(args.dir))) - if 
common_lib.get_feat_dim_from_scp(targets_scp) != num_targets: + str(e), '{0}/configs'.format(args.dir))) + if (common_lib.get_feat_dim_from_scp(args.targets_scp) + != num_targets): raise Exception("Mismatch between num-targets provided to " "script vs configs") else: @@ -252,7 +257,6 @@ def train(args, run_opts, background_process_handler): max_lda_jobs=args.max_lda_jobs, rand_prune=args.rand_prune) - if (args.stage <= -1): logger.info("Preparing the initial network.") common_train_lib.prepare_initial_network(args.dir, run_opts) @@ -351,7 +355,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - avg_post_vec_file = train_lib.common.compute_average_posterior( + train_lib.common.compute_average_posterior( args.dir, 'final', egs_dir, num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am=False) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 33655c8390e..7df33509661 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -13,7 +13,6 @@ import logging import pprint import os -import subprocess import sys import traceback @@ -67,27 +66,38 @@ def get_args(): parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', default=40, help="""Number of left steps used in the estimation of - LSTM state before prediction of the first label""") + LSTM state before prediction of the first label. + Overrides the default value in CommonParser""") + # trainer options parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each archive. Each eg has 'chunk_width' frames in it-- for chunk_width=20, this value (20k) is equivalent to the 400k number that we use as a default in - regular DNN training.""") + regular DNN training. + Overrides the default value in CommonParser.""") + parser.add_argument("--trainer.prior-subset-size", type=int, + dest='prior_subset_size', default=20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, + dest='num_jobs_compute_prior', default=10, + help="The prior computation jobs are single " + "threaded and run on the CPU") # Parameters for the optimization parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.5, help="""Momentum used in update computation. Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") + it doesn't increase the effective learning rate. 
+ Overrides the default value in CommonParser""") parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', default=0.99, help="""Scaling factor used for scaling the parameter matrices when the derivative averages are below the - shrink-threshold at the non-linearities") + shrink-threshold at the non-linearities""") parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', default=0.15, help="""If the derivative averages are below this @@ -104,7 +114,7 @@ def get_args(): parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=100, help="Number of sequences to be processed in " - "parallel every minibatch" ) + "parallel every minibatch") parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, help="""The number of time steps to back-propagate from @@ -136,6 +146,7 @@ def get_args(): return [args, run_opts] + def process_args(args): """ Process the options got from get_args() """ @@ -206,11 +217,6 @@ def train(args, run_opts, background_process_handler): feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) - # split the training data into parts for individual jobs - common_lib.split_data(args.feat_dir, num_jobs) - with open('{0}/num_jobs'.format(args.dir), 'w') as f: - f.write(str(num_jobs)) - config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) @@ -257,8 +263,9 @@ def train(args, run_opts, background_process_handler): except KeyError as e: raise Exception("KeyError {0}: Variables need to be defined " "in {1}".format( - str(e), '{0}/configs'.format(args.dir))) - if common_lib.get_feat_dim_from_scp(targets_scp) != num_targets: + str(e), '{0}/configs'.format(args.dir))) + if (common_lib.get_feat_dim_from_scp(args.targets_scp) + != num_targets): raise Exception("Mismatch between num-targets provided to " "script vs configs") else: @@ -309,7 +316,6 @@ def train(args, run_opts, background_process_handler): max_lda_jobs=args.max_lda_jobs, rand_prune=args.rand_prune) - if (args.stage <= -1): logger.info("Preparing the initial network.") common_train_lib.prepare_initial_network(args.dir, run_opts) @@ -368,8 +374,9 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): ) logger.info("On iteration {0}, learning rate is {1} and " "shrink value is {2}.".format( - iter, learning_rate(iter, current_num_jobs, - num_archives_processed), shrinkage_value)) + iter, learning_rate(iter, current_num_jobs, + num_archives_processed), + shrinkage_value)) train_lib.common.train_one_iteration( dir=args.dir, @@ -428,7 +435,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if include_log_softmax and args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - avg_post_vec_file = train_lib.common.compute_average_posterior( + train_lib.common.compute_average_posterior( args.dir, 'final', egs_dir, num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am=False) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 216f38f0ae8..85644219711 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -75,14 +75,23 @@ def get_args(): archive. 
Each eg has 'chunk_width' frames in it-- for chunk_width=20, this value (20k) is equivalent to the 400k number that we use as a default in - regular DNN training.""") + regular DNN training. + Overrides the default value in CommonParser.""") + parser.add_argument("--trainer.prior-subset-size", type=int, + dest='prior_subset_size', default=20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, + dest='num_jobs_compute_prior', default=10, + help="The prior computation jobs are single " + "threaded and run on the CPU") # Parameters for the optimization parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.5, help="""Momentum used in update computation. Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") + it doesn't increase the effective learning rate. + Overrides the default value in CommonParser""") parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', default=0.99, help="""Scaling factor used for scaling the parameter From 3c100d3b7642b532263492d53a86bed839b6c6fc Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 17 Nov 2016 10:34:31 -0500 Subject: [PATCH 25/71] raw_python_script: Reorganizing scripts --- egs/wsj/s5/steps/libs/common.py | 65 ++++++-- .../nnet3/train/chain_objf/acoustic_model.py | 75 +++++---- egs/wsj/s5/steps/libs/nnet3/train/common.py | 10 +- .../nnet3/train/frame_level_objf/common.py | 144 +++++++++++------- egs/wsj/s5/steps/nnet3/chain/train.py | 43 ++++-- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 14 +- egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 22 +-- egs/wsj/s5/steps/nnet3/train_dnn.py | 27 ++-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 24 +-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 24 +-- egs/wsj/s5/steps/nnet3/train_rnn.py | 24 +-- 11 files changed, 302 insertions(+), 170 deletions(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index c966b941ee1..58c12c46216 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -115,10 +115,10 @@ def next(self): if self.__current is None: raise StopIteration() - data = self.__current.data + node = self.__current self.__current = self.__current.next_node - return data + return node class LinkedList(): @@ -126,6 +126,7 @@ class LinkedList(): def __init__(self): self.__head = None self.__tail = None + self.__size = 0 def __iter__(self): return LinkedListIterator(self.__head) @@ -133,13 +134,21 @@ def __iter__(self): def push(self, node): """Pushes the node at the "front" of the linked list """ + if self.__head == None: + self.__head = node + return node.next_node = self.__head node.previous_node = None self.__head.previous_node = node self.__head = node + self.__size += 1 def pop(self): """Pops the last node out of the list""" + + if self.__tail is None: + return None + old_last_node = self.__tail to_be_last = self.__tail.previous_node to_be_last.next_node = None @@ -147,6 +156,7 @@ def pop(self): # Set the last node to the "to_be_last" self.__tail = to_be_last + self.__size -= 1 return old_last_node @@ -157,14 +167,23 @@ def remove(self, node): next_node = node.next_node previous_node = node.previous_node - previous_node.next_node = next_node - next_node.previous_node = previous_node + if previous_node is not None: + previous_node.next_node = next_node + + if next_node is not None: + next_node.previous_node = previous_node # Make it "free" node.next_node = 
node.previous_node = None + self.__size -= 1 return node + def size(): + return self.__size + + def is_not_empty(): + return self.__size != 0 class BackgroundProcessHandler(): """ This class handles background processes to ensure that a top-level @@ -185,19 +204,35 @@ class BackgroundProcessHandler(): def __init__(self, polling_time=600): self.__process_queue = LinkedList() self.__polling_time = polling_time - self.poll() + self.__timer = None + self.__is_running = False + + def __run(): + self.__is_running = False + if self.poll(): + self.start() + + def start(self): + if not self.__is_running: + self.__timer = Timer(self.__polling_time, self.__run()) + self.__timer.start() + self.__is_running = True + + def stop(self): + self.__timer.cancel() + self.__is_running = False def poll(self): for n in self.__process_queue: if self.is_process_done(n.data): - self.ensure_process_is_done(n.data) - self.__process_queue.remove(n) - threading.Timer(self.__polling_time, self.poll).start() + self.ensure_process_is_done(n) + return self.__process_queue.is_not_empty() def add_process(self, t): """ Add a (process handle, command) tuple to the queue """ - self.__process_queue.Push(ListNode(data=t)) + self.__process_queue.push(ListNode(data=t)) + self.start() def is_process_done(self, t): p, command = t @@ -205,15 +240,21 @@ def is_process_done(self, t): return False return True - def ensure_process_is_done(self, t): - p, command = t + def ensure_process_is_done(self, n): + p, command = n.data + logger.info("Waiting for process '{0}' to end".format(command)) [stdout, stderr] = p.communicate() if p.returncode is not 0: raise KaldiCommandException(command, stderr) + self.__process_queue.remove(n) def ensure_processes_are_done(self): for n in self.__process_queue: - self.ensure_process_is_done(n.data) + self.ensure_process_is_done(n) + self.stop() + + def __del__(self): + self.stop() def run_kaldi_command(command, wait=True, background_process_handler=None): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 60df647ff83..9d0391954a5 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -121,7 +121,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, def train_new_models(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, + raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, chunk_width, left_deriv_truncate, right_deriv_truncate, l2_regularize, xent_regularize, leaky_hmm_coefficient, @@ -175,13 +175,14 @@ def train_new_models(dir, iter, srand, num_jobs, --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs """ - """--truncate-deriv-weights={trunc_deriv} """ - """--frame-shift={fr_shft} """ - """ark:{egs_dir}/cegs.{archive_index}.ark ark:- | """ - """nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} """ - """--srand={srand} ark:- ark:- | nnet3-chain-merge-egs """ - """--minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + "ark,bg:nnet3-chain-copy-egs \ + --left-context={lc} --right-context={rc} \ + --truncate-deriv-weights={trunc_deriv} \ + --frame-shift={fr_shft} \ + ark:{egs_dir}/cegs.{archive_index}.ark ark:- | \ + nnet3-chain-shuffle-egs --buffer-size={buf_size} \ + --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \ + 
--minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, train_queue_opt=run_opts.train_queue_opt, @@ -196,9 +197,9 @@ def train_new_models(dir, iter, srand, num_jobs, momentum=momentum, max_param_change=max_param_change, raw_model=raw_model_string, egs_dir=egs_dir, archive_index=archive_index, - shuffle_buffer_size=shuffle_buffer_size, + buf_size=shuffle_buffer_size, cache_io_opts=cur_cache_io_opts, - num_chunk_per_minibatch=num_chunk_per_minibatch), + num_chunk_per_mb=num_chunk_per_minibatch), wait=False, background_process_handler=background_process_handler) @@ -224,6 +225,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, learning_rate, shrinkage_value, num_chunk_per_minibatch, chunk_width, num_hidden_layers, add_layers_period, + left_context, right_context, apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, l2_regularize, xent_regularize, @@ -259,8 +261,10 @@ def train_one_iteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives compute_train_cv_probabilities( - dir, iter, egs_dir, l2_regularize, xent_regularize, - leaky_hmm_coefficient, run_opts, + dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, + leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, background_process_handler=background_process_handler) if iter > 0: @@ -312,6 +316,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, num_archives=num_archives, raw_model_string=raw_model_string, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, apply_deriv_weights=apply_deriv_weights, chunk_width=chunk_width, left_deriv_truncate=left_deriv_truncate, @@ -368,7 +373,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, os.remove("{0}/cache.{1}".format(dir, iter)) -def check_for_required_file(feat_dir, tree_dir, lat_dir): +def check_for_required_files(feat_dir, tree_dir, lat_dir): files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), @@ -453,11 +458,11 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): {dir}/0.mdl""".format(command=run_opts.command, dir=dir)) -def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, +def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, + right_context, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait=False, background_process_handler=None): - model = '{0}/{1}.mdl'.format(dir, iter) common_lib.run_kaldi_command( @@ -465,10 +470,11 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs """ - """ark:{egs_dir}/valid_diagnostic.cegs ark:- |" - """.format(command=run_opts.command, - dir=dir, iter=iter, model=model, + "ark,bg:nnet3-chain-copy-egs --left-context={lc} \ + --right-context={rc} ark:{egs_dir}/valid_diagnostic.cegs \ + ark:- | nnet3-chain-merge-egs ark:- ark:- |" \ + """.format(command=run_opts.command, dir=dir, iter=iter, model=model, + lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, egs_dir=egs_dir), wait=wait, @@ -479,12 +485,11 @@ 
def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs """ - """ark:{egs_dir}/train_diagnostic.cegs ark:- |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, + "ark,bg:nnet3-chain-copy-egs --left-context={lc} \ + --right-context={rc} ark:{egs_dir}/train_diagnostic.cegs \ + ark:- | nnet3-chain-merge-egs ark:- ark:- |" \ + """.format(command=run_opts.command, dir=dir, iter=iter, model=model, + lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, xent_reg=xent_regularize, egs_dir=egs_dir), wait=wait, @@ -512,7 +517,8 @@ def compute_progress(dir, iter, run_opts, wait=False, def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, left_context, right_context, + leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, background_process_handler=None): """ Function to do model combination @@ -540,14 +546,16 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} """ - """ "ark,bg:nnet3-chain-merge-egs """ - """--minibatch-size={num_chunk_per_minibatch} """ - """ark:{egs_dir}/combine.cegs ark:-|" - \| \ + --verbose=3 {dir}/den.fst {raw_models} \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} \ + --right-context={rc} ark:{egs_dir}/combine.cegs ark:- | \ + nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ + ark:- ark:- |" - \| \ nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ {dir}/final.mdl""".format( command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, + lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), num_chunk_per_minibatch=num_chunk_per_minibatch, @@ -558,6 +566,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. compute_train_cv_probabilities( - dir, 'final', egs_dir, l2_regularize, xent_regularize, - leaky_hmm_coefficient, run_opts, wait=False, + dir=dir, iter='final', egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + l2_regularize=l2_regularize, xent_regularize=xent_regularize, + leaky_hmm_coefficient=leaky_hmm_coefficient, + run_opts=run_opts, wait=False, background_process_handler=background_process_handler) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index efcbaca8bdc..e960e6bf2b8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -417,9 +417,9 @@ class CommonParser: in steps/nnet3/train*.py and steps/nnet3/chain/train.py """ - def __init__(self): - self.parser = argparse.ArgumentParser(add_help=False) + parser = argparse.ArgumentParser(add_help=False) + def __init__(self): # feat options self.parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', default=None, @@ -612,3 +612,9 @@ def __init__(self): iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent""") + self.parser.add_argument("--background-polling-time", + dest="background_polling_time", + type=float, default=60, + help="""Polling frequency in seconds at which + the background process handler checks for + errors in the processes.""") diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 7400bd6f880..d19fd61670f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -190,14 +190,19 @@ def train_one_iteration(dir, iter, srand, egs_dir, # Sets off some background jobs to compute train and # validation set objectives compute_train_cv_probabilities( - dir, iter, egs_dir, run_opts, + dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts, mb_size=cv_minibatch_size, get_raw_nnet_from_am=get_raw_nnet_from_am, wait=False, background_process_handler=background_process_handler) if iter > 0: # Runs in the background - compute_progress(dir, iter, egs_dir, run_opts, + compute_progress(dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, + right_context=right_context, + run_opts=run_opts, mb_size=cv_minibatch_size, wait=False, get_raw_nnet_from_am=get_raw_nnet_from_am, background_process_handler=background_process_handler) @@ -263,13 +268,15 @@ def train_one_iteration(dir, iter, srand, egs_dir, except OSError: pass - train_new_models(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_minibatch_size, - cache_read_opt, run_opts, + train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + raw_model_string=raw_model_string, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + momentum=momentum, max_param_change=cur_max_param_change, + shuffle_buffer_size=shuffle_buffer_size, + minibatch_size=cur_minibatch_size, + cache_read_opt=cache_read_opt, run_opts=run_opts, frames_per_eg=frames_per_eg, min_deriv_time=min_deriv_time, background_process_handler=background_process_handler) @@ -387,44 +394,52 @@ def prepare_initial_acoustic_model(dir, alidir, run_opts, dir=dir, alidir=alidir)) -def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, mb_size=256, +def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, + right_context, run_opts, mb_size=256, wait=False, background_process_handler=None, get_raw_nnet_from_am=True): - if get_raw_nnet_from_am: model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format( dir=dir, iter=iter) else: model = "{dir}/{iter}.raw".format(dir=dir, iter=iter) + context_opts = "--left-context={lc} --right-context={rc}".format( + lc=left_context, rc=right_context) + common_lib.run_kaldi_command( """ {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob "{model}" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} """ - """ark:{egs_dir}/valid_diagnostic.egs ark:- |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - mb_size=mb_size, - model=model, - egs_dir=egs_dir), wait=wait, - background_process_handler=background_process_handler) + "ark,bg:nnet3-copy-egs {context_opts} \ + ark:{egs_dir}/valid_diagnostic.egs ark:- | \ + nnet3-merge-egs 
--minibatch-size={mb_size} ark:- \ + ark:- |" """.format(command=run_opts.command, + dir=dir, + iter=iter, + context_opts=context_opts, + mb_size=mb_size, + model=model, + egs_dir=egs_dir), + wait=wait, background_process_handler=background_process_handler) common_lib.run_kaldi_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-compute-prob "{model}" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} """ - """ark:{egs_dir}/train_diagnostic.egs ark:- |" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - mb_size=mb_size, - model=model, - egs_dir=egs_dir), wait=wait, - background_process_handler=background_process_handler) - - -def compute_progress(dir, iter, egs_dir, run_opts, mb_size=256, + "ark,bg:nnet3-copy-egs {context_opts} \ + ark:{egs_dir}/train_diagnostic.egs ark:- | \ + nnet3-merge-egs --minibatch-size={mb_size} ark:- \ + ark:- |" """.format(command=run_opts.command, + dir=dir, + iter=iter, + context_opts=context_opts, + mb_size=mb_size, + model=model, + egs_dir=egs_dir), + wait=wait, background_process_handler=background_process_handler) + + +def compute_progress(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, background_process_handler=None, wait=False, get_raw_nnet_from_am=True): if get_raw_nnet_from_am: @@ -435,23 +450,28 @@ def compute_progress(dir, iter, egs_dir, run_opts, mb_size=256, prev_model = '{0}/{1}.raw'.format(dir, iter - 1) model = '{0}/{1}.raw'.format(dir, iter) + context_opts = "--left-context={lc} --right-context={rc}".format( + lc=left_context, rc=right_context) + common_lib.run_kaldi_command( """{command} {dir}/log/progress.{iter}.log \ nnet3-info {model} '&&' \ nnet3-show-progress --use-gpu=no {prev_model} {model} \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} """ - """ark:{egs_dir}/train_diagnostic.egs ark:-|" - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - mb_size=mb_size, - prev_model=prev_model, - egs_dir=egs_dir), wait=wait, - background_process_handler=background_process_handler) + "ark,bg:nnet3-copy-egs {context_opts} \ + ark:{egs_dir}/train_diagnostic.egs ark:- | \ + nnet3-merge-egs --minibatch-size={mb_size} ark:- \ + ark:- |" """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + mb_size=mb_size, + prev_model=prev_model, + egs_dir=egs_dir), + wait=wait, background_process_handler=background_process_handler) def combine_models(dir, num_iters, models_to_combine, egs_dir, + left_context, right_context, run_opts, background_process_handler=None, chunk_width=None, get_raw_nnet_from_am=True): """ Function to do model combination @@ -491,17 +511,23 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, else: out_model = '{dir}/final.raw'.format(dir=dir) + context_opts = "--left-context={lc} --right-context={rc}".format( + lc=left_context, rc=right_context) + common_lib.run_kaldi_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 {raw_models} \ - "ark,bg:nnet3-merge-egs --measure-output-frames=false """ - """--minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + "ark,bg:nnet3-copy-egs {context_opts} \ + ark:{egs_dir}/combine.egs ark:- | \ + nnet3-merge-egs --measure-output-frames=false \ + --minibatch-size={mbsize} ark:- ark:- |" \ "{out_model}" """.format(command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, dir=dir, raw_models=" ".join(raw_model_strings), + 
context_opts=context_opts, mbsize=mbsize, out_model=out_model, egs_dir=egs_dir)) @@ -511,11 +537,15 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, # different subsets will lead to different probs. if get_raw_nnet_from_am: compute_train_cv_probabilities( - dir, 'combined', egs_dir, run_opts, wait=False, + dir=dir, iter='combined', egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts, wait=False, background_process_handler=background_process_handler) else: compute_train_cv_probabilities( - dir, 'final', egs_dir, run_opts, wait=False, + dir=dir, iter='final', egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts, wait=False, background_process_handler=background_process_handler, get_raw_nnet_from_am=False) @@ -583,8 +613,8 @@ def align(dir, data, lang, run_opts, iter=None, transform_dir=None, def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, - prior_subset_size, num_archives, run_opts, - transform_dir=None, online_ivector_dir=None): + prior_subset_size, num_archives, left_context, right_context, + run_opts, transform_dir=None, online_ivector_dir=None): raise Exception("Realignment stage has not been implemented in nnet3") logger.info("Getting average posterior for purposes of adjusting " "the priors.") @@ -593,8 +623,10 @@ def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, # we're using different random subsets of it. avg_post_vec_file = compute_average_posterior( - dir, iter, prev_egs_dir, - num_archives, prior_subset_size, run_opts) + dir=dir, iter=iter, egs_dir=prev_egs_dir, + num_archives=num_archives, prior_subset_size=prior_subset_size, + left_context=left_context, right_context=right_context, + run_opts=run_opts) avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) logger.info("Re-adjusting priors based on computed posteriors") @@ -627,8 +659,8 @@ def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, def compute_average_posterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts, - get_raw_nnet_from_am=True): + prior_subset_size, left_context, right_context, + run_opts, get_raw_nnet_from_am=True): """ Computes the average posterior of the network Note: this just uses CPUs, using a smallish subset of data. """ @@ -645,11 +677,16 @@ def compute_average_posterior(dir, iter, egs_dir, num_archives, else: model = "{dir}/final.raw".format(dir=dir) + context_opts = "--left-context={lc} --right-context={rc}".format( + lc=left_context, rc=right_context) + common_lib.run_kaldi_command( """{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} \ {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} \ + nnet3-copy-egs {context_opts} \ ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} \ + ark:- ark:- \| \ nnet3-merge-egs --measure-output-frames=true \ --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ @@ -662,6 +699,7 @@ def compute_average_posterior(dir, iter, egs_dir, num_archives, prior_queue_opt=run_opts.prior_queue_opt, iter=iter, prior_subset_size=prior_subset_size, egs_dir=egs_dir, egs_part=egs_part, + context_opts=context_opts, prior_gpu_opt=run_opts.prior_gpu_opt)) # make sure there is time for $dir/post.{iter}.*.vec to appear. 
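Note: the hunks above all apply one pattern -- each diagnostic, combination, progress and prior-computation command now receives left_context and right_context explicitly and prepends nnet3-copy-egs to the egs pipeline before nnet3-merge-egs. The sketch below is only an illustrative distillation of that pattern, not code from the patch; the helper name build_diagnostic_egs_rspecifier is hypothetical, and the only binary options assumed (--left-context, --right-context, --minibatch-size) are the ones already used in the hunks above.

    # Illustrative sketch only (hypothetical helper, not part of this patch):
    # builds the 'ark,bg:' rspecifier string the refactored helpers pass to
    # nnet3-compute-prob / nnet3-combine, with the context given at read time.
    def build_diagnostic_egs_rspecifier(egs_dir, left_context, right_context,
                                        mb_size=256, subset="valid_diagnostic"):
        """Return an 'ark,bg:' rspecifier reading {subset}.egs through
        nnet3-copy-egs (with the requested context) and nnet3-merge-egs."""
        context_opts = "--left-context={lc} --right-context={rc}".format(
            lc=left_context, rc=right_context)
        return ("ark,bg:nnet3-copy-egs {context_opts} "
                "ark:{egs_dir}/{subset}.egs ark:- | "
                "nnet3-merge-egs --minibatch-size={mb_size} "
                "ark:- ark:- |".format(context_opts=context_opts,
                                       egs_dir=egs_dir, subset=subset,
                                       mb_size=mb_size))

Presumably the point of supplying the context as an option at read time, rather than relying on whatever context the archives were dumped with, is that the same egs can then serve setups whose effective context differs; that motivation is inferred, not stated in the patch.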
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 7b2fdbeb5f0..b5cdbcd2832 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -47,7 +47,7 @@ def get_args(): objective function.""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser.parser]) + parents=[common_train_lib.CommonParser().parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -125,7 +125,7 @@ def get_args(): default=0.00002, help="Learning rate used during the final iteration") parser.add_argument("--trainer.optimization.shrink-value", type=float, - dest='shrink_value', default=0.99, + dest='shrink_value', default=1.0, help="""Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities""") @@ -198,7 +198,7 @@ def process_args(args): if args.transform_dir is None: args.transform_dir = args.lat_dir # set the options corresponding to args.use_gpu - run_opts = common_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: if not common_lib.check_if_cuda_compiled(): logger.warning( @@ -244,7 +244,7 @@ def train(args, run_opts, background_process_handler): args.lat_dir) # Set some variables. - num_jobs = common_lib.get_number_of_leaves_from_tree(args.tree_dir) + num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) @@ -302,7 +302,7 @@ def train(args, run_opts, background_process_handler): # this is where get_egs.sh is called. chain_lib.generate_chain_egs( dir=args.dir, data=args.feat_dir, - latdir=args.lat_dir, egs_dir=default_egs_dir, + lat_dir=args.lat_dir, egs_dir=default_egs_dir, left_context=left_context + args.frame_subsampling_factor/2, right_context=right_context + args.frame_subsampling_factor/2, valid_left_context=(left_context + args.frame_subsampling_factor/2 @@ -375,11 +375,12 @@ def train(args, run_opts, background_process_handler): args.num_jobs_final) def learning_rate(iter, current_num_jobs, num_archives_processed): - common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + return common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -424,6 +425,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): chunk_width=args.chunk_width, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, + left_context=left_context, + right_context=right_context, apply_deriv_weights=args.apply_deriv_weights, left_deriv_truncate=args.left_deriv_truncate, right_deriv_truncate=args.right_deriv_truncate, @@ -450,7 +453,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if iter % reporting_iter_interval == 0: # lets do some reporting [report, times, data] = ( - nnet3_log_parse.generate_accuracy_report(args.dir)) + nnet3_log_parse.generate_accuracy_report( + args.dir, "log-probability")) message = report subject = ("Update : Expt {dir} : " "Iter {iter}".format(dir=args.dir, iter=iter)) @@ -461,10 
+465,15 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") chain_lib.combine_models( - args.dir, num_iters, models_to_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts, + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, + num_chunk_per_minibatch=args.num_chunk_per_minibatch, + egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + leaky_hmm_coefficient=args.leaky_hmm_coefficient, + l2regularize=args.l2_regularize, + xent_regularize=args.xent_regularize, + run_opts=run_opts, background_process_handler=background_process_handler) if args.cleanup: @@ -482,7 +491,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs=remove_egs) # do some reporting - [report, times, data] = nnet3_log_parse.generate_accuracy_report(args.dir) + [report, times, data] = nnet3_log_parse.generate_accuracy_report( + args.dir, "log-probability") if args.email is not None: common_lib.send_mail(report, "Update : Expt {0} : " "complete".format(args.dir), args.email) @@ -507,6 +517,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() + background_process_handler.stop() raise e diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index d7fa9ea5983..32551ca253f 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -9,7 +9,7 @@ import imp nodes = imp.load_source('nodes', 'steps/nnet3/components.py') -common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') +import libs.common as common_lib def GetArgs(): # we add compulsary arguments as named arguments for readability @@ -47,7 +47,7 @@ def GetArgs(): help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", default=0.0) - parser.add_argument("--include-log-softmax", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--include-log-softmax", type=str, action=common_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) parser.add_argument("--max-change-per-component", type=float, help="Enforces per-component max change (except for the final affine layer). " @@ -75,7 +75,7 @@ def GetArgs(): help="options to be supplied to NaturalGradientAffineComponent", default="") # Gradient clipper options - parser.add_argument("--norm-based-clipping", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--norm-based-clipping", type=str, action=common_lib.StrToBoolAction, help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) parser.add_argument("--clipping-threshold", type=float, help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) @@ -107,15 +107,15 @@ def CheckArgs(args): ## Check arguments. 
if args.feat_dir is not None: - args.feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + args.feat_dim = common_lib.get_feat_dim(args.feat_dir) if args.ali_dir is not None: - args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.ali_dir) + args.num_targets = common_lib.get_number_of_leaves_from_tree(args.ali_dir) elif args.tree_dir is not None: - args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.tree_dir) + args.num_targets = common_lib.get_number_of_leaves_from_tree(args.tree_dir) if args.ivector_dir is not None: - args.ivector_dim = common_train_lib.GetIvectorDim(args.ivector_dir) + args.ivector_dim = common_lib.get_ivector_dim(args.ivector_dir) if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 9278c249ad5..29c95dc88cd 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -12,7 +12,7 @@ import ast nodes = imp.load_source('', 'steps/nnet3/components.py') -common_train_lib = imp.load_source('ntl', 'steps/nnet3/libs/common_train_lib.py') +import libs.common as common_lib def GetArgs(): # we add compulsary arguments as named arguments for readability @@ -64,16 +64,16 @@ def GetArgs(): "If CNN layers are used the first set of splice indexes will be used as input " "to the first CNN layer and later splice indexes will be interpreted as indexes " "for the TDNNs.") - parser.add_argument("--add-lda", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--add-lda", type=str, action=common_lib.StrToBoolAction, help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. " "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) - parser.add_argument("--include-log-softmax", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--include-log-softmax", type=str, action=common_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) - parser.add_argument("--add-final-sigmoid", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--add-final-sigmoid", type=str, action=common_lib.StrToBoolAction, help="add a final sigmoid layer as alternate to log-softmax-layer. " "Can only be used if include-log-softmax is false. 
" "This is useful in cases where you want the output to be " @@ -88,7 +88,7 @@ def GetArgs(): help="For chain models, if nonzero, add a separate output for cross-entropy " "regularization (with learning-rate-factor equal to the inverse of this)", default=0.0) - parser.add_argument("--xent-separate-forward-affine", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--xent-separate-forward-affine", type=str, action=common_lib.StrToBoolAction, help="if using --xent-regularize, gives it separate last-but-one weight matrix", default=False, choices = ["false", "true"]) parser.add_argument("--final-layer-normalize-target", type=float, @@ -118,7 +118,7 @@ def GetArgs(): help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) - parser.add_argument("--use-presoftmax-prior-scale", type=str, action=common_train_lib.StrToBoolAction, + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=common_lib.StrToBoolAction, help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) parser.add_argument("config_dir", @@ -137,15 +137,15 @@ def CheckArgs(args): ## Check arguments. if args.feat_dir is not None: - args.feat_dim = common_train_lib.GetFeatDim(args.feat_dir) + args.feat_dim = common_lib.get_feat_dim(args.feat_dir) if args.ali_dir is not None: - args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.ali_dir) + args.num_targets = common_lib.get_number_of_leaves_from_tree(args.ali_dir) elif args.tree_dir is not None: - args.num_targets = common_train_lib.GetNumberOfLeavesFromTree(args.tree_dir) + args.num_targets = common_lib.get_number_of_leaves_from_tree(args.tree_dir) if args.ivector_dir is not None: - args.ivector_dim = common_train_lib.GetIvectorDim(args.ivector_dir) + args.ivector_dim = common_lib.get_ivector_dim(args.ivector_dir) if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") @@ -238,7 +238,7 @@ def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, c cnn_args = ParseCnnString(cnn_layer) num_cnn_layers = len(cnn_args) # We use an Idct layer here to convert MFCC to FBANK features - common_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + common_lib.write_idct_matrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") prev_layer_output = {'descriptor': "input", 'dimension': feat_dim} prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 8206a0bef42..14cd0ccf8ed 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -279,11 +279,12 @@ def train(args, run_opts, background_process_handler): args.num_jobs_final) def learning_rate(iter, current_num_jobs, num_archives_processed): - common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + return common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -346,16 +347,21 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= 
num_iters: logger.info("Doing final combination to produce final.mdl") train_lib.common.combine_models( - args.dir, num_iters, models_to_combine, - egs_dir, run_opts, + dir=args.dir, num_iter=num_iters, + models_to_combine=models_to_combine, + egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts, background_process_handler=background_process_handler) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") avg_post_vec_file = train_lib.common.compute_average_posterior( - args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + dir=args.dir, iter='combined', egs_dir=egs_dir, + num_archives=num_archives, + left_context=left_context, right_context=right_context, + prior_subset_size=args.prior_subset_size, run_opts=run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir=args.dir) @@ -374,7 +380,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs = False common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + dir=args.dir, num_iters=num_iters, egs_dir=egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) @@ -404,6 +410,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() + background_process_handler.timer.cancel() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index ab7740a9292..c38551cae4d 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -278,11 +278,12 @@ def train(args, run_opts, background_process_handler): args.num_jobs_final) def learning_rate(iter, current_num_jobs, num_archives_processed): - common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + return common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) @@ -347,8 +348,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") train_lib.common.combine_models( - args.dir, num_iters, models_to_combine, - egs_dir, run_opts, + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts, background_process_handler=background_process_handler, get_raw_nnet_from_am=False) @@ -356,8 +359,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): logger.info("Getting average posterior for purposes of " "adjusting the priors.") train_lib.common.compute_average_posterior( - args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, + dir=args.dir, iter='final', egs_dir=egs_dir, + num_archives=num_archives, + left_context=left_context, right_context=right_context, + prior_subset_size=args.prior_subset_size, run_opts=run_opts, get_raw_nnet_from_am=False) if args.cleanup: @@ -401,6 +406,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) 
traceback.print_exc() + background_process_handler.timer.cancel() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 7df33509661..67ca4d072f5 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -336,11 +336,12 @@ def train(args, run_opts, background_process_handler): args.num_jobs_final) def learning_rate(iter, current_num_jobs, num_archives_processed): - common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + return common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -427,8 +428,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= num_iters: logger.info("Doing final combination to produce final.raw") train_lib.common.combine_models( - args.dir, num_iters, models_to_combine, - egs_dir, run_opts, chunk_width=args.chunk_width, + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts, chunk_width=args.chunk_width, background_process_handler=background_process_handler, get_raw_nnet_from_am=False) @@ -436,8 +439,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): logger.info("Getting average posterior for purposes of " "adjusting the priors.") train_lib.common.compute_average_posterior( - args.dir, 'final', egs_dir, - num_archives, args.prior_subset_size, run_opts, + dir=args.dir, iter='final', egs_dir=egs_dir, + num_archives=num_archives, + left_context=left_context, right_context=right_context, + prior_subset_size=args.prior_subset_size, run_opts=run_opts, get_raw_nnet_from_am=False) if args.cleanup: @@ -481,6 +486,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() + background_process_handler.timer.cancel() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 85644219711..ce428e86fb5 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -336,11 +336,12 @@ def train(args, run_opts, background_process_handler): args.num_jobs_final) def learning_rate(iter, current_num_jobs, num_archives_processed): - common_train_lib.get_learning_rate(iter, current_num_jobs, num_iters, - num_archives_processed, - num_archives_to_process, - args.initial_effective_lrate, - args.final_effective_lrate) + return common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -424,8 +425,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") train_lib.common.combine_models( - args.dir, num_iters, models_to_combine, - egs_dir, run_opts, + dir=args.dir, num_iters=num_iters, + models_to_combine=models_to_combine, egs_dir=egs_dir, + run_opts=run_opts, + left_context=left_context, right_context=right_context, 
background_process_handler=background_process_handler, chunk_width=args.chunk_width) @@ -433,8 +436,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): logger.info("Getting average posterior for purposes of " "adjusting the priors.") avg_post_vec_file = train_lib.common.compute_average_posterior( - args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + dir=args.dir, iter='combined', egs_dir=egs_dir, + num_archives=num_archives, + left_context=left_context, right_context=right_context, + prior_subset_size=args.prior_subset_size, run_opts=run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir=args.dir) @@ -483,6 +488,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() + background_process_handler.timer.cancel() raise e From c8faff1321297dabba85dee020f95b9aee61ab09 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 17 Nov 2016 10:46:59 -0500 Subject: [PATCH 26/71] raw_python_script: Fixing bug with background process handler --- egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 14cd0ccf8ed..67c7f04812a 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -410,7 +410,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() - background_process_handler.timer.cancel() + background_process_handler.stop() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index c38551cae4d..005c5368f2b 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -406,7 +406,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() - background_process_handler.timer.cancel() + background_process_handler.stop() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 67ca4d072f5..4f24c3abfd9 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -486,7 +486,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() - background_process_handler.timer.cancel() + background_process_handler.stop() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index ce428e86fb5..af57cea5c35 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -488,7 +488,7 @@ def main(): "died due to an error.".format(dir=args.dir)) common_lib.send_mail(message, message, args.email) traceback.print_exc() - background_process_handler.timer.cancel() + background_process_handler.stop() raise e From 6d953e9d8ed9024ba08fc9a69ca4af520e17e359 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 17 Nov 2016 16:12:16 -0500 Subject: [PATCH 27/71] raw_python_script: Fixing bugs based on testing --- egs/wsj/s5/steps/libs/__init__.py | 2 ++ egs/wsj/s5/steps/libs/common.py | 5 ++-- egs/wsj/s5/steps/libs/nnet3/train/__init__.py | 2 ++ 
.../libs/nnet3/train/chain_objf/__init__.py | 2 ++ .../nnet3/train/frame_level_objf/__init__.py | 4 ++++ .../train/frame_level_objf/acoustic_model.py | 23 +++++++++++++++++++ .../nnet3/train/frame_level_objf/common.py | 19 --------------- egs/wsj/s5/steps/nnet3/train_dnn.py | 4 ++-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 4 ++-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 4 ++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 ++-- 11 files changed, 44 insertions(+), 29 deletions(-) diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py index 2a472386568..013c95d0b3f 100644 --- a/egs/wsj/s5/steps/libs/__init__.py +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -6,4 +6,6 @@ """ This package contains modules and subpackages used in kaldi scripts. """ +import common + __all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 58c12c46216..b04b50771f0 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -214,12 +214,13 @@ def __run(): def start(self): if not self.__is_running: - self.__timer = Timer(self.__polling_time, self.__run()) + self.__timer = threading.Timer(self.__polling_time, self.__run()) self.__timer.start() self.__is_running = True def stop(self): - self.__timer.cancel() + if self.__timer is not None: + self.__timer.cancel() self.__is_running = False def poll(self): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py index 57883b372fd..ada7230865b 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/__init__.py @@ -10,4 +10,6 @@ chain_objf -- LF-MMI objective training """ +import common + __all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py index b2010518d2a..f6cb292e829 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/__init__.py @@ -7,4 +7,6 @@ deep neural network acoustic model with chain objective. """ +import acoustic_model + __all__ = ["acoustic_model"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py index d5148f3c396..366582de7af 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/__init__.py @@ -7,4 +7,8 @@ neural networks with frame-level objectives. """ +import common +import raw_model +import acoustic_model + __all__ = ["common", "raw_model", "acoustic_model"] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 3ce8c8033fc..fd9fa5a8e87 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -11,6 +11,8 @@ import logging import libs.common as common_lib +import libs.nnet3.train.common as common_train_lib + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -67,3 +69,24 @@ def generate_egs(data, alidir, egs_dir, frames_per_eg=frames_per_eg, srand=srand, data=data, alidir=alidir, egs_dir=egs_dir, egs_opts=egs_opts if egs_opts is not None else '')) + + +def prepare_initial_acoustic_model(dir, alidir, run_opts, + srand=-3): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. 
It will also prepare the acoustic model + with the transition model.""" + + common_train_lib.prepare_initial_network(dir, run_opts, + srand=srand) + + # Convert to .mdl, train the transitions, set the priors. + common_lib.run_kaldi_command( + """{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - \ + "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command=run_opts.command, + dir=dir, alidir=alidir)) + + diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index d19fd61670f..01e3633676e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -375,25 +375,6 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def prepare_initial_acoustic_model(dir, alidir, run_opts, - srand=-3): - """ Adds the first layer; this will also add in the lda.mat and - presoftmax_prior_scale.vec. It will also prepare the acoustic model - with the transition model.""" - - common_train_lib.prepare_initial_network(dir, run_opts, - srand=srand) - - # Convert to .mdl, train the transitions, set the priors. - common_lib.run_kaldi_command( - """{command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ - nnet3-am-train-transitions - \ - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl - """.format(command=run_opts.command, - dir=dir, alidir=alidir)) - - def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, right_context, run_opts, mb_size=256, wait=False, background_process_handler=None, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 67c7f04812a..726042d373b 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -47,7 +47,7 @@ def get_args(): cross-entropy objective. 
DNNs include simple DNNs, TDNNs and CNNs.""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser.parser]) + parents=[common_train_lib.CommonParser().parser]) # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', @@ -109,7 +109,7 @@ def process_args(args): args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = common_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: if not common_lib.check_if_cuda_compiled(): logger.warning( diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 005c5368f2b..5f8a1889d2d 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -46,7 +46,7 @@ def get_args(): DNNs include simple DNNs, TDNNs and CNNs.""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser.parser]) + parents=[common_train_lib.CommonParser().parser]) # egs extraction options parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', @@ -108,7 +108,7 @@ def process_args(args): "make_configs.py script") # set the options corresponding to args.use_gpu - run_opts = common_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: if not common_lib.check_if_cuda_compiled(): logger.warning( diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 4f24c3abfd9..041f5f48a89 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -54,7 +54,7 @@ def get_args(): 3. RNNs can also be trained with state preservation training""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser.parser]) + parents=[common_train_lib.CommonParser().parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -167,7 +167,7 @@ def process_args(args): "make_configs.py script") # set the options corresponding to args.use_gpu - run_opts = common_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: if not common_lib.check_if_cuda_compiled(): logger.warning( diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index af57cea5c35..314f98958f1 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -55,7 +55,7 @@ def get_args(): 3. 
RNNs can also be trained with state preservation training""", formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve', - parents=[common_train_lib.CommonParser.parser]) + parents=[common_train_lib.CommonParser().parser]) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -166,7 +166,7 @@ def process_args(args): args.transform_dir = args.ali_dir # set the options corresponding to args.use_gpu - run_opts = common_lib.RunOpts() + run_opts = common_train_lib.RunOpts() if args.use_gpu: if not common_lib.check_if_cuda_compiled(): logger.warning( From c785b44b64775f1f5206e1afe7cd1de07d5cecf4 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 18 Nov 2016 22:06:33 -0500 Subject: [PATCH 28/71] raw_python_script: Removing linked list --- egs/wsj/s5/steps/libs/common.py | 173 ++++++------------ .../nnet3/train/chain_objf/acoustic_model.py | 8 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 8 +- .../train/frame_level_objf/acoustic_model.py | 8 +- .../nnet3/train/frame_level_objf/common.py | 8 +- .../nnet3/train/frame_level_objf/raw_model.py | 8 +- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 2 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 6 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 4 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 4 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 +- 11 files changed, 67 insertions(+), 166 deletions(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index b04b50771f0..e93abc45323 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -16,13 +16,7 @@ import threading logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) +logger.addHandler(logging.NullHandler()) def send_mail(message, subject, email_id): @@ -88,103 +82,6 @@ def __init__(self, command, err): "{0}\n{1}\n{2}".format(command, "-"*10, err)) -class ListNode(): - """ A structure to store a node in a doubly linked-list - - Attributes: - data: Any object that is to be stored - next_node: A reference to the next object - previous_node: A reference to the previous object - """ - - def __init__(self, data=None, next_node=None, previous_node=None): - self.data = data - self.next_node = next_node - self.previous_node = previous_node - - -class LinkedListIterator(): - - def __init__(self, node): - self.__current = node - - def __iter__(self): - return self - - def next(self): - if self.__current is None: - raise StopIteration() - - node = self.__current - self.__current = self.__current.next_node - - return node - - -class LinkedList(): - - def __init__(self): - self.__head = None - self.__tail = None - self.__size = 0 - - def __iter__(self): - return LinkedListIterator(self.__head) - - def push(self, node): - """Pushes the node at the "front" of the linked list - """ - if self.__head == None: - self.__head = node - return - node.next_node = self.__head - node.previous_node = None - self.__head.previous_node = node - self.__head = node - self.__size += 1 - - def pop(self): - """Pops the last node out of the list""" - - if self.__tail is None: - return None - - old_last_node = self.__tail - to_be_last = self.__tail.previous_node - to_be_last.next_node = None - old_last_node.previous_node = None - - # Set the last node to the 
"to_be_last" - self.__tail = to_be_last - self.__size -= 1 - - return old_last_node - - def remove(self, node): - """Removes and returns node, and connects the previous and next - nicely - """ - next_node = node.next_node - previous_node = node.previous_node - - if previous_node is not None: - previous_node.next_node = next_node - - if next_node is not None: - next_node.previous_node = previous_node - - # Make it "free" - node.next_node = node.previous_node = None - self.__size -= 1 - - return node - - def size(): - return self.__size - - def is_not_empty(): - return self.__size != 0 - class BackgroundProcessHandler(): """ This class handles background processes to ensure that a top-level script waits until all the processes end before exiting @@ -198,41 +95,70 @@ class BackgroundProcessHandler(): Attributes: __process_queue: Stores a list of process handles and command tuples - + __polling_time: The time after which the processes are polled + __timer: Internal timer object + __is_running: Stores whether a timer is running """ def __init__(self, polling_time=600): - self.__process_queue = LinkedList() + self.__process_queue = [] self.__polling_time = polling_time self.__timer = None + self.__lock = threading.Lock() self.__is_running = False - def __run(): + def __run(self): + """ Internal function to run a poll. Calls poll(). """ + assert(self.__is_running) self.__is_running = False + logger.debug("Polling...") if self.poll(): + # If there are any more background processes running, + # start a new timer self.start() def start(self): + """ Start the background process handler. + + Repeatedly calls itself through the __run() method every + __polling_time seconds. + """ if not self.__is_running: - self.__timer = threading.Timer(self.__polling_time, self.__run()) - self.__timer.start() + self.__timer = threading.Timer(self.__polling_time, self.__run) + logger.debug("Starting new timer...") self.__is_running = True + self.__timer.start() def stop(self): + """ Stop the background process handler by cancelling any running timer. + """ if self.__timer is not None: self.__timer.cancel() self.__is_running = False def poll(self): - for n in self.__process_queue: - if self.is_process_done(n.data): - self.ensure_process_is_done(n) - return self.__process_queue.is_not_empty() + """ Poll background processes and check their statuses. + + Returns True if any processes are still in the queue. + """ + with self.__lock: + remaining_processes = [] + for t in self.__process_queue: + if self.is_process_done(t): + self.ensure_process_is_done(t) + else: + remaining_processes.append(t) + self.__process_queue = remaining_processes + num_processes = len(self.__process_queue) + logger.debug("Number of processes remaining is {0}...".format( + num_processes)) + return (num_processes > 0) def add_process(self, t): - """ Add a (process handle, command) tuple to the queue + """ Add a (process handle, command) tuple to the queue. 
""" - self.__process_queue.push(ListNode(data=t)) + with self.__lock: + self.__process_queue.append(t) self.start() def is_process_done(self, t): @@ -241,22 +167,27 @@ def is_process_done(self, t): return False return True - def ensure_process_is_done(self, n): - p, command = n.data - logger.info("Waiting for process '{0}' to end".format(command)) + def ensure_process_is_done(self, t): + p, command = t + logger.debug("Waiting for process '{0}' to end".format(command)) [stdout, stderr] = p.communicate() if p.returncode is not 0: raise KaldiCommandException(command, stderr) - self.__process_queue.remove(n) def ensure_processes_are_done(self): - for n in self.__process_queue: - self.ensure_process_is_done(n) + self.__process_queue.reverse() + while len(self.__process_queue) > 0: + t = self.__process_queue.pop() + self.ensure_process_is_done(t) self.stop() def __del__(self): self.stop() + def debug(self): + for p, command in self.__process_queue: + logger.info("Process '{0}' is running".format(command)) + def run_kaldi_command(command, wait=True, background_process_handler=None): """ Runs commands frequently seen in Kaldi scripts. These are usually a diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 0d50e822ffa..2d2f41dc739 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -17,13 +17,7 @@ import libs.nnet3.train.common as common_train_lib logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) +logger.addHandler(NullHandler()) def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index e960e6bf2b8..7f752192611 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -19,13 +19,7 @@ import libs.common as common_lib logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) +logger.addHandler(logging.NullHandler()) class RunOpts: diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index fd9fa5a8e87..7d58c18f040 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -15,13 +15,7 @@ logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) +logger.addHandler(NullHandler()) def generate_egs(data, alidir, egs_dir, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 47601f040b5..165f0fa040e 
100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -19,13 +19,7 @@ import libs.nnet3.train.common as common_train_lib logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) +logger.addHandler(NullHandler()) def train_new_models(dir, iter, srand, num_jobs, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index aa74520fc55..1977999f90f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -14,13 +14,7 @@ import libs.common as common_lib logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) +logger.addHandler(NullHandler()) def generate_egs_using_targets(data, targets_scp, egs_dir, diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 26bf87ee0e8..22b5fa975dd 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -75,7 +75,7 @@ def GetArgs(): help="options to be supplied to NaturalGradientAffineComponent", default="") # Gradient clipper options - parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + parser.add_argument("--norm-based-clipping", type=str, action=common_lib.StrToBoolAction, help="Outdated option retained for back compatibility, has no effect.", default=True, choices = ["false", "true"]) parser.add_argument("--clipping-threshold", type=float, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 726042d373b..8ed528c5756 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -21,7 +21,7 @@ import libs.nnet3.report.log_parse as nnet3_log_parse -logger = logging.getLogger(__name__) +logger = logging.getLogger('libs') logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) @@ -347,7 +347,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") train_lib.common.combine_models( - dir=args.dir, num_iter=num_iters, + dir=args.dir, num_iters=num_iters, models_to_combine=models_to_combine, egs_dir=egs_dir, left_context=left_context, right_context=right_context, @@ -380,7 +380,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs = False common_train_lib.clean_nnet_dir( - dir=args.dir, num_iters=num_iters, egs_dir=egs_dir, + nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 5f8a1889d2d..d301ed4f630 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -21,7 +21,7 @@ import 
libs.nnet3.report.log_parse as nnet3_log_parse -logger = logging.getLogger(__name__) +logger = logging.getLogger('libs') logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) @@ -375,7 +375,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs = False common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs, get_raw_nnet_from_am=False) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 004f443d6eb..a7ed6d8ab6b 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -21,7 +21,7 @@ import libs.nnet3.train.frame_level_objf as train_lib import libs.nnet3.report.log_parse as nnet3_log_parse -logger = logging.getLogger(__name__) +logger = logging.getLogger('libs') logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) @@ -459,7 +459,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs = False common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs, get_raw_nnet_from_am=False) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index b8442a3afc0..65a8bed2bf6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -21,7 +21,7 @@ import libs.nnet3.report.log_parse as nnet3_log_parse -logger = logging.getLogger(__name__) +logger = logging.getLogger('libs') logger.setLevel(logging.INFO) handler = logging.StreamHandler() handler.setLevel(logging.INFO) @@ -461,7 +461,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): remove_egs = False common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + nnet_dir=args.dir, num_iters=num_iters, egs_dir=egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) From 750d2e88935fad3e3fcd7dc1b734a4d7199c003f Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 20 Nov 2016 16:53:58 -0500 Subject: [PATCH 29/71] raw_python_script: Fixed some minor bugs --- .../libs/nnet3/train/chain_objf/acoustic_model.py | 11 ++++------- .../train/frame_level_objf/acoustic_model.py | 2 +- .../libs/nnet3/train/frame_level_objf/common.py | 15 ++++++--------- .../nnet3/train/frame_level_objf/raw_model.py | 2 +- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 2d2f41dc739..fbebafe6920 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -17,7 +17,7 @@ import libs.nnet3.train.common as common_train_lib logger = logging.getLogger(__name__) -logger.addHandler(NullHandler()) +logger.addHandler(logging.NullHandler()) def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): @@ -122,8 +122,7 @@ def train_new_models(dir, iter, srand, num_jobs, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts, - background_process_handler=None): + cache_io_opts, run_opts): """ Called from train_one_iteration(), this method trains 
new models with 'num_jobs' jobs, and @@ -194,8 +193,7 @@ def train_new_models(dir, iter, srand, num_jobs, buf_size=shuffle_buffer_size, cache_io_opts=cur_cache_io_opts, num_chunk_per_mb=num_chunk_per_minibatch), - wait=False, - background_process_handler=background_process_handler) + wait=False) processes.append(process_handle) @@ -323,8 +321,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, num_chunk_per_minibatch=cur_num_chunk_per_minibatch, frame_subsampling_factor=frame_subsampling_factor, truncate_deriv_weights=truncate_deriv_weights, - cache_io_opts=cache_io_opts, run_opts=run_opts, - background_process_handler=background_process_handler) + cache_io_opts=cache_io_opts, run_opts=run_opts) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 7d58c18f040..ade21ca2e16 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -logger.addHandler(NullHandler()) +logger.addHandler(logging.NullHandler()) def generate_egs(data, alidir, egs_dir, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 165f0fa040e..f7472cd4f46 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -19,7 +19,7 @@ import libs.nnet3.train.common as common_train_lib logger = logging.getLogger(__name__) -logger.addHandler(NullHandler()) +logger.addHandler(logging.NullHandler()) def train_new_models(dir, iter, srand, num_jobs, @@ -30,8 +30,7 @@ def train_new_models(dir, iter, srand, num_jobs, shuffle_buffer_size, minibatch_size, cache_read_opt, run_opts, frames_per_eg=-1, - min_deriv_time=None, max_deriv_time=None, - background_process_handler=None): + min_deriv_time=None, max_deriv_time=None): """ Called from train_one_iteration(), this model does one iteration of training with 'num_jobs' jobs, and writes files like exp/tdnn_a/24.{1,2,3,..}.raw @@ -116,8 +115,7 @@ def train_new_models(dir, iter, srand, num_jobs, raw_model=raw_model_string, context_opts=context_opts, egs_dir=egs_dir, archive_index=archive_index, shuffle_buffer_size=shuffle_buffer_size, - minibatch_size=minibatch_size), wait=False, - background_process_handler=background_process_handler) + minibatch_size=minibatch_size), wait=False) processes.append(process_handle) @@ -279,8 +277,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, cache_read_opt=cache_read_opt, run_opts=run_opts, frames_per_eg=frames_per_eg, min_deriv_time=min_deriv_time, - max_deriv_time=max_deriv_time, - background_process_handler=background_process_handler) + max_deriv_time=max_deriv_time) [models_to_average, best_model] = common_train_lib.get_successful_models( num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) @@ -437,8 +434,8 @@ def compute_progress(dir, iter, egs_dir, left_context, right_context, common_lib.run_kaldi_command( """{command} {dir}/log/progress.{iter}.log \ - nnet3-info {model} '&&' \ - nnet3-show-progress --use-gpu=no {prev_model} {model} \ + nnet3-info "{model}" '&&' \ + nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ 
ark:{egs_dir}/train_diagnostic.egs ark:- | \ nnet3-merge-egs --minibatch-size={mb_size} ark:- \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 1977999f90f..9ea3a07e05c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -14,7 +14,7 @@ import libs.common as common_lib logger = logging.getLogger(__name__) -logger.addHandler(NullHandler()) +logger.addHandler(logging.NullHandler()) def generate_egs_using_targets(data, targets_scp, egs_dir, From 1c02d5a70fcc06c0f13e09324efcb0c92f30b0c9 Mon Sep 17 00:00:00 2001 From: Yenda Trmal Date: Mon, 21 Nov 2016 12:59:35 -0500 Subject: [PATCH 30/71] changes for GALE mandarin setup --- .../s5/local/split_wer_per_corpus.sh | 51 +++++++ ...etails.jtrmal1@jhu.edu.2016-11-21-11-32-32 | 97 ++++++++++++++ egs/gale_mandarin/s5/conf/decode.config | 0 egs/gale_mandarin/s5/local/bad_utts | 12 ++ .../s5/local/gale_data_prep_audio.sh | 79 +++++++---- .../s5/local/gale_data_prep_split.sh | 33 +++-- .../s5/local/gale_data_prep_txt.sh | 74 +++++++---- .../s5/local/gale_format_data.sh | 29 ++-- egs/gale_mandarin/s5/local/gale_prep_dict.sh | 125 +++++++++--------- egs/gale_mandarin/s5/local/gale_train_lms.sh | 24 ++-- .../s5/local/split_wer_per_corpus.sh | 61 +++++++++ egs/gale_mandarin/s5/local/test.LDC2013S04 | 20 +++ egs/gale_mandarin/s5/local/test.LDC2013S08 | 20 +++ egs/gale_mandarin/s5/local/test.LDC2014S09 | 20 +++ egs/gale_mandarin/s5/local/test.LDC2015S06 | 14 ++ egs/gale_mandarin/s5/local/test.LDC2015S13 | 20 +++ egs/gale_mandarin/s5/local/test.LDC2016S03 | 20 +++ egs/wsj/s5/utils/pinyin_map.pl | 20 +-- 18 files changed, 546 insertions(+), 173 deletions(-) create mode 100755 egs/gale_arabic/s5/local/split_wer_per_corpus.sh create mode 100644 egs/gale_mandarin/s5/RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 create mode 100644 egs/gale_mandarin/s5/conf/decode.config create mode 100644 egs/gale_mandarin/s5/local/bad_utts create mode 100755 egs/gale_mandarin/s5/local/split_wer_per_corpus.sh create mode 100644 egs/gale_mandarin/s5/local/test.LDC2013S04 create mode 100644 egs/gale_mandarin/s5/local/test.LDC2013S08 create mode 100644 egs/gale_mandarin/s5/local/test.LDC2014S09 create mode 100644 egs/gale_mandarin/s5/local/test.LDC2015S06 create mode 100644 egs/gale_mandarin/s5/local/test.LDC2015S13 create mode 100644 egs/gale_mandarin/s5/local/test.LDC2016S03 diff --git a/egs/gale_arabic/s5/local/split_wer_per_corpus.sh b/egs/gale_arabic/s5/local/split_wer_per_corpus.sh new file mode 100755 index 00000000000..71c8adcc3fe --- /dev/null +++ b/egs/gale_arabic/s5/local/split_wer_per_corpus.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . 
./path.sh + +#set -o pipefail -e + +galeFolder=$(readlink -f $1) +symtab=./data/lang/words.txt + +min_lmwt=7 +max_lmwt=20 + +for dir in exp/*/*decode*; do + for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.tra $dir/scoring/test_filt.txt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT +done +done + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "#RESULTS splits generated by $USER at $time" + +for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + + + + diff --git a/egs/gale_mandarin/s5/RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 b/egs/gale_mandarin/s5/RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 new file mode 100644 index 00000000000..47974d88975 --- /dev/null +++ b/egs/gale_mandarin/s5/RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 @@ -0,0 +1,97 @@ +#RESULTS splits generated by jtrmal1@jhu.edu at 2016-11-21-12-05-54 + +# WER test.LDC2013S04 +%WER 42.23 [ 40179 / 95137, 5329 ins, 8769 del, 26081 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S04_10 +%WER 43.81 [ 41682 / 95137, 5469 ins, 9213 del, 27000 sub ] exp/tri3b/decode/wer_test.LDC2013S04_13 +%WER 49.06 [ 46677 / 95137, 5459 ins, 10672 del, 30546 sub ] exp/tri2b/decode/wer_test.LDC2013S04_13 +%WER 50.53 [ 48073 / 95137, 5505 ins, 11022 del, 31546 sub ] exp/tri3b/decode.si/wer_test.LDC2013S04_12 +%WER 51.47 [ 48971 / 95137, 5103 ins, 12391 del, 31477 sub ] exp/tri2a/decode/wer_test.LDC2013S04_13 +%WER 53.30 [ 50708 / 95137, 4829 ins, 13624 del, 32255 sub ] exp/tri1/decode/wer_test.LDC2013S04_13 + +# WER test.LDC2013S08 +%WER 26.01 [ 20781 / 79911, 3764 ins, 3034 del, 13983 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S08_8 +%WER 27.43 [ 21917 / 79911, 3644 ins, 3544 del, 14729 sub ] exp/tri3b/decode/wer_test.LDC2013S08_13 +%WER 31.24 [ 24968 / 79911, 3820 ins, 3943 del, 17205 sub ] exp/tri2b/decode/wer_test.LDC2013S08_12 +%WER 32.45 [ 25932 / 79911, 3816 ins, 4112 del, 18004 sub ] exp/tri3b/decode.si/wer_test.LDC2013S08_11 +%WER 34.22 [ 27349 / 79911, 3677 ins, 5034 del, 18638 sub ] exp/tri2a/decode/wer_test.LDC2013S08_13 +%WER 35.88 [ 28676 / 79911, 3715 ins, 5127 del, 19834 sub ] exp/tri1/decode/wer_test.LDC2013S08_12 + +# WER test.LDC2014S09 +%WER 50.54 [ 39383 / 77932, 10535 ins, 7593 del, 21255 sub ] exp/sgmm_5a/decode/wer_test.LDC2014S09_12 +%WER 52.14 [ 40634 / 77932, 10271 ins, 8530 del, 21833 sub ] exp/tri3b/decode/wer_test.LDC2014S09_17 +%WER 56.57 [ 44085 / 77932, 9394 ins, 10954 del, 23737 sub ] exp/tri2b/decode/wer_test.LDC2014S09_16 +%WER 57.95 [ 45158 / 77932, 8777 ins, 12547 del, 23834 sub ] exp/tri2a/decode/wer_test.LDC2014S09_15 +%WER 58.19 [ 45347 / 77932, 9712 ins, 10831 del, 24804 sub ] exp/tri3b/decode.si/wer_test.LDC2014S09_15 +%WER 59.38 [ 46277 / 77932, 7944 ins, 14560 del, 23773 sub ] exp/tri1/decode/wer_test.LDC2014S09_16 + +# WER test.LDC2015S06 +%WER 46.22 [ 28480 / 61612, 8454 ins, 5015 del, 15011 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S06_9 +%WER 48.08 [ 29624 / 61612, 8471 ins, 5669 del, 
15484 sub ] exp/tri3b/decode/wer_test.LDC2015S06_13 +%WER 52.67 [ 32450 / 61612, 8425 ins, 6441 del, 17584 sub ] exp/tri2b/decode/wer_test.LDC2015S06_12 +%WER 53.51 [ 32968 / 61612, 8444 ins, 6576 del, 17948 sub ] exp/tri3b/decode.si/wer_test.LDC2015S06_11 +%WER 55.08 [ 33936 / 61612, 8031 ins, 7811 del, 18094 sub ] exp/tri2a/decode/wer_test.LDC2015S06_13 +%WER 56.70 [ 34937 / 61612, 7890 ins, 8531 del, 18516 sub ] exp/tri1/decode/wer_test.LDC2015S06_13 + +# WER test.LDC2015S13 +%WER 23.35 [ 19752 / 84594, 2196 ins, 3274 del, 14282 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S13_9 +%WER 24.81 [ 20984 / 84594, 2214 ins, 3600 del, 15170 sub ] exp/tri3b/decode/wer_test.LDC2015S13_12 +%WER 28.62 [ 24211 / 84594, 2306 ins, 4186 del, 17719 sub ] exp/tri2b/decode/wer_test.LDC2015S13_12 +%WER 30.03 [ 25405 / 84594, 2106 ins, 4617 del, 18682 sub ] exp/tri3b/decode.si/wer_test.LDC2015S13_12 +%WER 30.58 [ 25869 / 84594, 2142 ins, 4798 del, 18929 sub ] exp/tri2a/decode/wer_test.LDC2015S13_12 +%WER 32.16 [ 27206 / 84594, 1958 ins, 5681 del, 19567 sub ] exp/tri1/decode/wer_test.LDC2015S13_13 + +# WER test.LDC2016S03 +%WER 53.04 [ 77015 / 145212, 34385 ins, 9733 del, 32897 sub ] exp/sgmm_5a/decode/wer_test.LDC2016S03_12 +%WER 54.68 [ 79399 / 145212, 34634 ins, 10414 del, 34351 sub ] exp/tri3b/decode/wer_test.LDC2016S03_17 +%WER 58.99 [ 85661 / 145212, 33946 ins, 12904 del, 38811 sub ] exp/tri2b/decode/wer_test.LDC2016S03_16 +%WER 59.80 [ 86841 / 145212, 34387 ins, 12610 del, 39844 sub ] exp/tri3b/decode.si/wer_test.LDC2016S03_15 +%WER 60.29 [ 87547 / 145212, 31358 ins, 15266 del, 40923 sub ] exp/tri2a/decode/wer_test.LDC2016S03_16 +%WER 61.75 [ 89662 / 145212, 30628 ins, 16992 del, 42042 sub ] exp/tri1/decode/wer_test.LDC2016S03_16 + +# CER test.LDC2013S04 +%WER 33.93 [ 51673 / 152279, 7241 ins, 12180 del, 32252 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S04_10 +%WER 35.31 [ 53769 / 152279, 7813 ins, 11593 del, 34363 sub ] exp/tri3b/decode/cer_test.LDC2013S04_11 +%WER 40.56 [ 61767 / 152279, 8062 ins, 13321 del, 40384 sub ] exp/tri2b/decode/cer_test.LDC2013S04_11 +%WER 42.08 [ 64081 / 152279, 8052 ins, 13940 del, 42089 sub ] exp/tri3b/decode.si/cer_test.LDC2013S04_10 +%WER 43.22 [ 65818 / 152279, 7602 ins, 15416 del, 42800 sub ] exp/tri2a/decode/cer_test.LDC2013S04_11 +%WER 44.93 [ 68413 / 152279, 7255 ins, 16855 del, 44303 sub ] exp/tri1/decode/cer_test.LDC2013S04_11 + +# CER test.LDC2013S08 +%WER 19.18 [ 25398 / 132434, 4773 ins, 3650 del, 16975 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S08_8 +%WER 20.54 [ 27201 / 132434, 4792 ins, 4037 del, 18372 sub ] exp/tri3b/decode/cer_test.LDC2013S08_11 +%WER 24.12 [ 31943 / 132434, 4817 ins, 4968 del, 22158 sub ] exp/tri2b/decode/cer_test.LDC2013S08_12 +%WER 25.15 [ 33309 / 132434, 4839 ins, 5019 del, 23451 sub ] exp/tri3b/decode.si/cer_test.LDC2013S08_11 +%WER 26.90 [ 35623 / 132434, 4725 ins, 6057 del, 24841 sub ] exp/tri2a/decode/cer_test.LDC2013S08_12 +%WER 28.45 [ 37674 / 132434, 4506 ins, 6690 del, 26478 sub ] exp/tri1/decode/cer_test.LDC2013S08_12 + +# CER test.LDC2014S09 +%WER 42.24 [ 53240 / 126027, 16007 ins, 10270 del, 26963 sub ] exp/sgmm_5a/decode/cer_test.LDC2014S09_11 +%WER 43.81 [ 55212 / 126027, 15435 ins, 11971 del, 27806 sub ] exp/tri3b/decode/cer_test.LDC2014S09_15 +%WER 48.72 [ 61395 / 126027, 14667 ins, 15066 del, 31662 sub ] exp/tri2b/decode/cer_test.LDC2014S09_14 +%WER 50.20 [ 63270 / 126027, 15105 ins, 14701 del, 33464 sub ] exp/tri3b/decode.si/cer_test.LDC2014S09_13 +%WER 50.37 [ 63481 / 126027, 13343 ins, 18289 del, 31849 sub ] 
exp/tri2a/decode/cer_test.LDC2014S09_14 +%WER 51.95 [ 65470 / 126027, 12613 ins, 20231 del, 32626 sub ] exp/tri1/decode/cer_test.LDC2014S09_14 + +# CER test.LDC2015S06 +%WER 38.57 [ 38234 / 99132, 12510 ins, 7120 del, 18604 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S06_9 +%WER 40.30 [ 39954 / 99132, 12593 ins, 7986 del, 19375 sub ] exp/tri3b/decode/cer_test.LDC2015S06_12 +%WER 44.83 [ 44438 / 99132, 12639 ins, 8903 del, 22896 sub ] exp/tri2b/decode/cer_test.LDC2015S06_11 +%WER 45.71 [ 45318 / 99132, 12631 ins, 9164 del, 23523 sub ] exp/tri3b/decode.si/cer_test.LDC2015S06_10 +%WER 47.39 [ 46983 / 99132, 12432 ins, 9935 del, 24616 sub ] exp/tri2a/decode/cer_test.LDC2015S06_11 +%WER 49.03 [ 48600 / 99132, 12250 ins, 10831 del, 25519 sub ] exp/tri1/decode/cer_test.LDC2015S06_11 + +# CER test.LDC2015S13 +%WER 17.05 [ 23993 / 140702, 2450 ins, 3594 del, 17949 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S13_8 +%WER 18.39 [ 25872 / 140702, 2257 ins, 4274 del, 19341 sub ] exp/tri3b/decode/cer_test.LDC2015S13_11 +%WER 21.98 [ 30933 / 140702, 2347 ins, 4784 del, 23802 sub ] exp/tri2b/decode/cer_test.LDC2015S13_11 +%WER 23.23 [ 32679 / 140702, 2197 ins, 5383 del, 25099 sub ] exp/tri3b/decode.si/cer_test.LDC2015S13_11 +%WER 23.88 [ 33596 / 140702, 2030 ins, 6225 del, 25341 sub ] exp/tri2a/decode/cer_test.LDC2015S13_12 +%WER 25.47 [ 35842 / 140702, 1944 ins, 6979 del, 26919 sub ] exp/tri1/decode/cer_test.LDC2015S13_12 + +# CER test.LDC2016S03 +%WER 45.40 [ 106787 / 235216, 53964 ins, 12519 del, 40304 sub ] exp/sgmm_5a/decode/cer_test.LDC2016S03_11 +%WER 46.75 [ 109953 / 235216, 54007 ins, 13639 del, 42307 sub ] exp/tri3b/decode/cer_test.LDC2016S03_15 +%WER 51.08 [ 120139 / 235216, 53593 ins, 16514 del, 50032 sub ] exp/tri2b/decode/cer_test.LDC2016S03_14 +%WER 51.97 [ 122235 / 235216, 52763 ins, 17940 del, 51532 sub ] exp/tri3b/decode.si/cer_test.LDC2016S03_15 +%WER 52.61 [ 123739 / 235216, 47836 ins, 22637 del, 53266 sub ] exp/tri2a/decode/cer_test.LDC2016S03_16 +%WER 54.06 [ 127163 / 235216, 47776 ins, 23865 del, 55522 sub ] exp/tri1/decode/cer_test.LDC2016S03_15 diff --git a/egs/gale_mandarin/s5/conf/decode.config b/egs/gale_mandarin/s5/conf/decode.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/gale_mandarin/s5/local/bad_utts b/egs/gale_mandarin/s5/local/bad_utts new file mode 100644 index 00000000000..6683c9a97a5 --- /dev/null +++ b/egs/gale_mandarin/s5/local/bad_utts @@ -0,0 +1,12 @@ +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070308_040701 +CCTV2_ECONOMYANDLAW_CMN_20070426_202800 +CCTV2_ECONOMYANDLAW_CMN_20070426_202800(1) +CCTV2_LIANGHUI_PROBLEM_20070308_213000 +CCTV4_TDYFOCUS_CMN_20070824_092801 +VOA_ISSUESANDOPINIONS_CMN_20070801_210500 +VOA_ISSUESANDOPINIONS_CMN_20070926_210500 +VOA_LISTENERSHOTLINE_CMN_20070906_223000 +VOA_LISTENERSHOTLINE_CMN_20070926_223000 +VOA_LISTENERSHOTLINE_CMN_20070927_223000 +PHOENIX_NEWSLINE_CMN_20070101_114800 +PHOENIX_NEWSLINE_CMN_20070101_114800(1) diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh b/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh index c2d61cfb864..598c6b37c17 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh @@ -1,46 +1,69 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 QCRI (author: Ahmed Ali) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 -if [ $# -ne 2 ]; then - echo "Arguments should be the "; exit 1 -fi +echo $0 "$@" + +galeData=$(readlink -f "${@: -1}" ); +wavedir=$galeData/wav +mkdir -p $wavedir + -# 
check that sox is installed +length=$(($#-1)) +args=${@:1:$length} +# check that sox is installed which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed" - exit 1 +if [[ $? != 0 ]]; then + echo "$0: sox is not installed" + exit 1 fi -galeData=$1 -wavedir=$galeData/wav -mkdir -p $wavedir +set -e -o pipefail + +for var in $args; do + CD=$(basename $var) + [ -d $wavedir/$CD ] && rm -rf $wavedir/$CD + mkdir -p $wavedir/$CD + find $var -type f -name *.wav | while read file; do + f=$(basename $file) + if [[ ! -L "$wavedir/$CD/$f" ]]; then + ln -sf $file $wavedir/$CD/$f + fi + done -audio_path=$2 - -mkdir -p $wavedir/ - -#copy and convert the flac to wav -find $audio_path -type f -name *.flac | while read file; do - f_name=$(basename $file) - if [[ ! -e $wavedir/"${f_name%.flac}.wav" ]]; then - echo "soxing $file to $wavedir/$CD/"${f_name%.flac}.wav" " - sox $file $wavedir/"${f_name%.flac}.wav" - fi - + #make an flac symmlink as well + find $var -type f -name *.flac | while read file; do + f=$(basename $file) + + if [[ ! -L "$wavedir/$CD/$f" ]]; then + ln -sf $file $wavedir/$CD/$f + fi + done done -find $wavedir -name *.wav > $galeData/wav$$ -awk -F "/" '{print $NF}' $galeData/wav$$ | sed 's:\.wav::' > $galeData/id$$ -paste -d ' ' $galeData/id$$ $galeData/wav$$ | sort -u > $galeData/wav.scp +#figure out the proper sox command line +#the flac will be converted on the fly +( + for w in `find $wavedir -name *.wav` ; do + base=`basename $w .wav` + fullpath=`readlink -f $w` + echo "$base sox $fullpath -r 16000 -t wav - |" + done + + for w in `find $wavedir -name *.flac` ; do + base=`basename $w .flac` + fullpath=`readlink -f $w` + echo "$base sox $fullpath -r 16000 -t wav - |" + done +) | sort -u > $galeData/wav.scp -#clean +#clean rm -fr $galeData/id$$ $galeData/wav$$ -echo data prep audio succeded +echo "$0: data prep audio succeded" exit 0 diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_split.sh b/egs/gale_mandarin/s5/local/gale_data_prep_split.sh index 63b6d8d2f7b..40c29415a1e 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_split.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_split.sh @@ -1,37 +1,33 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 (author: Ahmed Ali, Hainan Xu) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 if [ $# -ne 1 ]; then echo "Arguments should be the "; exit 1 fi +set -e -o pipefail #data will data/local galeData=$(readlink -f $1) mkdir -p data/local dir=$(readlink -f data/local) -cat $galeData/utt2spk | awk '{print$2}' | sort -u > $galeData/spklist - -cat $galeData/spklist | utils/shuffle_list.pl --srand ${seed:-777} > $galeData/spklist.shuffled - -# we want about 6h dev data; 300 is manually chosen -cat $galeData/spklist.shuffled | head -n 300 > $galeData/spklist.dev - - -cat $galeData/utt2spk | grep -f $galeData/spklist.dev | awk '{print$1}' > $galeData/dev.list # some problem with the text data; same utt id but different transcription -cat $galeData/all | awk '{print$2}' | sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list +cat $galeData/all | awk '{print$2}' | \ + sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list -utils/filter_scp.pl --exclude -f 2 $galeData/dup.list $galeData/all > $galeData/all_nodup +utils/filter_scp.pl --exclude -f 2 \ + $galeData/dup.list $galeData/all > $galeData/all.nodup -mv $galeData/all_nodup $galeData/all +mv $galeData/all $galeData/all.orig +mv $galeData/all.nodup $galeData/all -utils/filter_scp.pl -f 2 $galeData/dev.list $galeData/all > 
$galeData/all.dev -utils/filter_scp.pl --exclude -f 2 $galeData/dev.list $galeData/all > $galeData/all.train +grep -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.dev +grep -v -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.train cat $galeData/all.dev | awk '{print$2}' > $galeData/dev_utt_list cat $galeData/all.train | awk '{print$2}' > $galeData/train_utt_list @@ -46,11 +42,11 @@ utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk | sort -u > $dir/train/spk2utt for x in dev train; do outdir=$dir/$x - file=$galeData/all.$x + file=$galeData/all.$x mkdir -p $outdir awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done +done cat $dir/dev/segments | awk '{print$2}' | sort -u > $galeData/dev.wav.list cat $dir/train/segments | awk '{print$2}' | sort -u > $galeData/train.wav.list @@ -60,5 +56,6 @@ utils/filter_scp.pl -f 1 $galeData/train.wav.list $galeData/wav.scp > $dir/train cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - + + echo data prep split succeeded diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh b/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh index 3fe32055f6c..7e3e57c92a8 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh @@ -1,28 +1,38 @@ #!/bin/bash # Copyright 2014 (author: Ahmed Ali, Hainan Xu) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 -if [ $# -ne 2 ]; then - echo "Arguments should be the "; exit 1 -fi - +echo $0 "$@" export LC_ALL=C -galeData=$1 -text=$2 +galeData=$(readlink -f "${@: -1}" ); -cur=`pwd` +length=$(($#-1)) +args=${@:1:$length} -txtdir=$galeData/txt -mkdir -p $galeData/txt +top_pwd=`pwd` +txtdir=$galeData/txt +mkdir -p $txtdir cd $txtdir -find $text -type f -name *.tdf | while read file; do -sed '1,3d' $file -done > all.tmp +for cdx in ${args[@]}; do + echo "Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + tgt=$(basename $cdx) + test -x $tgt || ln -s $cdx `basename $tgt` + else + echo "I don't really know what I shall do with $cdx " >&2 + fi +done +find -L . 
-type f -name *.tdf | while read file; do +sed '1,3d' $file +done > all.tmp perl -e ' ($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0]; @@ -34,22 +44,35 @@ perl -e ' while () { @arr= split /\t/,$_; $arr[4] =~ s/ //g; + $arr[4] = sprintf("%020s", $arr[4]); $spkid = "$arr[0]_$arr[4]"; - $spkfix = sprintf("%060s", $spkid); - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; + $spkfix = sprintf("%080s", $spkid); + + $start=sprintf ("%0.3f",$arr[2]); + $rStart=$start; + $start=~s/\.//; + $start=~s/^0+$/0/; + $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $start = sprintf("%09s", $start); + + $end=sprintf ("%0.3f",$arr[3]); + $rEnd=$end; + $end=~s/^0+([^0])/$1/; + $end=~s/\.//; + $end = sprintf("%09s", $end); + + $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; print TXT "$arr[7]\n"; print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n"; print MAP "$arr[0] ${spkfix}_$arr[0]\n"; - }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp" + }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp" perl -p -i -e 's=/.$==g' contentall.tmp -cd $cur +cd $top_pwd pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` @@ -57,11 +80,11 @@ export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-pa if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then echo "--- Downloading mmseg-1.3.0 ..." echo "NOTE: it assumes that you have Python, Setuptools installed on your system!" - wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz + wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz tar xf tools/mmseg-1.3.0.tar.gz -C tools cd tools/mmseg-1.3.0 mkdir -p lib/python${pyver}/site-packages - python setup.py build + CC=gcc CXX=g++ python setup.py build python setup.py install --prefix=. cd ../.. if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then @@ -90,11 +113,8 @@ awk '{$1="";print $0}' $txtdir/all_1.tmp | sed 's:^ ::' > $txtdir/../all cat $txtdir/utt2spk.tmp | sort -u > $txtdir/../utt2spk cat $txtdir/map.tmp | sort -u > $txtdir/../map -sort -c $txtdir/../utt2spk +sort -c $txtdir/../utt2spk utils/utt2spk_to_spk2utt.pl $txtdir/../utt2spk | sort -u > $txtdir/../spk2utt -cd ..; -rm -fr $txtdir - echo data prep text succeeded diff --git a/egs/gale_mandarin/s5/local/gale_format_data.sh b/egs/gale_mandarin/s5/local/gale_format_data.sh index 71187e89a12..204fa31fd42 100755 --- a/egs/gale_mandarin/s5/local/gale_format_data.sh +++ b/egs/gale_mandarin/s5/local/gale_format_data.sh @@ -8,19 +8,20 @@ if [ -f path.sh ]; then echo "missing path.sh"; exit 1; fi +set -e -o pipefail +set -x + for dir in dev train; do - cp -pr data/local/$dir data/$dir + cp -prT data/local/$dir data/$dir done export LC_ALL=C -mkdir -p data/lang_dev - arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! 
-f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -rm -r data/lang_dev -cp -r data/lang data/lang_dev +rm -r data/lang_test || true +cp -r data/lang data/lang_test gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ @@ -28,31 +29,35 @@ gunzip -c "$arpa_lm" | \ echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_dev/G.fst +fstisstochastic data/lang_test/G.fst || true ## Check lexicon. ## just have a look and make sure it seems sane. echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - +( + fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head +) || true echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_dev/G.fst /dev/null || echo Error determinizing G. +fstdeterminize data/lang_test/G.fst /dev/null || { + echo Error determinizing G. + exit 1 +} # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_dev/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. # Checking that disambiguated lexicon times G is determinizable # Note: we do this with fstdeterminizestar not fstdeterminize, as # fstdeterminize was taking forever (presumbaly relates to a bug # in this version of OpenFst that makes determinization slow for # some case). -fsttablecompose data/lang_dev/L_disambig.fst data/lang_dev/G.fst | \ +fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ fstdeterminizestar >/dev/null || echo Error # Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_dev/G.fst | \ +fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index cd3ed602c70..cb3f1b56cba 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -1,13 +1,14 @@ #!/bin/bash # prepare dictionary for HKUST -# it is done for English and Chinese separately, +# it is done for English and Chinese separately, # For English, we use CMU dictionary, and Sequitur G2P # for OOVs, while all englist phone set will concert to Chinese # phone set at the end. For Chinese, we use an online dictionary, # for OOV, we just produce pronunciation using Charactrt Mapping. - -. path.sh +. 
./path.sh + +set -e -o pipefail [ $# != 0 ] && echo "Usage: local/hkust_prepare_dict.sh" && exit 1; train_dir=data/local/train @@ -23,18 +24,29 @@ esac # extract full vocabulary cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ - sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ - grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt + sed -e 's/ /\n/g' | sort -u | \ + grep -v '\[LAUGHTER\]' | \ + grep -v '\[NOISE\]' |\ + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt # split into English and Chinese cat $dict_dir/vocab-full.txt | grep '[a-zA-Z]' > $dict_dir/vocab-en.txt -cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt +cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' | \ + perl -CSD -Mutf8 -ane '{print if /^\p{InCJK_Unified_Ideographs}+$/;}' > $dict_dir/vocab-ch.txt +cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' | \ + perl -CSD -Mutf8 -ane '{print unless /^\p{InCJK_Unified_Ideographs}+$/;}' > $dict_dir/vocab-weird.txt + -# produce pronunciations for english +# produce pronunciations for english if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." - svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ - $dict_dir/cmudict || exit 1; + svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ $dict_dir/cmudict || \ + wget -e robots=off -r -np -nH --cut-dirs=4 -R index.html http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ -P $dict_dir || exit 1 +fi + +if [ ! -f $dict_dir/cmudict/scripts/make_baseform.pl ] ; then + echo "$0: $dict_dir/cmudict/scripts/make_baseform.pl does not exist!"; + exit fi echo "--- Striping stress and pronunciation variant markers from cmudict ..." @@ -54,23 +66,6 @@ gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ wc -l $dict_dir/vocab-en-oov.txt wc -l $dict_dir/lexicon-en-iv.txt -pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` -export PYTHONPATH=$PYTHONPATH:`pwd`/tools/g2p/lib/python${pyver}/site-packages -if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then - echo "--- Downloading Sequitur G2P ..." - echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!" - wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz - tar xf tools/g2p-r1668.tar.gz -C tools - cd tools/g2p - echo '#include ' >> Utility.hh # won't compile on my system w/o this "patch" - python setup.py build - python setup.py install --prefix=. - cd ../.. - if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then - echo "Sequitur G2P is not found - installation failed?" - exit 1 - fi -fi if [ ! -f conf/g2p_model ]; then echo "--- Downloading a pre-trained Sequitur G2P model ..." @@ -82,8 +77,7 @@ if [ ! -f conf/g2p_model ]; then fi echo "--- Preparing pronunciations for OOV words ..." -python tools/g2p/lib/python${pyver}/site-packages/g2p.py \ - --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt +g2p.py --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ sort > $dict_dir/lexicon-en-phn.txt @@ -91,25 +85,25 @@ cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ -# produce pronunciations for chinese +# produce pronunciations for chinese if [ ! 
-f $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt ]; then - wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz gunzip $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt.gz fi cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ - perl -e ' + perl -e ' while () { @A = split(" ", $_); print $A[1]; for($n = 2; $n < @A; $n++) { - $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; - $tmp = uc($A[$n]); + $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; + $tmp = uc($A[$n]); print " $tmp"; } print "\n"; } - ' | sort -k1 > $dict_dir/ch-dict.txt + ' | sort -k1 > $dict_dir/ch-dict.txt echo "--- Searching for Chinese OOV words ..." gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ @@ -120,22 +114,22 @@ gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $dict_dir/vocab-ch.txt $dict_dir/ch-dict.txt |\ egrep -v '<.?s>' > $dict_dir/lexicon-ch-iv.txt -wc -l $dict_dir/vocab-ch-oov.txt -wc -l $dict_dir/lexicon-ch-iv.txt +wc -l $dict_dir/vocab-ch-oov.txt || true +wc -l $dict_dir/lexicon-ch-iv.txt || true # this unset LC_ALL -# first make sure number of characters and pinyins -# are equal +# first make sure number of characters and pinyins +# are equal cat $dict_dir/ch-dict.txt |\ perl -e ' use encoding utf8; while () { @A = split(" ", $_); $word_len = length($A[0]); - $proun_len = @A - 1 ; + $proun_len = @A - 1 ; if ($word_len == $proun_len) {print $_;} } ' > $dict_dir/ch-dict-1.txt @@ -144,11 +138,12 @@ cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt wc -l $dict_dir/ch-char.txt wc -l $dict_dir/ch-char-pinyin.txt -paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt +paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt + cat $dict_dir/ch-char-dict.txt |\ perl -e ' - my $prev = ""; + my $prev = ""; my $out_line = ""; while () { @A = split(" ", $_); @@ -157,16 +152,16 @@ cat $dict_dir/ch-char-dict.txt |\ #print length($prev); if (length($prev) == 0) { $out_line = $_; chomp($out_line);} if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);} - if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} + if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} $prev = $cur; } - print $out_line; - ' > $dict_dir/ch-char-dict-1.txt + print $out_line; + ' > $dict_dir/ch-char-dict-1.txt cat $dict_dir/vocab-ch-oov.txt | awk -v w=$dict_dir/ch-char-dict-1.txt \ - 'BEGIN{while((getline0) dict[$1]=$2;} + 'BEGIN{while((getline0) dict[$1]=$2;} {printf("%s", $1); for (i=1; i<=length($1); i++) { py=substr($1, i, 1); printf(" %s", dict[py]); } printf("\n"); }' \ - > $dict_dir/lexicon-ch-oov.txt + > $dict_dir/lexicon-ch-oov.txt cat $dict_dir/lexicon-ch-oov.txt |\ perl -e ' @@ -175,8 +170,8 @@ cat $dict_dir/lexicon-ch-oov.txt |\ while () { @A = split(" ", $_); @entry = (); - push(@entry, $A[0]); - for($i = 1; $i < @A; $i++ ) { + push(@entry, $A[0]); + for($i = 1; $i < @A; $i++ ) { @py = split("/", $A[$i]); @entry1 = @entry; @entry = (); @@ -184,29 +179,29 @@ cat $dict_dir/lexicon-ch-oov.txt |\ for ($k = 0; $k < @py; $k++) { $tmp = $entry1[$j]." 
".$py[$k]; push(@entry, $tmp); - } - } + } + } } for ($i = 0; $i < @entry; $i++) { - print $entry[$i]; + print $entry[$i]; print "\n"; - } + } } ' > $dict_dir/lexicon-ch-oov1.txt cat $dict_dir/lexicon-ch-oov1.txt $dict_dir/lexicon-ch-iv.txt |\ - awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt + awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt cat $dict_dir/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch-cmu.txt -cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu cat conf/pinyin2cmu | awk -v cmu=$dict_dir/cmu \ 'BEGIN{while((getline $dict_dir/cmu-used cat $dict_dir/cmu | awk -v cmu=$dict_dir/cmu-used \ 'BEGIN{while((getline $dict_dir/cmu-not-used + {if (!dict[$1]) print $1;}' > $dict_dir/cmu-not-used gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $dict_dir/cmu-not-used conf/cmu2pinyin |\ @@ -229,9 +224,9 @@ cat $dict_dir/cmu-py | \ push(@entry, $W); for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } print "@entry"; - print "\n"; - } -' conf/pinyin2cmu > $dict_dir/cmu-cmu + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/cmu-cmu cat $dict_dir/lexicon-en-phn.txt | \ perl -e ' @@ -248,14 +243,14 @@ cat $dict_dir/lexicon-en-phn.txt | \ @entry = (); $W = shift(@A); push(@entry, $W); - for($i = 0; $i < @A; $i++) { + for($i = 0; $i < @A; $i++) { if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } else {push(@entry, $A[$i])}; } print "@entry"; - print "\n"; + print "\n"; } -' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt +' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt cat $dict_dir/lexicon-en.txt $dict_dir/lexicon-ch-cmu.txt |\ sort -u > $dict_dir/lexicon1.txt @@ -267,8 +262,8 @@ cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{f while () { $phone = $_; chomp($phone); - chomp($_); - $phone =~ s:([A-Z]+)[0-9]:$1:; + chomp($_); + $phone =~ s:([A-Z]+)[0-9]:$1:; if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } else { $ph_cl{$phone} = [$_]; } } @@ -298,7 +293,5 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", export LC_ALL=C +echo "$0: Done" - - -exit 1; diff --git a/egs/gale_mandarin/s5/local/gale_train_lms.sh b/egs/gale_mandarin/s5/local/gale_train_lms.sh index af429ae2af1..b70bf8de564 100755 --- a/egs/gale_mandarin/s5/local/gale_train_lms.sh +++ b/egs/gale_mandarin/s5/local/gale_train_lms.sh @@ -4,13 +4,13 @@ # To be run from one directory above this script. -lexicon=data/local/dict/lexicon.txt +lexicon=data/local/dict/lexicon.txt [ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; # check if sri is installed or no sri_installed=false which ngram-count &>/dev/null -if [[ $? == 0 ]]; then +if [[ $? == 0 ]]; then sri_installed=true fi @@ -23,9 +23,9 @@ fi export LC_ALL=C # You'll get errors about things being not sorted, if you # have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm +export PATH=$PATH:$KALDI_ROOT/tools/kaldi_lm ( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; + cd $KALDI_ROOT/tools || exit 1; if [ -d kaldi_lm ]; then echo Not installing the kaldi_lm toolkit since it is already there. else @@ -45,10 +45,10 @@ dir=data/local/lm mkdir -p $dir text=data/local/train/text [ ! 
-f $text ] && echo "$0: No such file $text" && exit 1; - + cleantext=$dir/text.no_oov - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ > $cleantext || exit 1; @@ -72,20 +72,20 @@ dir=data/local/lm cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ || exit 1; - + train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; # LM is small enough that we don't need to prune it (only about 0.7M N-grams). # Perplexity over 128254.000000 words is 90.446690 # note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz +# data/local/lm/3gram-mincount/lm_unpruned.gz # From here is some commands to do a baseline with SRILM (assuming # you have it installed). -if $sri_installed; then +if $sri_installed; then heldout_sent=10000 # Don't change this if you want result to be comparable with # kaldi_lm results @@ -101,14 +101,14 @@ if $sri_installed; then ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz - ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout + ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout # 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 # Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. # Difference in WSJ must have been due to different treatment of . - ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout + ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout # 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 fi -echo train lm succeeded \ No newline at end of file +echo train lm succeeded diff --git a/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh new file mode 100755 index 00000000000..b4a4de94a6d --- /dev/null +++ b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . 
./path.sh + +set -o pipefail -e + +galeFolder=$(readlink -f $1) +symtab=./data/lang/words.txt + +min_lmwt=7 +max_lmwt=20 + +for dir in exp/*/*decode*; do + for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.char $dir/scoring/*.tra $dir/scoring/char.filt $dir/scoring/text.filt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/text.filt ark,p:- ">&" $dir/wer_${type}_LMWT + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.cer.LMWT.log \ + cat $dir/scoring_${type}/LMWT.char \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/char.filt ark,p:- ">&" $dir/cer_${type}_LMWT +done +done + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "#RESULTS splits generated by $USER at $time" + +for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + +for type in $(ls -1 local/test.* | xargs -n1 basename); do + echo -e "\n# CER $type" + for x in exp/*/*decode*; do + grep WER $x/cer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + + + diff --git a/egs/gale_mandarin/s5/local/test.LDC2013S04 b/egs/gale_mandarin/s5/local/test.LDC2013S04 new file mode 100644 index 00000000000..60b3a95110d --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2013S04 @@ -0,0 +1,20 @@ +CCTV4_ACROSSSTRAIT_CMN_20070108_073033 +PHOENIX_NEWSLINE_CMN_20070101_114800 +CCTV4_TDYFOCUS_CMN_20070111_082801 +CCTV2_ECONOMYANDLAW_CMN_20070126_203005 +PHOENIX_BEHINDHL_CMN_20061004_052800 +PHOENIX_NEWSHACK_CMN_20060923_212301 +PHOENIX_NEWSLINE_CMN_20070102_114800 +PHOENIX_ASIANJRNL_CMN_20070313_075800 +PHOENIX_BEHINDHL_CMN_20061012_052800 +PHOENIX_NEWSLINE_CMN_20070105_114800 +CCTV4_TDYFOCUS_CMN_20061023_092800 +PHOENIX_SOCWATCH_CMN_20060928_225801 +PHOENIX_BEHINDHL_CMN_20061011_052800 +CCTVNEWS_TELLITLIKEITIS_CMN_20070114_140701 +CCTV4_TDYFOCUS_CMN_20070104_082800 +PHOENIX_NEWSLINE_CMN_20061020_114800 +PHOENIX_ASIANJRNL_CMN_20061002_085800 +PHOENIX_BEHINDHL_CMN_20070102_052800 +CCTV4_TDYFOCUS_CMN_20070108_082800 +PHOENIX_ASIANJRNL_CMN_20070111_075800 diff --git a/egs/gale_mandarin/s5/local/test.LDC2013S08 b/egs/gale_mandarin/s5/local/test.LDC2013S08 new file mode 100644 index 00000000000..6c0279412e9 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2013S08 @@ -0,0 +1,20 @@ +CCTV4_DAILYNEWS_CMN_20061023_135801 +CCTV4_DAILYNEWS_CMN_20060923_135800 +PHOENIX_PHNXWRLD_CMN_20070101_111800 +CCTV4_NEWS3_CMN_20060921_085800 +CCTV7_MILITARYNEWS1_CMN_20070102_193006 +PHOENIX_PHNXWRLD_CMN_20061024_112500 +CCTV7_MILITARYNEWS1_CMN_20070113_193011 +CCTV4_NEWS3_CMN_20061003_085800 +PHOENIX_PHNXWRLD_CMN_20061019_112401 +CCTV4_DAILYNEWS_CMN_20060920_135800 +PHOENIX_GOODMORNCN_CMN_20060926_185800 +ANHUI_NEWSREVIEW_CMN_20070103_175711 +CCTV4_DAILYNEWS_CMN_20060915_135800 +CCTV4_DAILYNEWS_CMN_20060924_135801 +PHOENIX_PHNXWRLD_CMN_20061018_112400 +CCTV7_MILITARYNEWS1_CMN_20070127_192932 +CCTVNEWS_EVENINGNEWS_CMN_20070123_225701 +CCTV4_NEWS3_CMN_20070116_075800 +PHOENIX_GOODMORNCN_CMN_20060918_185800 +PHOENIX_GOODMORNCN_CMN_20061009_185800 diff --git 
a/egs/gale_mandarin/s5/local/test.LDC2014S09 b/egs/gale_mandarin/s5/local/test.LDC2014S09 new file mode 100644 index 00000000000..ed871874636 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2014S09 @@ -0,0 +1,20 @@ +CCTV2_BUSINESSHOUR_CMN_20070428_070000 +CCTV1_LEGALREPORT_CMN_20070315_123701 +CCTV1_LEGALREPORT_CMN_20070418_123701 +CCTVNEWS_PEOPLESCONGRESS3_CMN_20070313_085702 +CCTV1_LEGALREPORT_CMN_20070426_123701 +CCTV4_ACROSSSTRAIT_CMN_20070430_073000 +HUBEI_COMMUNICATE_CMN_20070325_013001 +CCTVNEWS_PEOPLEINNEWS_CMN_20070327_215701 +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070315_040701 +CCTV1_LEGALREPORT_CMN_20070416_123701 +CCTV2_PEOPLESCONGRESS1_CMN_20070315_213000 +CCTV2_ECONOMYANDLAW_CMN_20070313_105916 +CCTV1_LEGALREPORT_CMN_20070430_123701 +HUBEI_COMMUNICATE_CMN_20070415_230013 +CCTV2_ECONOMYANDLAW_CMN_20070323_202800 +CCTV1_LEGALREPORT_CMN_20070312_123702 +CCTV1_LEGALREPORT_CMN_20070210_123701 +CCTV4_ACROSSSTRAIT_CMN_20070324_073000 +CCTV4_ACROSSSTRAIT_CMN_20070321_034001 +CCTV2_ECONOMYANDLAW_CMN_20070317_202900 diff --git a/egs/gale_mandarin/s5/local/test.LDC2015S06 b/egs/gale_mandarin/s5/local/test.LDC2015S06 new file mode 100644 index 00000000000..dcdb97b1161 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2015S06 @@ -0,0 +1,14 @@ +CCTV1_LEGALREPORT_CMN_20070407_123702 +CCTV4_ACROSSSTRAIT_CMN_20070704_203000 +CCTV4_ACROSSSTRAIT_CMN_20070402_073000 +CCTV2_ECONOMYANDLAW_CMN_20070402_110000 +CCTV2_BUSINESSHOUR_CMN_20070829_220755 +CCTV1_LEGALREPORT_CMN_20070913_123702 +CCTV4_ACROSSSTRAIT_CMN_20070828_072923 +CCTV1_LEGALREPORT_CMN_20070826_123701 +CCTV4_ACROSSSTRAIT_CMN_20070715_203000 +CCTV4_ACROSSSTRAIT_CMN_20070404_202849 +CCTV2_DIALOG_CMN_20070707_090000 +CCTV1_LEGALREPORT_CMN_20070716_123701 +CCTV1_LEGALREPORT_CMN_20070408_123701 +CCTV4_ACROSSSTRAIT_CMN_20070712_203004 diff --git a/egs/gale_mandarin/s5/local/test.LDC2015S13 b/egs/gale_mandarin/s5/local/test.LDC2015S13 new file mode 100644 index 00000000000..ea52a7679af --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2015S13 @@ -0,0 +1,20 @@ +CCTV2_NEWSLIST_CMN_20070426_115000 +CCTV1_30MINNEWS_CMN_20070418_115702 +CCTV2_NEWSLIST_CMN_20070406_115000 +CCTV1_30MINNEWS_CMN_20070204_115701 +CCTVNEWS_EVENINGNEWS_CMN_20070315_225701 +CCTV1_30MINNEWS_CMN_20070417_115701 +CCTV1_30MINNEWS_CMN_20070208_115701 +CCTV4_NEWS3_CMN_20070327_075800 +CCTV7_MILITARYNEWS1_CMN_20070309_100451 +CCTV7_MILITARYNEWS1_CMN_20070310_093000 +CCTV7_MILITARYNEWS1_CMN_20070411_193000 +CCTV2_NEWSLIST_CMN_20070421_115000 +PHOENIX_PHNXWRLD_CMN_20070801_111801 +VOA_INTNLNEWS_CMN_20070927_210000 +PHOENIX_PHNXWRLD_CMN_20070326_111800 +PHOENIX_PHNXWRLD_CMN_20070821_111801 +CCTV1_30MINNEWS_CMN_20070307_115702 +CCTVNEWS_EVENINGNEWS_CMN_20070314_225701 +VOA_CURRENTEVENTS_CMN_20070807_220000 +CCTV1_30MINNEWS_CMN_20070207_115701 diff --git a/egs/gale_mandarin/s5/local/test.LDC2016S03 b/egs/gale_mandarin/s5/local/test.LDC2016S03 new file mode 100644 index 00000000000..73245ed4c29 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2016S03 @@ -0,0 +1,20 @@ +CCTVNEWS_PEOPLEINNEWS_CMN_20080325_202401 +PHOENIX_ASIANJRNL_CMN_20080725_085800 +VOA_LISTENERSHOTLINE_CMN_20080405_223000 +CCTV1_LEGALREPORT_CMN_20080329_123802 +CCTV2_DIALOG_CMN_20080323_220801 +CCTV2_ECONOMYANDLAW_CMN_20080312_202800 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080310_090602 +VOA_LISTENERSHOTLINE_CMN_20080402_223000 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080306_083701 +CCTV2_ECONOMYANDLAW_CMN_20080428_202802 +CCTV2_ECONOMYANDLAW_CMN_20080324_202802 +VOA_FOCUSDIALOGUE_CMN_20080412_210500 
+CCTV4_ACROSSSTRAIT_CMN_20080416_073002 +VOA_STRAITSTALK_CMN_20080407_210500 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080308_083701 +CCTV1_LEGALREPORT_CMN_20080406_123801 +CCTV2_DIALOG_CMN_20080427_220801 +CCTV1_LEGALREPORT_CMN_20080411_123801 +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20080306_122702 +CCTVNEWS_PEOPLEINNEWS_CMN_20080408_202701 diff --git a/egs/wsj/s5/utils/pinyin_map.pl b/egs/wsj/s5/utils/pinyin_map.pl index 65b260e2418..8210ec2af51 100755 --- a/egs/wsj/s5/utils/pinyin_map.pl +++ b/egs/wsj/s5/utils/pinyin_map.pl @@ -10,7 +10,7 @@ open(MAPS, $ARGV[0]) or die("Could not open pinyin map file."); my %py2ph; foreach $line () { @A = split(" ", $line); $py = shift(@A); - $py2ph{$py} = [@A]; + $py2ph{$py} = [@A]; } #foreach $word ( keys %py2ph ) { @@ -25,14 +25,14 @@ while () { @A = split(" ", $_); - @entry = (); + @entry = (); $W = shift(@A); push(@entry, $W); for($i = 0; $i < @A; $i++) { $initial= $A[$i]; $final = $A[$i]; #print $initial, " ", $final, "\n"; if ($A[$i] =~ /^CH[A-Z0-9]+$/) {$initial =~ s:(CH)[A-Z0-9]+:$1:; $final =~ s:CH([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} + elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^ZH[A-Z0-9]+$/) {$initial =~ s:(ZH)[A-Z0-9]+:$1:; $final =~ s:ZH([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^B[A-Z0-9]+$/) {$initial =~ s:(B)[A-Z0-9]+:$1:; $final =~ s:B([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^C[A-Z0-9]+$/) {$initial =~ s:(C)[A-Z0-9]+:$1:; $final =~ s:C([A-Z0-9]+):$1:;} @@ -58,22 +58,22 @@ $tone = $final; $final =~ s:([A-Z]+)[0-9]:$1:; $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { print "1: no entry find for ", $A[$i], " ", $initial, " ", $final; exit;} - push(@entry, @{$py2ph{$initial}}); + if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { die "$0: no entry find for ", $A[$i], " ", $initial, " ", $final;} + push(@entry, @{$py2ph{$initial}}); @tmp = @{$py2ph{$final}}; for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); + push(@entry, @tmp); } else { $tone = $A[$i]; - $A[$i] =~ s:([A-Z]+)[0-9]:$1:; + $A[$i] =~ s:([A-Z]+)[0-9]:$1:; $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$A[$i]})) { print "2: no entry find for ", $A[$i]; exit;} + if (!(exists $py2ph{$A[$i]})) { die "$0: no entry find for ", $A[$i];} @tmp = @{$py2ph{$A[$i]}}; for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); + push(@entry, @tmp); } - } + } print "@entry"; print "\n"; } From f1e7d97088c3d35df11175b3d8bcbaca6f4fea3b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 22 Nov 2016 15:52:13 -0500 Subject: [PATCH 31/71] raw_python_script: Better error messages --- .../nnet3/train/chain_objf/acoustic_model.py | 9 ++++++--- .../libs/nnet3/train/frame_level_objf/common.py | 7 +++++-- egs/wsj/s5/steps/nnet3/chain/train.py | 17 +++++++---------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index fbebafe6920..9025379a326 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -182,6 +182,7 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), + lc=left_context, rc=right_context, 
trunc_deriv=truncate_deriv_weights, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, @@ -198,18 +199,20 @@ def train_new_models(dir, iter, srand, num_jobs, processes.append(process_handle) all_success = True + error_strs = [] for process in processes: process.wait() [stdout_value, stderr_value] = process.communicate() if stderr_value.strip() != '': - print(stderr_value) + error_strs.append(stderr_value.strip()) if process.returncode != 0: all_success = False if not all_success: open('{0}/.error'.format(dir), 'w').close() raise Exception("There was error during training " - "iteration {0}".format(iter)) + "iteration {0}:\n{1}".format(iter, + "\n".join(error_strs))) def train_one_iteration(dir, iter, srand, egs_dir, @@ -548,7 +551,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, lc=left_context, rc=right_context, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), - num_chunk_per_minibatch=num_chunk_per_minibatch, + num_chunk_per_mb=num_chunk_per_minibatch, num_iters=num_iters, egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index f7472cd4f46..0ea8e8d89ff 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -120,17 +120,20 @@ def train_new_models(dir, iter, srand, num_jobs, processes.append(process_handle) all_success = True + error_strs = [] for process in processes: process.wait() [stdout_value, stderr_value] = process.communicate() - print(stderr_value) + if stderr_value.strip() != '': + error_strs.append(stderr_value.strip()) if process.returncode != 0: all_success = False if not all_success: open('{0}/.error'.format(dir), 'w').close() raise Exception("There was error during training " - "iteration {0}".format(iter)) + "iteration {0}:\n{1}".format(iter, + "\n".join(error_strs))) def train_one_iteration(dir, iter, srand, egs_dir, diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 754aba20743..70d8b9819a1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -313,6 +313,9 @@ def train(args, run_opts, background_process_handler): {dir}/init.raw""".format(command=run_opts.command, dir=args.dir)) + egs_left_context = left_context + args.frame_subsampling_factor/2 + egs_right_context = right_context + args.frame_subsampling_factor/2 + default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") @@ -320,13 +323,8 @@ def train(args, run_opts, background_process_handler): chain_lib.generate_chain_egs( dir=args.dir, data=args.feat_dir, lat_dir=args.lat_dir, egs_dir=default_egs_dir, - left_context=left_context + args.frame_subsampling_factor/2, - right_context=right_context + args.frame_subsampling_factor/2, - valid_left_context=(left_context + args.frame_subsampling_factor/2 - + args.chunk_width), - valid_right_context=(right_context - + args.frame_subsampling_factor/2 - + args.chunk_width), + left_context=egs_left_context, + right_context=egs_right_context, run_opts=run_opts, left_tolerance=args.left_tolerance, right_tolerance=args.right_tolerance, @@ -349,7 +347,7 @@ def train(args, run_opts, background_process_handler): [egs_left_context, egs_right_context, frames_per_eg, num_archives] = ( common_train_lib.verify_egs_dir(egs_dir, feat_dim, 
ivector_dim, - left_context, right_context)) + egs_left_context, egs_right_context)) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -445,7 +443,6 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): num_archives_processed), shrinkage_value=shrinkage_value, num_chunk_per_minibatch=args.num_chunk_per_minibatch, - chunk_width=args.chunk_width, num_hidden_layers=num_hidden_layers, add_layers_period=args.add_layers_period, left_context=left_context, @@ -494,7 +491,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): egs_dir=egs_dir, left_context=left_context, right_context=right_context, leaky_hmm_coefficient=args.leaky_hmm_coefficient, - l2regularize=args.l2_regularize, + l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, background_process_handler=background_process_handler) From 436a3ea213f3433451d1dca00cca3a5c719ed623 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 22 Nov 2016 23:52:23 -0500 Subject: [PATCH 32/71] raw_python_script: Added run_job to redirect stdout stderr --- egs/wsj/s5/steps/libs/common.py | 46 ++++++++++++++++--- .../nnet3/train/chain_objf/acoustic_model.py | 30 ++++++------ egs/wsj/s5/steps/libs/nnet3/train/common.py | 14 +++--- .../train/frame_level_objf/acoustic_model.py | 4 +- .../nnet3/train/frame_level_objf/common.py | 30 ++++++------ .../nnet3/train/frame_level_objf/raw_model.py | 2 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 6 +-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 6 +-- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 6 +-- egs/wsj/s5/steps/nnet3/train_rnn.py | 6 +-- 10 files changed, 92 insertions(+), 58 deletions(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index e93abc45323..e8a6020979b 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -75,11 +75,14 @@ def check_if_cuda_compiled(): class KaldiCommandException(Exception): - - def __init__(self, command, err): + """ An Exception class that throws an error string with the + kaldi command that caused the error and the error string captured. + """ + def __init__(self, command, err = None): Exception.__init__(self, "There was an error while running the command " - "{0}\n{1}\n{2}".format(command, "-"*10, err)) + "{0}\n{1}\n{2}".format(command, "-"*10, + "" if err is None else err)) class BackgroundProcessHandler(): @@ -189,9 +192,41 @@ def debug(self): logger.info("Process '{0}' is running".format(command)) +def run_job(command, wait=True, background_process_handler=None): + """ Runs a kaldi job, usually using a script such as queue.pl and + run.pl, and redirects the stdout and stderr to the parent + process's streams. + These are usually a sequence of commands connected by pipes, so we use + shell=True. + + Args: + background_process_handler: An object of the BackgroundProcessHandler + class that is instantiated by the top-level script. If this is + provided, then the created process handle is added to the object. + wait: If True, wait until the process is completed. However, if the + background_process_handler is provided, this option will be + ignored and the process will be run in the background. 
+ """ + p = subprocess.Popen(command, shell=True) + + if background_process_handler is not None: + wait = False + background_process_handler.add_process((p, command)) + + if wait: + p.communicate() + if p.returncode is not 0: + raise KaldiCommandException(command) + return None + else: + return p + + def run_kaldi_command(command, wait=True, background_process_handler=None): - """ Runs commands frequently seen in Kaldi scripts. These are usually a - sequence of commands connected by pipes, so we use shell=True. + """ Runs commands frequently seen in Kaldi scripts and + captures the stdout and stderr. + These are usually a sequence of commands connected by pipes, so we use + shell=True. Args: background_process_handler: An object of the BackgroundProcessHandler @@ -201,7 +236,6 @@ class that is instantiated by the top-level script. If this is background_process_handler is provided, this option will be ignored and the process will be run in the background. """ - # logger.info("Running the command\n{0}".format(command)) p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 9025379a326..9fa8eef1822 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -26,11 +26,11 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): This method trains a phone LM for chain training using the alignments in "tree_dir" """ - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/make_phone_lm.log \ chain-est-phone-lm {lm_opts} \ - "ark:gunzip -c {tree_dir}/ali.*.gz | """ - """ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + "ark:gunzip -c {tree_dir}/ali.*.gz | \ + ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ {dir}/phone_lm.fst""".format( command=run_opts.command, dir=dir, lm_opts=lm_opts if lm_opts is not None else '', @@ -38,10 +38,10 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): def create_denominator_fst(dir, tree_dir, run_opts): - common_lib.run_kaldi_command( + common_lib.run_job( """copy-transition-model {tree_dir}/final.mdl \ {dir}/0.trans_mdl""".format(dir=dir, tree_dir=tree_dir)) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/make_den_fst.log \ chain-make-den-fst {dir}/tree {dir}/0.trans_mdl \ {dir}/phone_lm.fst \ @@ -64,7 +64,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, See options in that script. """ - common_lib.run_kaldi_command( + common_lib.run_job( """steps/nnet3/chain/get_egs.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ @@ -158,7 +158,7 @@ def train_new_models(dir, iter, srand, num_jobs, else: cur_cache_io_opts = cache_io_opts - process_handle = common_lib.run_kaldi_command( + process_handle = common_lib.run_job( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} \ --apply-deriv-weights={app_deriv_wts} \ @@ -390,7 +390,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, num_lda_jobs = max_lda_jobs # Write stats with the same format as stats for LDA. 
- common_lib.run_kaldi_command( + common_lib.run_job( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" \ @@ -405,7 +405,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), range(1, num_lda_jobs + 1)) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/sum_transform_stats.log \ sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( command=run_opts.command, @@ -421,7 +421,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled # variant of an LDA transform but without dimensionality reduction. - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/get_transform.log \ nnet-get-feature-transform {lda_opts} {dir}/lda.mat \ {dir}/lda_stats""".format( @@ -445,7 +445,7 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1): # We ensure that they have the same mode (even if someone changed the # script to make one or both of them text mode) by copying them both # before concatenating them. - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/init_mdl.log \ nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw \ {dir}/0.mdl""".format(command=run_opts.command, dir=dir)) @@ -458,7 +458,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, background_process_handler=None): model = '{0}/{1}.mdl'.format(dir, iter) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ @@ -473,7 +473,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ @@ -495,7 +495,7 @@ def compute_progress(dir, iter, run_opts, wait=False, prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ nnet3-show-progress --use-gpu=no \ @@ -534,7 +534,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, print("{0}: warning: model file {1} does not exist " "(final combination)".format(sys.argv[0], model_file)) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 7f752192611..dcf07fa8af3 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -97,7 +97,7 @@ def get_average_nnet_model(dir, iter, nnets_list, run_opts, out_model = "{dir}/{next_iter}.raw".format(dir=dir, next_iter=next_iter) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/average.{iter}.log \ nnet3-average {nnets_list} \ {out_model}""".format(command=run_opts.command, @@ -126,7 +126,7 @@ 
def get_best_nnet_model(dir, iter, best_model_index, run_opts, out_model = "{dir}/{next_iter}.raw".format(dir=dir, next_iter=iter + 1) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/select.{iter}.log \ nnet3-copy --scale={scale} {best_model} \ {out_model}""".format(command=run_opts.command, @@ -206,7 +206,7 @@ def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts, presoftmax_prior_scale_power=-0.25): # getting the raw pdf count - common_lib.run_kaldi_command( + common_lib.run_job( """{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- \ @@ -215,7 +215,7 @@ def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts, dir=dir, alidir=alidir)) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/sum_pdf_counts.log \ vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts \ """.format(command=run_opts.command, dir=dir)) @@ -250,7 +250,7 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, def prepare_initial_network(dir, run_opts, srand=-3): - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/add_first_layer.log \ nnet3-init --srand={srand} {dir}/init.raw \ {dir}/configs/layer1.config {dir}/0.raw""".format( @@ -365,8 +365,8 @@ def do_shrinkage(iter, model_file, non_linearity, shrink_threshold, def remove_egs(egs_dir): - common_lib.run_kaldi_command("steps/nnet2/remove_egs.sh {egs_dir}".format( - egs_dir=egs_dir)) + common_lib.run_job("steps/nnet2/remove_egs.sh {egs_dir}".format( + egs_dir=egs_dir)) def clean_nnet_dir(nnet_dir, num_iters, egs_dir, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index ade21ca2e16..1360f669f41 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -32,7 +32,7 @@ def generate_egs(data, alidir, egs_dir, the model final.mdl and alignments. """ - common_lib.run_kaldi_command( + common_lib.run_job( """steps/nnet3/get_egs.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ @@ -75,7 +75,7 @@ def prepare_initial_acoustic_model(dir, alidir, run_opts, srand=srand) # Convert to .mdl, train the transitions, set the priors. - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/init_mdl.log \ nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ nnet3-am-train-transitions - \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 0ea8e8d89ff..b15266c8bdf 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -84,7 +84,7 @@ def train_new_models(dir, iter, srand, num_jobs, cache_write_opt = "--write-cache={dir}/cache.{iter}".format( dir=dir, iter=iter+1) - process_handle = common_lib.run_kaldi_command( + process_handle = common_lib.run_job( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-train {parallel_train_opts} {cache_read_opt} \ {cache_write_opt} --print-interval=10 \ @@ -335,7 +335,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, num_lda_jobs = max_lda_jobs # Write stats with the same format as stats for LDA. 
- common_lib.run_kaldi_command( + common_lib.run_job( """{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ nnet3-acc-lda-stats --rand-prune={rand_prune} \ {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" \ @@ -350,7 +350,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), range(1, num_lda_jobs + 1)) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/sum_transform_stats.log \ sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( command=run_opts.command, @@ -366,7 +366,7 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled # variant of an LDA transform but without dimensionality reduction. - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/get_transform.log \ nnet-get-feature-transform {lda_opts} {dir}/lda.mat \ {dir}/lda_stats""".format( @@ -389,7 +389,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, context_opts = "--left-context={lc} --right-context={rc}".format( lc=left_context, rc=right_context) - common_lib.run_kaldi_command( + common_lib.run_job( """ {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ @@ -404,7 +404,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, left_context, egs_dir=egs_dir), wait=wait, background_process_handler=background_process_handler) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-compute-prob "{model}" \ "ark,bg:nnet3-copy-egs {context_opts} \ @@ -435,7 +435,7 @@ def compute_progress(dir, iter, egs_dir, left_context, right_context, context_opts = "--left-context={lc} --right-context={rc}".format( lc=left_context, rc=right_context) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/progress.{iter}.log \ nnet3-info "{model}" '&&' \ nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \ @@ -497,7 +497,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, context_opts = "--left-context={lc} --right-context={rc}".format( lc=left_context, rc=right_context) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ @@ -570,7 +570,7 @@ def align(dir, data, lang, run_opts, iter=None, transform_dir=None, logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( gpu=" using gpu " if run_opts.realign_use_gpu else " ", num_jobs=run_opts.realign_num_jobs)) - common_lib.run_kaldi_command( + common_lib.run_job( """steps/nnet3/align.sh --nj {num_jobs_align} \ --cmd "{align_cmd} {align_queue_opt}" \ --use-gpu {align_use_gpu} \ @@ -618,7 +618,7 @@ def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, alidir = align(dir, feat_dir, lang, run_opts, iter, transform_dir, online_ivector_dir) - common_lib.run_kaldi_command( + common_lib.run_job( """steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} \ {alidir} {prev_egs_dir} {cur_egs_dir}""".format( command=run_opts.command, @@ -631,7 +631,7 @@ def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, run_opts): - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/adjust_priors.final.log \ 
nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} \ "{output_model}" """.format( @@ -663,7 +663,7 @@ def compute_average_posterior(dir, iter, egs_dir, num_archives, context_opts = "--left-context={lc} --right-context={rc}".format( lc=left_context, rc=right_context) - common_lib.run_kaldi_command( + common_lib.run_job( """{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} \ {dir}/log/get_post.{iter}.JOB.log \ nnet3-copy-egs {context_opts} \ @@ -688,9 +688,9 @@ def compute_average_posterior(dir, iter, egs_dir, num_archives, # make sure there is time for $dir/post.{iter}.*.vec to appear. time.sleep(5) avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) - common_lib.run_kaldi_command(""" -{command} {dir}/log/vector_sum.{iter}.log \ - vector-sum {dir}/post.{iter}.*.vec {output_file} + common_lib.run_job( + """{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} """.format(command=run_opts.command, dir=dir, iter=iter, output_file=avg_post_vec_file)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 9ea3a07e05c..58240dd2f1b 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -48,7 +48,7 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, raise Exception("--num-targets is required if " "target-type is sparse") - common_lib.run_kaldi_command( + common_lib.run_job( """steps/nnet3/get_egs_targets.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 8ed528c5756..d08241f3131 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -195,7 +195,7 @@ def train(args, run_opts, background_process_handler): if (args.stage <= -5): logger.info("Initializing a basic network for estimating " "preconditioning matrix") - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config \ {dir}/init.raw""".format(command=run_opts.command, @@ -393,8 +393,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " - "{0}".format(args.dir)) + common_lib.run_job("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) def main(): diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index d301ed4f630..646c5b302c3 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -187,7 +187,7 @@ def train(args, run_opts, background_process_handler): if (args.stage <= -5): logger.info("Initializing a basic network") - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config \ {dir}/init.raw""".format(command=run_opts.command, @@ -389,8 +389,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " - "{0}".format(args.dir)) + common_lib.run_job("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) def main(): diff --git 
a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index a7ed6d8ab6b..4a370b0e9ae 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -246,7 +246,7 @@ def train(args, run_opts, background_process_handler): if (args.stage <= -4): logger.info("Initializing a basic network") - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config \ {dir}/init.raw""".format(command=run_opts.command, @@ -473,8 +473,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " - "{0}".format(args.dir)) + common_lib.run_job("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) def main(): diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 65a8bed2bf6..c14a9d5ff38 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -252,7 +252,7 @@ def train(args, run_opts, background_process_handler): if (args.stage <= -5): logger.info("Initializing a basic network for estimating " "preconditioning matrix") - common_lib.run_kaldi_command( + common_lib.run_job( """{command} {dir}/log/nnet_init.log \ nnet3-init --srand=-2 {dir}/configs/init.config \ {dir}/init.raw""".format(command=run_opts.command, @@ -474,8 +474,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.run_kaldi_command("steps/info/nnet3_dir_info.pl " - "{0}".format(args.dir)) + common_lib.run_job("steps/info/nnet3_dir_info.pl " + "{0}".format(args.dir)) def main(): From 53df780f251126b05e67bcaafe5739736e1aad62 Mon Sep 17 00:00:00 2001 From: Yenda Trmal Date: Wed, 23 Nov 2016 08:42:40 -0500 Subject: [PATCH 33/71] a couple of files forgotten the last time --- ...l1@jhu.edu.2016-11-21-11-32-32 => RESULTS} | 0 egs/gale_mandarin/s5/path.sh | 3 +- egs/gale_mandarin/s5/run.sh | 98 +++++++++++-------- 3 files changed, 61 insertions(+), 40 deletions(-) rename egs/gale_mandarin/s5/{RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 => RESULTS} (100%) diff --git a/egs/gale_mandarin/s5/RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 b/egs/gale_mandarin/s5/RESULTS similarity index 100% rename from egs/gale_mandarin/s5/RESULTS.details.jtrmal1@jhu.edu.2016-11-21-11-32-32 rename to egs/gale_mandarin/s5/RESULTS diff --git a/egs/gale_mandarin/s5/path.sh b/egs/gale_mandarin/s5/path.sh index be11b34cbc6..e875e4b585c 100755 --- a/egs/gale_mandarin/s5/path.sh +++ b/egs/gale_mandarin/s5/path.sh @@ -1,5 +1,6 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/kaldi_lm:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh +. $KALDI_ROOT/tools/env.sh export LC_ALL=C diff --git a/egs/gale_mandarin/s5/run.sh b/egs/gale_mandarin/s5/run.sh index 505ade6a269..f6c9f0828b7 100755 --- a/egs/gale_mandarin/s5/run.sh +++ b/egs/gale_mandarin/s5/run.sh @@ -6,31 +6,46 @@ . ./path.sh . 
./cmd.sh -nJobs=40 -nDecodeJobs=40 - -AUDIO_PATH=/export/corpora5/LDC/LDC2013S08/ -TEXT_PATH=/export/corpora5/LDC/LDC2013T20/ - +nJobs=64 +nDecodeJobs=128 + +AUDIO=( + /scratch/groups/skhudan1/corpora/LDC2013S08/ + /scratch/groups/skhudan1/corpora/LDC2013S04/ + /scratch/groups/skhudan1/corpora/LDC2014S09/ + /scratch/groups/skhudan1/corpora/LDC2015S06/ + /scratch/groups/skhudan1/corpora/LDC2015S13/ + /scratch/groups/skhudan1/corpora/LDC2016S03/ +) +TEXT=( + /scratch/groups/skhudan1/corpora/LDC2013T20/ + /scratch/groups/skhudan1/corpora/LDC2013T08/ + /scratch/groups/skhudan1/corpora/LDC2014T28/ + /scratch/groups/skhudan1/corpora/LDC2015T09/ + /scratch/groups/skhudan1/corpora/LDC2015T25/ + /scratch/groups/skhudan1/corpora/LDC2016T12/ +) galeData=GALE/ # You can run the script from here automatically, but it is recommended to run the data preparation, # and features extraction manually and and only once. # By copying and pasting into the shell. -local/gale_data_prep_audio.sh $galeData $AUDIO_PATH - -local/gale_data_prep_txt.sh $galeData $TEXT_PATH +set -e -o pipefail +set -x + +local/gale_data_prep_audio.sh "${AUDIO[@]}" $galeData -local/gale_data_prep_split.sh $galeData +local/gale_data_prep_txt.sh "${TEXT[@]}" $galeData -local/gale_prep_dict.sh +local/gale_data_prep_split.sh $galeData +local/gale_prep_dict.sh -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang +utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang local/gale_train_lms.sh -local/gale_format_data.sh +local/gale_format_data.sh # Now make MFCC features. # mfccdir should be some place with a largish disk where you @@ -38,6 +53,7 @@ local/gale_format_data.sh mfccdir=mfcc for x in train dev ; do + utils/fix_data_dir.sh data/$x steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nJobs \ data/$x exp/make_mfcc/$x $mfccdir utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons @@ -45,23 +61,25 @@ for x in train dev ; do done # Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; +utils/subset_data_dir.sh data/train 10000 data/train.10k || exit 1; +utils/subset_data_dir.sh data/train 50000 data/train.50k || exit 1; +utils/subset_data_dir.sh data/train 100000 data/train.100k || exit 1; # Train monophone models on a subset of the data, 10K segment # Note: the --boost-silence option should probably be omitted by default steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; + data/train.10k data/lang exp/mono || exit 1; # Get alignments from monophone system. 
steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; + data/train.50k data/lang exp/mono exp/mono_ali.50k || exit 1; # train tri1 [first triphone pass] steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + 2500 30000 data/train.50k data/lang exp/mono_ali.50k exp/tri1 || exit 1; # First triphone decoding -utils/mkgraph.sh data/lang_dev exp/tri1 exp/tri1/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri1/graph data/dev exp/tri1/decode & @@ -73,14 +91,14 @@ steps/train_deltas.sh --cmd "$train_cmd" \ 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; # tri2a decoding -utils/mkgraph.sh data/lang_dev exp/tri2a exp/tri2a/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri2a/graph data/dev exp/tri2a/decode & # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ data/train data/lang exp/tri1_ali exp/tri2b || exit 1; -utils/mkgraph.sh data/lang_dev exp/tri2b exp/tri2b/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & # Align all data with LDA+MLLT system (tri2b) @@ -90,9 +108,9 @@ steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ # Do MMI on top of LDA+MLLT. steps/make_denlats.sh --nj $nJobs --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1; - + steps/train_mmi.sh data/train data/lang exp/tri2b_ali \ - exp/tri2b_denlats exp/tri2b_mmi + exp/tri2b_denlats exp/tri2b_mmi steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it4 & @@ -100,10 +118,10 @@ steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it3 & # Do the same with boosting. steps/train_mmi.sh --boost 0.1 data/train data/lang exp/tri2b_ali \ -exp/tri2b_denlats exp/tri2b_mmi_b0.1 +exp/tri2b_denlats exp/tri2b_mmi_b0.1 steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ - data/dev exp/tri2b_mmi_b0.1/decode_it4 & + data/dev exp/tri2b_mmi_b0.1/decode_it4 & steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi_b0.1/decode_it3 & @@ -119,7 +137,7 @@ steps/decode.sh --iter 3 --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph # From 2b system, train 3b which is LDA + MLLT + SAT. 
steps/train_sat.sh --cmd "$train_cmd" \ 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; -utils/mkgraph.sh data/lang_dev exp/tri3b exp/tri3b/graph|| exit 1; +utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph|| exit 1; steps/decode_fmllr.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri3b/graph data/dev exp/tri3b/decode & @@ -130,12 +148,11 @@ steps/align_fmllr.sh --nj $nJobs --cmd "$train_cmd" \ ## SGMM (subspace gaussian mixture model), excluding the "speaker-dependent weights" steps/train_ubm.sh --cmd "$train_cmd" 700 \ data/train data/lang exp/tri3b_ali exp/ubm5a || exit 1; - + steps/train_sgmm2.sh --cmd "$train_cmd" 5000 20000 data/train data/lang exp/tri3b_ali \ exp/ubm5a/final.ubm exp/sgmm_5a || exit 1; -utils/mkgraph.sh data/lang_dev exp/sgmm_5a exp/sgmm_5a/graph || exit 1; - +utils/mkgraph.sh data/lang_test exp/sgmm_5a exp/sgmm_5a/graph || exit 1; steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \ --transform-dir exp/tri3b/decode exp/sgmm_5a/graph data/dev exp/sgmm_5a/decode & @@ -143,27 +160,30 @@ steps/align_sgmm2.sh --nj $nJobs --cmd "$train_cmd" --transform-dir exp/tri3b_al --use-graphs true --use-gselect true data/train data/lang exp/sgmm_5a exp/sgmm_5a_ali || exit 1; ## boosted MMI on SGMM -steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split 30 --beam 9.0 --lattice-beam 6 \ - --cmd "$decode_cmd" --transform-dir \ - exp/tri3b_ali data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; - +steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split $nJobs --beam 9.0 --lattice-beam 6 \ + --cmd "$decode_cmd" --num-threads 4 --transform-dir exp/tri3b_ali \ + data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; + steps/train_mmi_sgmm2.sh --cmd "$train_cmd" --num-iters 8 --transform-dir exp/tri3b_ali --boost 0.1 \ data/train data/lang exp/sgmm_5a exp/sgmm_5a_denlats exp/sgmm_5a_mmi_b0.1 - + #decode GMM MMI -utils/mkgraph.sh data/lang_dev exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph || exit 1; steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \ - --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode & - + --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode + for n in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_dev \ + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_test \ data/dev exp/sgmm_5a_mmi_b0.1/decode exp/sgmm_5a_mmi_b0.1/decode$n - - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_dev \ +done + +for n in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_test \ data/dev exp/sgmm_5a/decode exp/sgmm_5a_mmi_onlyRescoreb0.1/decode$n done +wait local/nnet/run_dnn.sh time=$(date +"%Y-%m-%d-%H-%M-%S") From b69c16158a933b675167b2a8b75abb9607cd6ae2 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 23 Nov 2016 14:18:27 -0500 Subject: [PATCH 34/71] raw_python_script: Fixing run_job --- .../steps/libs/nnet3/train/chain_objf/acoustic_model.py | 8 ++------ .../s5/steps/libs/nnet3/train/frame_level_objf/common.py | 8 ++------ egs/wsj/s5/steps/nnet3/train_dnn.py | 3 +-- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 3 +-- 
egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 4 ++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 ++-- 6 files changed, 10 insertions(+), 20 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 9fa8eef1822..c2378b90c1c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -199,20 +199,16 @@ def train_new_models(dir, iter, srand, num_jobs, processes.append(process_handle) all_success = True - error_strs = [] for process in processes: process.wait() - [stdout_value, stderr_value] = process.communicate() - if stderr_value.strip() != '': - error_strs.append(stderr_value.strip()) + process.communicate() if process.returncode != 0: all_success = False if not all_success: open('{0}/.error'.format(dir), 'w').close() raise Exception("There was error during training " - "iteration {0}:\n{1}".format(iter, - "\n".join(error_strs))) + "iteration {0}".format(iter)) def train_one_iteration(dir, iter, srand, egs_dir, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index b15266c8bdf..060624f44ca 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -120,20 +120,16 @@ def train_new_models(dir, iter, srand, num_jobs, processes.append(process_handle) all_success = True - error_strs = [] for process in processes: process.wait() - [stdout_value, stderr_value] = process.communicate() - if stderr_value.strip() != '': - error_strs.append(stderr_value.strip()) + process.communicate() if process.returncode != 0: all_success = False if not all_success: open('{0}/.error'.format(dir), 'w').close() raise Exception("There was error during training " - "iteration {0}:\n{1}".format(iter, - "\n".join(error_strs))) + "iteration {0}".format(iter) def train_one_iteration(dir, iter, srand, egs_dir, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index d08241f3131..6235f47d205 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -208,8 +208,7 @@ def train(args, run_opts, background_process_handler): train_lib.acoustic_model.generate_egs( data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=left_context, - valid_right_context=right_context, + valid_left_context=left_context, valid_right_context=right_context, run_opts=run_opts, frames_per_eg=args.frames_per_eg, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 646c5b302c3..9bb0397ad8e 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -216,8 +216,7 @@ def train(args, run_opts, background_process_handler): data=args.feat_dir, targets_scp=args.targets_scp, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=left_context, - valid_right_context=right_context, + valid_left_context=left_context, valid_right_context=right_context, run_opts=run_opts, frames_per_eg=args.frames_per_eg, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 4a370b0e9ae..648892c3ff4 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ 
b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -275,8 +275,8 @@ def train(args, run_opts, background_process_handler): data=args.feat_dir, targets_scp=args.targets_scp, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=args.chunk_width + left_context, - valid_right_context=args.chunk_width + right_context, + valid_left_context=left_context + args.chunk_width, + valid_right_context=right_context + args.chunk_width, run_opts=run_opts, frames_per_eg=args.chunk_width, srand=args.srand, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index c14a9d5ff38..0f7961af128 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -265,8 +265,8 @@ def train(args, run_opts, background_process_handler): train_lib.acoustic_model.generate_egs( data=args.feat_dir, alidir=args.ali_dir, egs_dir=default_egs_dir, left_context=left_context, right_context=right_context, - valid_left_context=args.chunk_width + left_context, - valid_right_context=args.chunk_width + right_context, + valid_left_context=left_context + args.chunk_width, + valid_right_context=right_context + args.chunk_width, run_opts=run_opts, frames_per_eg=args.chunk_width, srand=args.srand, From e2524551eca048765c730b5f0eff40bbef200ec0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 23 Nov 2016 18:27:29 -0500 Subject: [PATCH 35/71] raw_python_script: Bug fix --- egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 060624f44ca..9f29cebb0d2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -129,7 +129,7 @@ def train_new_models(dir, iter, srand, num_jobs, if not all_success: open('{0}/.error'.format(dir), 'w').close() raise Exception("There was error during training " - "iteration {0}".format(iter) + "iteration {0}".format(iter)) def train_one_iteration(dir, iter, srand, egs_dir, From f4ce92139663bb5308ff4b83cd477f6d1a122b1b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 23 Nov 2016 18:49:28 -0500 Subject: [PATCH 36/71] raw_python_script: Removing nnet3_libs which is moved to libs/nnet3 --- egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py diff --git a/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py deleted file mode 100644 index e6dc907fe0a..00000000000 --- a/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This module will house the latest training libraries being written by Vimal From 6102c60f816453daf24d3621c468f569c5e18a35 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 25 Nov 2016 22:06:12 -0500 Subject: [PATCH 37/71] raw_python_script: Reorganized steps/nnet3/xconfig_to_configs.py --- egs/wsj/s5/steps/nnet3/xconfig_to_configs.py | 133 ++++++++++--------- 1 file changed, 72 insertions(+), 61 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index e29a9404403..c55dae18b19 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -2,14 +2,9 @@ # we're using python 3.x style print but want it to 
work in python 2.x, from __future__ import print_function -import os import argparse -import shlex +import os import sys -import warnings -import copy -import imp -import ast from collections import defaultdict sys.path.insert(0, 'steps/') @@ -17,14 +12,15 @@ sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') import libs.nnet3.xconfig.parser as xparser -# do the proper import when python scripts have been refactored -nnet3_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py') +import libs.common as common_lib + def get_args(): # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description='Reads an xconfig file and creates config files ' - 'for neural net creation and training', - epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser = argparse.ArgumentParser( + description="Reads an xconfig file and creates config files " + "for neural net creation and training", + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') parser.add_argument('--xconfig-file', required=True, help='Filename of input xconfig file') parser.add_argument('--config-dir', required=True, @@ -37,26 +33,28 @@ def get_args(): return args + def check_args(args): if not os.path.exists(args.config_dir): os.makedirs(args.config_dir) return args - - def backup_xconfig_file(xconfig_file, config_dir): - # we write a copy of the xconfig file just to have a record of the original - # input. + """we write a copy of the xconfig file just to have a record of the + original input. + """ try: xconfig_file_out = open(config_dir + '/xconfig', 'w') except: - sys.exit('{0}: error opening file {1}/xconfig for output'.format( - sys.argv[0], config_dir)) + raise Exception('{0}: error opening file ' + '{1}/xconfig for output'.format( + sys.argv[0], config_dir)) try: xconfig_file_in = open(xconfig_file) except: - sys.exit('{0}: error opening file {1} for input'.format(sys.argv[0], config_dir)) + raise Exception('{0}: error opening file {1} for input'.format( + sys.argv[0], config_dir)) print("# This file was created by the command:\n" "# {0}\n" @@ -73,16 +71,17 @@ def backup_xconfig_file(xconfig_file, config_dir): xconfig_file_in.close() -# This functions writes config_dir/xconfig.expanded.1 and -# config_dir/xconfig.expanded.2, showing some of the internal stages of -# processing the xconfig file before turning it into config files. def write_expanded_xconfig_files(config_dir, all_layers): + """ This functions writes config_dir/xconfig.expanded.1 and + config_dir/xconfig.expanded.2, showing some of the internal stages of + processing the xconfig file before turning it into config files. 
+ """ try: xconfig_file_out = open(config_dir + '/xconfig.expanded.1', 'w') except: - sys.exit('{0}: error opening file {1}/xconfig.expanded.1 for output'.format( - sys.argv[0], config_dir)) - + raise Exception('{0}: error opening file ' + '{1}/xconfig.expanded.1 for output'.format( + sys.argv[0], config_dir)) print('# This file was created by the command:\n' '# ' + ' '.join(sys.argv) + '\n' @@ -97,13 +96,15 @@ def write_expanded_xconfig_files(config_dir, all_layers): try: xconfig_file_out = open(config_dir + '/xconfig.expanded.2', 'w') except: - sys.exit('{0}: error opening file {1}/xconfig.expanded.2 for output'.format( - sys.argv[0], config_dir)) + raise Exception('{0}: error opening file ' + '{1}/xconfig.expanded.2 for output'.format( + sys.argv[0], config_dir)) print('# This file was created by the command:\n' '# ' + ' '.join(sys.argv) + '\n' '# It contains the same content as ./xconfig but it was parsed,\n' - '# default config values were set, and Descriptors (input=xxx) were normalized.\n' + '# default config values were set, \n' + '# and Descriptors (input=xxx) were normalized.\n' '# See also ./xconfig.expanded.1\n\n', file=xconfig_file_out) @@ -112,33 +113,38 @@ def write_expanded_xconfig_files(config_dir, all_layers): print(str(layer), file=xconfig_file_out) xconfig_file_out.close() -# This function returns a map from config-file basename -# e.g. 'init', 'ref', 'layer1' to a documentation string that goes -# at the top of the file. -def get_config_headers(): - ans = defaultdict(str) # resulting dict will default to the empty string - # for any config files not explicitly listed here. - ans['init'] = ('# This file was created by the command:\n' - '# ' + ' '.join(sys.argv) + '\n' - '# It contains the input of the network and is used in\n' - '# accumulating stats for an LDA-like transform of the\n' - '# input features.\n'); - ans['ref'] = ('# This file was created by the command:\n' - '# ' + ' '.join(sys.argv) + '\n' - '# It contains the entire neural network, but with those\n' - '# components that would normally require fixed vectors/matrices\n' - '# read from disk, replaced with random initialization\n' - '# (this applies to the LDA-like transform and the\n' - '# presoftmax-prior-scale, if applicable). This file\n' - '# is used only to work out the left-context and right-context\n' - '# of the network.\n'); - ans['final'] = ('# This file was created by the command:\n' - '# ' + ' '.join(sys.argv) + '\n' - '# It contains the entire neural network.\n') - - return ans; - +def get_config_headers(): + """ This function returns a map from config-file basename + e.g. 'init', 'ref', 'layer1' to a documentation string that goes + at the top of the file. + """ + # resulting dict will default to the empty string for any config files not + # explicitly listed here. + ans = defaultdict(str) + + ans['init'] = ( + '# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the input of the network and is used in\n' + '# accumulating stats for an LDA-like transform of the\n' + '# input features.\n') + ans['ref'] = ( + '# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the entire neural network, but with those\n' + '# components that would normally require fixed vectors/matrices\n' + '# read from disk, replaced with random initialization\n' + '# (this applies to the LDA-like transform and the\n' + '# presoftmax-prior-scale, if applicable). 
This file\n' + '# is used only to work out the left-context and right-context\n' + '# of the network.\n') + ans['final'] = ( + '# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the entire neural network.\n') + + return ans # This is where most of the work of this program happens. @@ -157,13 +163,14 @@ def write_config_files(config_dir, all_layers): config_basename_to_lines[config_basename].append(line) except Exception as e: print("{0}: error producing config lines from xconfig " - "line '{1}': error was: {2}".format(sys.argv[0], str(layer), - repr(e)), file=sys.stderr) + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) # we use raise rather than raise(e) as using a blank raise # preserves the backtrace raise - for basename,lines in config_basename_to_lines.items(): + for basename, lines in config_basename_to_lines.items(): header = config_basename_to_header[basename] filename = '{0}/{1}.config'.format(config_dir, basename) try: @@ -179,12 +186,15 @@ def write_config_files(config_dir, all_layers): # preserves the backtrace raise + def add_back_compatibility_info(config_dir): """This will be removed when python script refactoring is done.""" - nnet3_lib.RunKaldiCommand("nnet3-init {0}/ref.config {0}/ref.raw".format(config_dir)) - out, err = nnet3_lib.RunKaldiCommand("nnet3-info {0}/ref.raw | head -4".format(config_dir)) - #out looks like this + common_lib.run_kaldi_command("nnet3-init {0}/ref.config " + "{0}/ref.raw".format(config_dir)) + out, err = common_lib.run_kaldi_command("nnet3-info {0}/ref.raw | " + "head -4".format(config_dir)) + # out looks like this # left-context: 7 # right-context: 0 # num-parameters: 90543902 @@ -206,8 +216,9 @@ def add_back_compatibility_info(config_dir): vf.write('num_hidden_layers=1\n') vf.close() - nnet3_lib.ForceSymlink("final.config".format(config_dir), - "{0}/layer1.config".format(config_dir)) + common_lib.force_symlink("final.config".format(config_dir), + "{0}/layer1.config".format(config_dir)) + def main(): args = get_args() From f132550e832f4814eb9b19462bb8048e41491689 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 25 Nov 2016 22:42:33 -0500 Subject: [PATCH 38/71] raw_python_script: Adding steps to syspath --- egs/wsj/s5/steps/libs/common.py | 4 ++-- egs/wsj/s5/steps/libs/nnet3/report/log_parse.py | 1 + egs/wsj/s5/steps/nnet3/chain/train.py | 1 + egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 1 + egs/wsj/s5/steps/nnet3/tdnn/make_configs.py | 1 + egs/wsj/s5/steps/nnet3/train_dnn.py | 1 + egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 1 + egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 1 + egs/wsj/s5/steps/nnet3/train_rnn.py | 1 + 9 files changed, 10 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index e8a6020979b..1e0608525ba 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -78,7 +78,7 @@ class KaldiCommandException(Exception): """ An Exception class that throws an error string with the kaldi command that caused the error and the error string captured. 
""" - def __init__(self, command, err = None): + def __init__(self, command, err=None): Exception.__init__(self, "There was an error while running the command " "{0}\n{1}\n{2}".format(command, "-"*10, @@ -281,7 +281,7 @@ def get_number_of_jobs(alidir): num_jobs = int(open('{0}/num_jobs'.format(alidir)).readline().strip()) except (IOError, ValueError) as e: raise Exception("Exception while reading the " - "number of alignment jobs: {0}".format(e.str())) + "number of alignment jobs: {0}".format(e.errstr)) return num_jobs diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 794acd8a8d8..7c34de6d752 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -7,6 +7,7 @@ from __future__ import division import datetime import re +import sys import libs.common as common_lib diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 4ee6c396911..2acfe56727d 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -15,6 +15,7 @@ import sys import traceback +sys.path.insert(0, 'steps') import libs.nnet3.train.common as common_train_lib import libs.common as common_lib import libs.nnet3.train.chain_objf.acoustic_model as chain_lib diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 22b5fa975dd..eeab313a950 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -9,6 +9,7 @@ import imp nodes = imp.load_source('nodes', 'steps/nnet3/components.py') +sys.path.insert(0, 'steps') import libs.common as common_lib def GetArgs(): diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index 29c95dc88cd..48c13a1236c 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -12,6 +12,7 @@ import ast nodes = imp.load_source('', 'steps/nnet3/components.py') +sys.path.insert(0, 'steps') import libs.common as common_lib def GetArgs(): diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 6235f47d205..83170ea1e8e 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -15,6 +15,7 @@ import sys import traceback +sys.path.insert(0, 'steps') import libs.nnet3.train.common as common_train_lib import libs.common as common_lib import libs.nnet3.train.frame_level_objf as train_lib diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 9bb0397ad8e..b67ba8792a8 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -15,6 +15,7 @@ import sys import traceback +sys.path.insert(0, 'steps') import libs.nnet3.train.common as common_train_lib import libs.common as common_lib import libs.nnet3.train.frame_level_objf as train_lib diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 648892c3ff4..1e448ddde98 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -16,6 +16,7 @@ import sys import traceback +sys.path.insert(0, 'steps') import libs.nnet3.train.common as common_train_lib import libs.common as common_lib import libs.nnet3.train.frame_level_objf as train_lib diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 
0f7961af128..c735e9f27f6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -15,6 +15,7 @@ import sys import traceback +sys.path.insert(0, 'steps') import libs.nnet3.train.common as common_train_lib import libs.common as common_lib import libs.nnet3.train.frame_level_objf as train_lib From 6fd32daada5b9c336b5dde226e5c32460d475a16 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 21 Nov 2016 02:43:05 -0500 Subject: [PATCH 39/71] Some partial changes (don't compile yet) towards a more efficient LSTM implementation --- src/cudamatrix/cu-math.cc | 160 ++++++++++++++++++++++++++---- src/cudamatrix/cu-math.h | 125 +++++++++++++++++++++++ src/matrix/kaldi-vector.cc | 5 +- src/nnet3/nnet-simple-component.h | 123 +++++++++++++++++++++++ 4 files changed, 390 insertions(+), 23 deletions(-) diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 97757ba68dd..f4eac392772 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -29,15 +29,15 @@ namespace kaldi { namespace cu { /* - * templated functions wrapping the ANSI-C CUDA kernel functions + * templated functions wrapping the ANSI-C CUDA kernel functions */ template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, Real lr) { KALDI_ASSERT(SameDim(*weight, *grad)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); @@ -46,7 +46,7 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr, weight->Dim(), grad->Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, MatrixBase &grad2 = grad->Mat(); for(MatrixIndexT r=0; r &src, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + /* - Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 + Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK)); */ /* * Let's use blocksize 4 x 128 (512 threads/block) - * and extend the randomizable matrices to: col 4*65535, row 128*65535 + * and extend the randomizable matrices to: col 4*65535, row 128*65535 * (ie. 
max-cols:262140 (dim), max-rows:8388480 (datapoints)) */ dim3 dimBlock(4, 128); @@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase &src, cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_idx.Data(), dimtgt, dimsrc); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase &src, tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i])); } } -} +} template void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, CuMatrixBase *tgt) { - + KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(), frame_offsets.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -171,7 +171,7 @@ void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, template void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, - CuMatrixBase *tgt) { + CuMatrixBase *tgt) { KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); @@ -179,14 +179,14 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_indices.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -234,8 +234,128 @@ void Randomize(const CuMatrixBase &src, CuMatrixBase *tgt); +// not calling this Sigmoid to reduce the chance of future collisions. 
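The two scalar helpers that follow branch on the sign of the argument purely for numerical stability: Exp() is only ever evaluated on a non-positive value, so it cannot overflow for large |a|. A minimal standalone sketch of the same trick (illustrative names only, not part of this patch):

#include <cmath>
#include <cstdio>

// Overflow-safe logistic sigmoid: exp() is only applied to non-positive input.
static double StableSigmoid(double a) {
  if (a > 0.0) {
    return 1.0 / (1.0 + std::exp(-a));   // exp(-a) <= 1 here
  } else {
    double x = std::exp(a);              // a <= 0, so x <= 1
    return x / (x + 1.0);
  }
}

// Overflow-safe tanh; both branches follow from
// tanh(a) = (1 - exp(-2a)) / (1 + exp(-2a)) = (exp(2a) - 1) / (exp(2a) + 1).
static double StableTanh(double a) {
  if (a > 0.0) {
    double inv_expa = std::exp(-a);
    return -1.0 + 2.0 / (1.0 + inv_expa * inv_expa);
  } else {
    double expa = std::exp(a);
    return 1.0 - 2.0 / (1.0 + expa * expa);
  }
}

int main() {
  // exp(1000.0) overflows to inf in the naive formulas; these forms avoid that.
  std::printf("%g %g\n", StableSigmoid(-1000.0), StableTanh(1000.0));  // 0 1
  return 0;
}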
+static inline BaseFloat ScalarSigmoid(BaseFloat a) { + if (a > 0.0) { + return 1.0 / (1.0 + Exp(-a)); + } else { + Real x = Exp(a); + return x / (x + 1.0); + } +} + +static inline BaseFloat ScalarTanh(BaseFloat a) { + if (a > 0.0) { + Real inv_expa = Exp(-a); + return -1.0 + 2.0 / (1.0 + inv_expa * inv_expa); + } else { + Real expa = Exp(a); + return = 1.0 - 2.0 / (1.0 + expa * expa); + } +} + + +void ComputeLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + CuMatrixBase *output) { + int32 num_rows = input.NumRows(), + cell_dim = input.NumCols() / 5; + KALDI_ASSERT(output->NumRows() == num_rows && + input.NumCols() % 5 == 0 && + params.NumRows() == 3 && params.NumCols() == cell_dim && + output->NumCols() == 2 * cell_dim); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + KALDI_ERR << "CUDA version not implemented"; + } else +#endif + { + const MatrixBase &input_mat = input.Mat(), + ¶ms_mat = params.Mat(); + MatrixBase &output_mat = *output; + const BaseFloat *params_data = params_mat.Data(); + int32 params_stride = params_mat.Stride(); + for (int32 r = 0; r < num_rows; r++) { + const BaseFloat *input_row = input_mat.RowData(r); + BaseFloat *output_row = output_mat.RowData(r); + for (int32 c = 0; c < cell_dim; c++) { + BaseFloat i_part = input_row[c], f_part = input_row[c + cell_dim], + c_part = input_row[c + 2 * cell_dim], + o_part = input_row[c + 3 * cell_dim], + c_prev = input_row[c + 4 * cell_dim], + w_ic = params_data[c], w_fc = params_data[c + params_stride], + w_oc = params_data[c + params_stride * 2]; + BaseFloat i_t = ScalarSigmoid(i_part + w_ic * c_prev), + f_t = ScalarSigmoid(f_part + w_fc * c_prev), + c_t = f_t * c_prev + i_t * Tanh(c_part), + o_t = ScalarSigmoid(o_part + w_oc * c_t), + m_t = o_t * ScalarTanh(c_t); + output_row[c] = c_t; + output_row[c + cell_dim] = m_t; + } + } + } +} + + +void BackpropLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + const CuMatrixBase &output_deriv, + const CuMatrixBase &deriv_sum_in, + const CuVectorBase &self_repair_config, + double count_in, + CuMatrixBase *input_deriv, + CuMatrixBase *params_deriv, + CuMatrixBase *value_sum_out, + CuMatrixBase *deriv_sum_out, + CuMatrixBase *self_repair_sum_out) { + int32 num_rows = input.NumRows(), + cell_dim = input.NumCols() / 5; + KALDI_ASSERT(output_deriv.NumRows() == num_rows && + input.NumCols() % 5 == 0 && + params.NumRows() == 3 && params.NumCols() == cell_dim && + output_deriv.NumCols() == 2 * cell_dim && + deriv_sum_in.NumRows() == 5 && deriv_sum_in.NumCols() == cell_dim + && self_repair_config.Dim() == 10 && count_in >= 0); + if (input_deriv != NULL) { + KALDI_ASSERT(SameDim(input, *input_deriv)); + } + if (params_deriv == NULL) { + KALDI_ASSERT(value_sum_out == NULL && deriv_sum_out == NULL && + self_repair_sum_out == NULL); + } else { + KALDI_ASSERT(value_sum_out != NULL && deriv_sum_out != NULL && + self_repair_sum_out != NULL && SameDim(params, *params_deriv) && + value_sum_out->NumRows() == 5 && + value_sum_out->NumCols() == cell_dim && + SameDim(* ... 
+ // HERE + + KALDI_ASSERT(input.NumRows() == output->NumRows() && + input.NumCols() % 5 == 0 && + output->NumCols() == 2 * (input.NumCols() / 5)); + int32 num_rows = input.NumRows(), + cell_dim = input.NumCols() / 5; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + KALDI_ERR << "CUDA version not implemented"; + // notes for Shiyin: + // You could do an 'easy' initial version where we have have one thread per dimension, + // and you can try optimizing this later on. + // Since the cell-dim is usually quite large, like 1024, this is fairly reasonable. + // But up to you. + } else +#endif + { + + } +} + + + } //namespace cu } //namespace kaldi - diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 65a4c0c4af3..c0626d124c0 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -78,6 +78,131 @@ void Group2norm(const CuMatrixBase &src, CuMatrixBase *dest, int32 group_stride); +/** + this is a special-purpose function used by class LstmNonlinearityComponent, + to do its forward propagation. It computes the core part of the LSTM nonlinearity. + Refer to class LstmNonlinearityComponent in ../nnet3/nnet-simple-component.h + for more context. + + @param [in] input A matrix, of dimension N by 5C (i.e. its num-cols must be + a multiple of 5). The column-space is interpreted as 5 + consecutive blocks, each of dimension C, which we name: + (i_part, f_part, c_part, o_part, c_{t-1}). + @param [in] params A matrix, of dimension 3 by C, with rows containing the three + diagonal parameter matrices used in LSTMs, namely + w_{ic}, w_{fc} and w_{oc}. + @param [out] output A matrix, of dimension N by 2C. The quantities c_t and m_t + respectively are put there (in two blocks of column-dimension C), + according to the following equations: + + i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) + f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) + c_t = f_t*c_{t-1} + i_t * Tanh(c_part) + o_t = Sigmoid(o_part + w_{oc}*c_t) + m_t = o_t * Tanh(c_t) + + +*/ +void ComputeLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + CuMatrixBase *output); + + +/** + This function does the 'backward' pass corresponding to the function + ComputeLstmNonlinearity. It's a little more complicated than you might + expect because of the 'self-repair' mechanism that we use to prevent the + sigmoid and tanh nonlinearities oversaturating, and because of the + average-activation and average-derivative stats that we store for these + nonlinearites (these stats are used both to control the self-repair + mechanism, and for diagnostic purposes). + + Because the forward pass computes various intermediate values that are not + output, this function actually has to do the same computations as the + forward pass before it actually does the backprop. + + + @param [in] input The same as in ComputeLstmNonlinearity(). + A matrix, of dimension N by 5C (i.e. its num-cols must be + a multiple of 5). The column-space is interpreted as 5 + consecutive blocks, each of dimension C, which we name: + (i_part, f_part, c_part, o_part, c_{t-1}). + @param [in] params The same as in ComputeLstmNonlinearity(). + A matrix, of dimension 3 by C, with rows containing the three + diagonal parameter matrices used in LSTMs, namely + w_{ic}, w_{fc} and w_{oc}. + @param [out] output_deriv + A matrix, of dimension N by 2C, containing the derivative of the + objective function we're backpropagating, w.r.t. the quantities + c_t and m_t (in two blocks of column-dimension C). 
+ @param [in] deriv_sum_in + This is used in the self-repair code to identify oversaturated + nonlinearities. It is a matrix, of dimension 5 by C, corresponding + to the totals of the derivatives of the 5 sigmoid and tanh + nonlinearities, in they order they appear in the equations + in the documentation of ComputeLstmNonlinearity() Rspectively, + they appear in the equations for (i_t, f_t, c_t, o_t, m_t). + This will be divided by 'count_in' to get the average derivative + value so far, for each of the nonlinearities. + @param [in] self_repair_config + A vector of dimension 10, containing the configuration of the self-repair + to be used for the 5 nonlinearities. The first 5 elements are the + self_repair_lower_threshold values (typically 0.05 for sigmoid and 0.2 + for tanh), and the next 5 elements are the corresponding + self-repair-scales (typically 10^-5). + @param [in] count_in The data-count that corresponds to the stats in 'deriv_sum_in' + at entry to the function. This function should tolerate the count + being zero (in that case, it is free to do the self-repair or not, + as this should only happen on the 1st minibatch of each training job). + @param [out] input_deriv + May be NULL; if not, this function writes, to this + location, the backpropagated derivative of the objective + function w.r.t. the 'input' matrix. This matrix should + have the same dimension as 'input' i.e. N by 5C. In + addition to the regular backpropagated derivative, the + output will include small values relating to 'self-repair'. + @param [out] params_deriv + May be NULL; if not, this is where this function *writes* + [not adds] the backpropagated derivative of the objective + function w.r.t. 'params'; it should have the same dimension + as 'params' (3 by C). (This matrix will then be processed + by the natural gradient code and added to the appropriate + copy of the parameter matrix, outside this function). + @param [out] value_sum_out + Must be NULL if params_deriv is NULL; if not, a matrix of + dimension 5 by C. This function *adds* to this location + the total value of each of the sigmoid/tanh nonlinearities + that it computes (this is for diagnostic purposes). + @param [out] deriv_sum_out + Must be NULL if params_deriv is NULL; if not, a matrix of + dimension 5 by C; this function *adds* to this location the + total of the derivative of each of the sigmoid/tanh + nonlinearities that it computes (this is for diagnostic + purposes and to control the self-repair). This function + should tolerate the case when 'deriv_sum_out' points to the + same data as 'deriv_sum_in'. + @param [out] self_repair_sum_out + Must be NULL if params_deriv is NULL; if not, a matrix of + dimension 5 by C; this function *writes* to this location + the sum of the number of times the self-repair code was + activated (integer values 0 <= k <= N). This will be + processed outside this function into self-repair stats for + diagnostics. 
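To make the self-repair convention above concrete: for a given nonlinearity k and cell c, self-repair activates only when the running average derivative (deriv_sum_in divided by the count) falls below the configured lower threshold, and the extra term added to the backpropagated derivative of that unit's input is -scale * (2*sigmoid(x) - 1) for a sigmoid and -scale * tanh(x) for a tanh. A small standalone sketch with illustrative names (not Kaldi code):

#include <cstdio>

// 'deriv_avg' is deriv_sum_in(k, c) / count; 'threshold' and 'repair_scale'
// are self_repair_config(k) and self_repair_config(k + 5); 'value' is the
// nonlinearity's output for this frame (sigmoid(x) or tanh(x)).
static double SigmoidSelfRepairTerm(double deriv_avg, double threshold,
                                    double repair_scale, double value) {
  double scale = (deriv_avg < threshold ? repair_scale : 0.0);
  return -scale * (2.0 * value - 1.0);   // pushes the unit away from saturation
}

static double TanhSelfRepairTerm(double deriv_avg, double threshold,
                                 double repair_scale, double value) {
  double scale = (deriv_avg < threshold ? repair_scale : 0.0);
  return -scale * value;
}

int main() {
  // A sigmoid whose average derivative is 0.01 (below a 0.05 threshold) and
  // whose current output is 0.99 gets a small negative nudge; a healthy unit
  // with average derivative 0.2 gets exactly zero.
  std::printf("%g %g %g\n",
              SigmoidSelfRepairTerm(0.01, 0.05, 1.0e-05, 0.99),
              SigmoidSelfRepairTerm(0.20, 0.05, 1.0e-05, 0.99),
              TanhSelfRepairTerm(0.10, 0.20, 1.0e-05, -0.95));
  return 0;
}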
+*/ + +void BackpropLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + const CuMatrixBase &output_deriv, + const CuMatrixBase &deriv_sum_in, + const CuVectorBase &self_repair_config, + double count_in, + CuMatrixBase *input_deriv, + CuMatrixBase *params_deriv, + CuMatrixBase *value_sum_out, + CuMatrixBase *deriv_sum_out, + CuMatrixBase *self_repair_sum_out); + + diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 666e4fad7c5..851db1a1d2f 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -882,8 +882,8 @@ void VectorBase::Tanh(const VectorBase &src) { Real inv_expx = Exp(-x); x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx); } else { - Real inv_expx = Exp(x); - x = 1.0 - 2.0 / (1.0 + inv_expx * inv_expx); + Real expx = Exp(x); + x = 1.0 - 2.0 / (1.0 + expx * expx); } data_[i] = x; } @@ -1324,4 +1324,3 @@ template class VectorBase; template class VectorBase; } // namespace kaldi - diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index a4d8f6fbacd..da45791a065 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1655,6 +1655,129 @@ class ConvolutionComponent: public UpdatableComponent { }; +// LstmNonlinearityComponent is a component that implements part of an LSTM, by +// combining together the sigmoids and tanh's, plus some diagonal terms, into +// a single block. +// We will refer to the LSTM formulation used in +// +// Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" +// by H. Sak et al, +// http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf. +// +// Suppose the cell dimension is C. Then outside this component, we compute +// the 4 * C-dimensional quantity consisting of 4 blocks as follows, by a single +// matrix multiplication: +// +// i_part = W_{ix} x_t + W_{im} m_{t-1} + b_i +// f_part = W_{fx} x_t + W_{fm} m_{t-1} + b_f +// c_part = W_{cx} x_t + W_{cm} m_{t-1} + b_c +// o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o +// +// The part of the computation that takes place in this component is as follows. +// Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and +// c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t. +// +// To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t). +// +// +// This component has parameters, 3C of them in total: the diagonal matrices w_i, w_f +// and w_o. +// +// +// In the forward pass (Propagate), this component computes the following: +// +// i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) (1) +// f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) (2) +// c_t = f_t*c_{t-1} + i_t * Tanh(c_part) (3) +// o_t = Sigmoid(o_part + w_{oc}*c_t) (4) +// m_t = o_t * Tanh(c_t) (5) +// # note: the outputs are just c_t and m_t. +// +// The backprop is as you would think, but for the "self-repair" we need to pass +// in additional vectors (of the same dim as the parameters of the layer) that +// dictate whether or not we add an additional term to the backpropagated +// derivatives. (This term helps force the input to the nonlinearities into the +// range where the derivatives are not too small). +// +// This component stores stats of the same form as are normally stored by the +// StoreStats() functions for the sigmoid and tanh units, i.e. averages of the +// activations and derivatives, but this is done inside the Backprop() functions. 
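As a concrete illustration of the interface just described, the sketch below (plain C++ with made-up names, not Kaldi code) shows how a surrounding LSTM layer would assemble the component's 5C-dimensional input for one frame: a single affine transform of the concatenation [x_t, m_{t-1}] yields the four C-dimensional blocks i_part, f_part, c_part and o_part, and c_{t-1} is appended as the fifth block; the component then returns (c_t, m_t).

#include <vector>

// W has 4*C rows and (input_dim + C) columns; b has 4*C elements.
// The returned vector is (i_part, f_part, c_part, o_part, c_{t-1}).
std::vector<double> AssembleLstmNonlinearityInput(
    const std::vector<double> &x_t,                    // dim: input_dim
    const std::vector<double> &m_prev,                 // dim: C
    const std::vector<double> &c_prev,                 // dim: C
    const std::vector<std::vector<double> > &W,        // 4*C x (input_dim + C)
    const std::vector<double> &b) {                    // dim: 4*C
  int C = c_prev.size();
  std::vector<double> in(5 * C);
  for (int i = 0; i < 4 * C; i++) {
    double sum = b[i];
    for (size_t j = 0; j < x_t.size(); j++)
      sum += W[i][j] * x_t[j];
    for (int j = 0; j < C; j++)
      sum += W[i][x_t.size() + j] * m_prev[j];
    in[i] = sum;
  }
  for (int j = 0; j < C; j++)
    in[4 * C + j] = c_prev[j];
  return in;
}

int main() {
  int input_dim = 2, C = 3;
  std::vector<std::vector<double> > W(4 * C,
                                      std::vector<double>(input_dim + C, 0.01));
  std::vector<double> b(4 * C, 0.0), x_t(input_dim, 1.0),
                      m_prev(C, 0.0), c_prev(C, 0.5);
  std::vector<double> in = AssembleLstmNonlinearityInput(x_t, m_prev, c_prev, W, b);
  return in.size() == static_cast<size_t>(5 * C) ? 0 : 1;
}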
+// [the StoreStats() functions don't take the input data as an argument, so +// storing this data that way is impossible, and anyway it's more efficient to +// do it as part of backprop.] +class LstmNonlinearityComponent: public UpdatableComponent { + public: + + virtual std::string Info() const; + + virtual void InitFromConfig(ConfigLine *cfl); + + NaturalGradientPerElementScaleComponent() { } // use Init to really initialize. + virtual std::string Type() const { + return "NaturalGradientPerElementScaleComponent"; + } + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions that are specific to this class: + explicit NaturalGradientPerElementScaleComponent( + const NaturalGradientPerElementScaleComponent &other); + + void Init(int32 dim, BaseFloat param_mean, + BaseFloat param_stddev, int32 rank, int32 update_period, + BaseFloat num_samples_history, BaseFloat alpha, + BaseFloat max_change_per_minibatch); + void Init(std::string vector_filename, + int32 rank, int32 update_period, BaseFloat num_samples_history, + BaseFloat alpha, BaseFloat max_change_per_minibatch); + + private: + + + // Notation: C is the cell dimension; it equals params_.NumCols(). + + // The dimension of the parameter matrix is (3 x C); + // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. + CuMatrix params_; + + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in + // equations (1) through (5), this is the sum of the values of the nonliearities + // (used for diagnostics only). It is comparable to value_sum_ vector + // in base-class NonlinearComponent. + // Note: to save time and simplify the code, when using GPU we don't always + // store stats for all of the members of the minibatch, just a subset. + CuMatrix value_sum_; + + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in + // equations (1) through (5), this is the sum of the derivatives of the + // nonliearities (used for diagnostics and to control self-repair). It is + // comparable to the deriv_sum_ vector in base-class + // NonlinearComponent. + // Note: to save time and simplify the code, when using GPU we don't always + // store stats for all of the members of the minibatch, just a subset. + CuMatrix deriv_sum_; + + // The total count (number of frames) corresponding to the stats in value_sum_ + // and deriv_sum_. + double count_; + + + // Preconditioner for the parameters of this component [operates in the space + // of dimension C]. + // The preconditioner stores its own configuration values; we write and read + // these, but not the preconditioner object itself. + OnlineNaturalGradient preconditioner_; + + const LstmNonlinearityComponent &operator + = (const LstmNonlinearityComponent &other); // Disallow. 
+}; + + + + /* * MaxPoolingComponent : * Maxpooling component was firstly used in ConvNet for selecting an From 5470ad12af36269aba38afc5613ae45e7fc47b86 Mon Sep 17 00:00:00 2001 From: Shiyin Kang Date: Tue, 22 Nov 2016 00:56:31 +0800 Subject: [PATCH 40/71] cuda kernel for lstm nonlinearity unit test pass unit and speed test looks better --- src/cudamatrix/cu-kernels-ansi.h | 11 ++ src/cudamatrix/cu-kernels.cu | 75 +++++++++++ src/cudamatrix/cu-kernels.h | 17 +++ src/cudamatrix/cu-math-test.cc | 53 +++++++- src/cudamatrix/cu-math.cc | 213 +++++++++++++++++++------------ src/cudamatrix/cu-math.h | 13 +- 6 files changed, 291 insertions(+), 91 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 4642048989e..b7571383193 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -641,6 +641,17 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, MatrixDim mat1_dim, int mat2_stride, int mask_stride); +void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, + const int in_stride, const double* params, + const int params_stride, const int out_stride, + const int cell_dim, const int num_rows, + double* out); +void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, + const int in_stride, const float* params, + const int params_stride, const int out_stride, + const int cell_dim, const int num_rows, + float* out); + } // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index bddd1227441..614f8ec4cc6 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2630,6 +2630,64 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, } +/** + this function computes the core part of the LSTM nonlinearity. + @param [in] in A matrix, of dimension num_rows by 5*cell_dim + (i.e. its num-cols must be a multiple of 5). + The column-space is interpreted as 5 + consecutive blocks, each of dimension cell_dim, + which we name: + (i_part, f_part, c_part, o_part, c_{t-1}). + @param [in] params A matrix, of dimension 3 by cell_dim, + with rows containing the 3 diagonal parameter matrices + used in LSTMs, namely + w_{ic}, w_{fc} and w_{oc}. + @param [out] out A matrix, of dimension num_rows by 2*cell_dim. + The quantities c_t and m_t respectively are put there + (in two blocks of column-dimension cell_dim), + according to the following equations: + + i_t = Sigmoid(i_part + w_{ic}*c_{t-1}) + f_t = Sigmoid(f_part + w_{fc}*c_{t-1}) + c_t = f_t*c_{t-1} + i_t * Tanh(c_part) + o_t = Sigmoid(o_part + w_{oc}*c_t) + m_t = o_t * Tanh(c_t) + +We use 1D thread block with CU1DBLOCK threads. +It works best when cell_dim is a multiple of CU1DBLOCK. +We use 1d Grid. Each block is working on one row of the in and out matrices. 
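Before the kernel itself, a serial reference in plain C++ may help: the outer loop over rows corresponds to the 1-D grid (one block per row) and the inner loop over cells corresponds to the threads of a block striding through cell_dim. This is only an illustration with made-up names; the authoritative code is the kernel below and the CPU implementation in cu-math.cc.

#include <cmath>
#include <cstdio>
#include <vector>

static double Sigmoid(double a) { return 1.0 / (1.0 + std::exp(-a)); }

// Row-major reference: 'in' is num_rows x 5*cell_dim, 'params' is 3 x cell_dim,
// 'out' is num_rows x 2*cell_dim.
void LstmNonlinearityRef(const std::vector<double> &in,
                         const std::vector<double> &params,
                         int num_rows, int cell_dim,
                         std::vector<double> *out) {
  for (int r = 0; r < num_rows; r++) {           // grid: one block per row
    const double *row    = &in[r * 5 * cell_dim];
    const double *i_part = row,                *f_part = row + cell_dim,
                 *c_part = row + 2 * cell_dim, *o_part = row + 3 * cell_dim,
                 *c_prev = row + 4 * cell_dim;
    const double *w_ic = &params[0], *w_fc = &params[cell_dim],
                 *w_oc = &params[2 * cell_dim];
    double *c_t = &(*out)[r * 2 * cell_dim], *m_t = c_t + cell_dim;
    for (int c = 0; c < cell_dim; c++) {         // threads striding over cells
      double i_t = Sigmoid(i_part[c] + w_ic[c] * c_prev[c]);
      double f_t = Sigmoid(f_part[c] + w_fc[c] * c_prev[c]);
      double ct  = f_t * c_prev[c] + i_t * std::tanh(c_part[c]);
      double o_t = Sigmoid(o_part[c] + w_oc[c] * ct);
      c_t[c] = ct;
      m_t[c] = o_t * std::tanh(ct);
    }
  }
}

int main() {
  int num_rows = 2, cell_dim = 4;
  std::vector<double> in(num_rows * 5 * cell_dim, 0.1),
                      params(3 * cell_dim, 0.5),
                      out(num_rows * 2 * cell_dim);
  LstmNonlinearityRef(in, params, num_rows, cell_dim, &out);
  std::printf("c_t[0] = %g, m_t[0] = %g\n", out[0], out[cell_dim]);
  return 0;
}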
+*/ +template +__global__ +static void _lstm_nonlinearity(const Real* in, const int in_stride, + const Real* params, const int params_stride, + const int out_stride, const int cell_dim, + const int num_rows, Real* out) { + const int tid = threadIdx.x; + const int i = blockIdx.x; + const Real* i_part = in + i * in_stride; + const Real* f_part = in + i * in_stride + cell_dim; + const Real* c_part = in + i * in_stride + cell_dim * 2; + const Real* o_part = in + i * in_stride + cell_dim * 3; + const Real* c_tm1 = in + i * in_stride + cell_dim * 4; + const Real* w_ic = params; + const Real* w_fc = params + params_stride; + const Real* w_oc = params + params_stride * 2; + Real* c_t = out + i * out_stride; + Real* m_t = out + i * out_stride + cell_dim; + + for (int j = tid; j < cell_dim; j += CU1DBLOCK) { + Real c_tm1_j = c_tm1[j]; + Real i_t_j = Real(1) / (Real(1) + exp(-i_part[j] - w_ic[j] * c_tm1_j)); + Real f_t_j = Real(1) / (Real(1) + exp(-f_part[j] - w_fc[j] * c_tm1_j)); + Real c_t_j = f_t_j * c_tm1_j + i_t_j * tanh(c_part[j]); + Real o_t_j = Real(1) / (Real(1) + exp(-o_part[j] - w_oc[j] * c_t_j)); + c_t[j] = c_t_j; + m_t[j] = o_t_j * tanh(c_t_j); + } +} + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels */ @@ -4031,3 +4089,20 @@ void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, _trace_mat_smat_trans<<>>(mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } +void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, + const int in_stride, const double* params, + const int params_stride, const int out_stride, + const int cell_dim, const int num_rows, + double* out) { + _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, + out_stride, cell_dim, num_rows, out); +} +void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, + const int in_stride, const float* params, + const int params_stride, const int out_stride, + const int cell_dim, const int num_rows, + float* out) { + _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, + out_stride, cell_dim, num_rows, out); +} + diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a6e81db5d6c..c8912b4ebfc 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1242,6 +1242,23 @@ inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, double alpha, return cublasDscal_v2(handle, n, &alpha, x, incx); } +inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, + const int in_stride, const double* params, + const int params_stride, + const int out_stride, const int cell_dim, + const int num_rows, double* out) { + cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, + out_stride, cell_dim, num_rows, out); +} +inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, + const int in_stride, const float* params, + const int params_stride, + const int out_stride, const int cell_dim, + const int num_rows, float* out) { + cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, + out_stride, cell_dim, num_rows, out); +} + } // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 2e096e76ae8..f205d1e3a8a 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -139,16 +139,61 @@ static void UnitTestCuMathSplice() { } } +template +static void UnitTestCuMathComputeLstmNonlinearity() { + for (int i = 0; i < 3; i++) { + int32 num_rows = 1 + Rand() % 100; + int32 
cell_dim = 1 + Rand() % 2000; + Matrix Hinput(num_rows, 5 * cell_dim); + Matrix Hparams(3, cell_dim); + Matrix Houtput(num_rows, 2 * cell_dim); + Hinput.SetRandn(); + Hparams.SetRandn(); + + CuMatrix Dinput(Hinput); + CuMatrix Dparams(Hparams); + CuMatrix Doutput(Houtput); + + cu::CpuComputeLstmNonlinearity(Hinput, Hparams, &Houtput); + cu::ComputeLstmNonlinearity(Dinput, Dparams, &Doutput); + + Matrix HDoutput(Doutput); + AssertEqual(Houtput, HDoutput); + } + + for (int i = 16; i <= 1024; i *= 2) { + BaseFloat time_in_secs = 0.025; + int32 num_rows = i; + int32 cell_dim = i; + CuMatrix input(num_rows, 5 * cell_dim); + CuMatrix params(3, cell_dim); + CuMatrix output(num_rows, 2 * cell_dim); + input.SetRandn(); + params.SetRandn(); + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) + cu::ComputeLstmNonlinearity(input, params, &output); + + BaseFloat gflops = ((BaseFloat) i * i * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For ComputeLstmNonlinearity" + << (sizeof(Real)==8 ? "" : "") << ", for dim = " + << i << ", speed was " << gflops << " gigaflops"; + } +} + template void CudaMathUnitTest() { - #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().DoublePrecisionSupported()) - #endif +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().DoublePrecisionSupported()) +#endif + + UnitTestCuMathComputeLstmNonlinearity(); UnitTestCuMathRandomize(); UnitTestCuMathSplice(); UnitTestCuMathCopy(); } - } // namespace kaldi diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index f4eac392772..f63ce754ece 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -235,69 +235,116 @@ void Randomize(const CuMatrixBase &src, // not calling this Sigmoid to reduce the chance of future collisions. -static inline BaseFloat ScalarSigmoid(BaseFloat a) { - if (a > 0.0) { - return 1.0 / (1.0 + Exp(-a)); +template +static inline Real ScalarSigmoid(Real a) { + if (a > Real(0)) { + return Real(1) / (Real(1) + Exp(-a)); } else { Real x = Exp(a); - return x / (x + 1.0); + return x / (x + Real(1)); } } -static inline BaseFloat ScalarTanh(BaseFloat a) { - if (a > 0.0) { +template +static inline Real ScalarTanh(Real a) { + if (a > Real(0)) { Real inv_expa = Exp(-a); - return -1.0 + 2.0 / (1.0 + inv_expa * inv_expa); + return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa); } else { Real expa = Exp(a); - return = 1.0 - 2.0 / (1.0 + expa * expa); + return Real(1) - Real(2) / (Real(1) + expa * expa); } } +template +void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, + const MatrixBase ¶ms_mat, + MatrixBase *output) { + int32 num_rows = input_mat.NumRows(); + int32 cell_dim = input_mat.NumCols() / 5; + KALDI_ASSERT(output->NumRows() == num_rows); + KALDI_ASSERT(input_mat.NumCols() % 5 == 0); + KALDI_ASSERT(params_mat.NumRows() == 3); + KALDI_ASSERT(params_mat.NumCols() == cell_dim); + KALDI_ASSERT(output->NumCols() == 2 * cell_dim); + + MatrixBase &output_mat = *output; + const Real *params_data = params_mat.Data(); + int32 params_stride = params_mat.Stride(); + for (int32 r = 0; r < num_rows; r++) { + const Real *input_row = input_mat.RowData(r); + Real *output_row = output_mat.RowData(r); + for (int32 c = 0; c < cell_dim; c++) { + Real i_part = input_row[c]; + Real f_part = input_row[c + cell_dim]; + Real c_part = input_row[c + 2 * cell_dim]; + Real o_part = input_row[c + 3 * cell_dim]; + Real c_prev = input_row[c + 4 * cell_dim]; + Real w_ic = params_data[c]; + Real w_fc = params_data[c + params_stride]; + Real w_oc = params_data[c + params_stride * 
2]; + Real i_t = ScalarSigmoid(i_part + w_ic * c_prev); + Real f_t = ScalarSigmoid(f_part + w_fc * c_prev); + Real c_t = f_t * c_prev + i_t * ScalarTanh(c_part); + Real o_t = ScalarSigmoid(o_part + w_oc * c_t); + Real m_t = o_t * ScalarTanh(c_t); + output_row[c] = c_t; + output_row[c + cell_dim] = m_t; + } + } +} -void ComputeLstmNonlinearity(const CuMatrixBase &input, - const CuMatrixBase ¶ms, - CuMatrixBase *output) { - int32 num_rows = input.NumRows(), - cell_dim = input.NumCols() / 5; - KALDI_ASSERT(output->NumRows() == num_rows && - input.NumCols() % 5 == 0 && - params.NumRows() == 3 && params.NumCols() == cell_dim && - output->NumCols() == 2 * cell_dim); +template +void ComputeLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + CuMatrixBase *output) { + int32 num_rows = input.NumRows(); + int32 cell_dim = input.NumCols() / 5; + KALDI_ASSERT(output->NumRows() == num_rows); + KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(params.NumRows() == 3); + KALDI_ASSERT(params.NumCols() == cell_dim); + KALDI_ASSERT(output->NumCols() == 2 * cell_dim); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - KALDI_ERR << "CUDA version not implemented"; + Timer tim; + + // Each thread block is working on 1 row of the data. + // It's best that cell dim is a multiple fo CU1DBLOCK + dim3 dimBlock(CU1DBLOCK); + dim3 dimGrid(num_rows); + + cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(), + params.Data(), params.Stride(), output->Stride(), + cell_dim, num_rows, output->Data()); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { - const MatrixBase &input_mat = input.Mat(), - ¶ms_mat = params.Mat(); - MatrixBase &output_mat = *output; - const BaseFloat *params_data = params_mat.Data(); - int32 params_stride = params_mat.Stride(); - for (int32 r = 0; r < num_rows; r++) { - const BaseFloat *input_row = input_mat.RowData(r); - BaseFloat *output_row = output_mat.RowData(r); - for (int32 c = 0; c < cell_dim; c++) { - BaseFloat i_part = input_row[c], f_part = input_row[c + cell_dim], - c_part = input_row[c + 2 * cell_dim], - o_part = input_row[c + 3 * cell_dim], - c_prev = input_row[c + 4 * cell_dim], - w_ic = params_data[c], w_fc = params_data[c + params_stride], - w_oc = params_data[c + params_stride * 2]; - BaseFloat i_t = ScalarSigmoid(i_part + w_ic * c_prev), - f_t = ScalarSigmoid(f_part + w_fc * c_prev), - c_t = f_t * c_prev + i_t * Tanh(c_part), - o_t = ScalarSigmoid(o_part + w_oc * c_t), - m_t = o_t * ScalarTanh(c_t); - output_row[c] = c_t; - output_row[c + cell_dim] = m_t; - } - } + CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat()); } } +template +void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, + const MatrixBase ¶ms_mat, + MatrixBase *output); +template +void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, + const MatrixBase ¶ms_mat, + MatrixBase *output); +template +void ComputeLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + CuMatrixBase *output); +template +void ComputeLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + CuMatrixBase *output); + void BackpropLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, @@ -310,47 +357,47 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, CuMatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(), - cell_dim = input.NumCols() / 5; - 
KALDI_ASSERT(output_deriv.NumRows() == num_rows && - input.NumCols() % 5 == 0 && - params.NumRows() == 3 && params.NumCols() == cell_dim && - output_deriv.NumCols() == 2 * cell_dim && - deriv_sum_in.NumRows() == 5 && deriv_sum_in.NumCols() == cell_dim - && self_repair_config.Dim() == 10 && count_in >= 0); - if (input_deriv != NULL) { - KALDI_ASSERT(SameDim(input, *input_deriv)); - } - if (params_deriv == NULL) { - KALDI_ASSERT(value_sum_out == NULL && deriv_sum_out == NULL && - self_repair_sum_out == NULL); - } else { - KALDI_ASSERT(value_sum_out != NULL && deriv_sum_out != NULL && - self_repair_sum_out != NULL && SameDim(params, *params_deriv) && - value_sum_out->NumRows() == 5 && - value_sum_out->NumCols() == cell_dim && - SameDim(* ... - // HERE - - KALDI_ASSERT(input.NumRows() == output->NumRows() && - input.NumCols() % 5 == 0 && - output->NumCols() == 2 * (input.NumCols() / 5)); - int32 num_rows = input.NumRows(), - cell_dim = input.NumCols() / 5; - -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - KALDI_ERR << "CUDA version not implemented"; - // notes for Shiyin: - // You could do an 'easy' initial version where we have have one thread per dimension, - // and you can try optimizing this later on. - // Since the cell-dim is usually quite large, like 1024, this is fairly reasonable. - // But up to you. - } else -#endif - { - - } +// int32 num_rows = input.NumRows(), +// cell_dim = input.NumCols() / 5; +// KALDI_ASSERT(output_deriv.NumRows() == num_rows && +// input.NumCols() % 5 == 0 && +// params.NumRows() == 3 && params.NumCols() == cell_dim && +// output_deriv.NumCols() == 2 * cell_dim && +// deriv_sum_in.NumRows() == 5 && deriv_sum_in.NumCols() == cell_dim +// && self_repair_config.Dim() == 10 && count_in >= 0); +// if (input_deriv != NULL) { +// KALDI_ASSERT(SameDim(input, *input_deriv)); +// } +// if (params_deriv == NULL) { +// KALDI_ASSERT(value_sum_out == NULL && deriv_sum_out == NULL && +// self_repair_sum_out == NULL); +// } else { +// KALDI_ASSERT(value_sum_out != NULL && deriv_sum_out != NULL && +// self_repair_sum_out != NULL && SameDim(params, *params_deriv) && +// value_sum_out->NumRows() == 5 && +// value_sum_out->NumCols() == cell_dim && +// SameDim(* ... +// // HERE +// +// KALDI_ASSERT(input.NumRows() == output->NumRows() && +// input.NumCols() % 5 == 0 && +// output->NumCols() == 2 * (input.NumCols() / 5)); +// int32 num_rows = input.NumRows(), +// cell_dim = input.NumCols() / 5; +// +//#if HAVE_CUDA == 1 +// if (CuDevice::Instantiate().Enabled()) { +// KALDI_ERR << "CUDA version not implemented"; +// // notes for Shiyin: +// // You could do an 'easy' initial version where we have have one thread per dimension, +// // and you can try optimizing this later on. +// // Since the cell-dim is usually quite large, like 1024, this is fairly reasonable. +// // But up to you. 
+// } else +//#endif +// { +// +// } } diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index c0626d124c0..eb741f7bb8d 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -102,10 +102,15 @@ void Group2norm(const CuMatrixBase &src, m_t = o_t * Tanh(c_t) -*/ -void ComputeLstmNonlinearity(const CuMatrixBase &input, - const CuMatrixBase ¶ms, - CuMatrixBase *output); + */ +template +void CpuComputeLstmNonlinearity(const MatrixBase &input, + const MatrixBase ¶ms, + MatrixBase *output); +template +void ComputeLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + CuMatrixBase *output); /** From e56552b43695100d06bb04aded65f45e7a872d71 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 22 Nov 2016 00:36:00 -0500 Subject: [PATCH 41/71] Adding code for the backprop of LSTM. Add test of derivatives of LSTM. cpu function refactor the cpu version; pass cpu test --- src/cudamatrix/cu-math-test.cc | 137 ++++++++++-- src/cudamatrix/cu-math.cc | 367 ++++++++++++++++++++++++++++----- src/cudamatrix/cu-math.h | 31 ++- src/cudamatrix/cu-matrix.cc | 5 +- src/cudamatrix/cu-vector.h | 69 ++++--- 5 files changed, 498 insertions(+), 111 deletions(-) diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index f205d1e3a8a..243e696187b 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -41,15 +41,15 @@ namespace kaldi { /* * Unit tests */ - -template + +template static void UnitTestCuMathRandomize() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; CuMatrix src(M, N); CuMatrix tgt(M, N); CuArray copy_from_idx; - src.SetRandn(); + src.SetRandn(); int32 n_rows = src.NumRows(); int32 n_columns = src.NumCols(); std::vector copy_from_idx_vec; @@ -59,7 +59,7 @@ static void UnitTestCuMathRandomize() { } copy_from_idx.CopyFromVec(copy_from_idx_vec); cu::Randomize(src, copy_from_idx, &tgt); - + for (int32 i = 0; i < n_rows; i++) { for (int32 j = 0; j < n_columns; j++) { Real src_val = src(copy_from_idx_vec.at(i), j); @@ -70,14 +70,14 @@ static void UnitTestCuMathRandomize() { } -template +template static void UnitTestCuMathCopy() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; CuMatrix src(M, N); CuMatrix tgt(M, N); CuArray copy_from_idx; - src.SetRandn(); + src.SetRandn(); int32 n_rows = src.NumRows(); int32 n_columns = src.NumCols(); std::vector copy_from_idx_vec; @@ -87,7 +87,7 @@ static void UnitTestCuMathCopy() { } copy_from_idx.CopyFromVec(copy_from_idx_vec); cu::Copy(src, copy_from_idx, &tgt); - + for (int32 i = 0; i < n_rows; i++) { for (int32 j = 0; j < n_columns; j++) { Real src_val = src(i, copy_from_idx_vec.at(j)); @@ -97,19 +97,19 @@ static void UnitTestCuMathCopy() { } } -template +template static void UnitTestCuMathSplice() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; CuMatrix src(M, N); CuArray frame_offsets; - src.SetRandn(); + src.SetRandn(); int32 n_rows = src.NumRows(); int32 n_columns = src.NumCols(); std::vector frame_offsets_vec; - // The number of columns of tgt is rows(src) - // times n_frame_offsets, so we keep n_frame_offsets + // The number of columns of tgt is rows(src) + // times n_frame_offsets, so we keep n_frame_offsets // reasonably small (2 <= n <= 6). 
int32 n_frame_offsets = Rand() % 7 + 2; for (int32 i = 0; i < n_frame_offsets; i++) { @@ -124,13 +124,13 @@ static void UnitTestCuMathSplice() { for (int32 i = 0; i < n_rows; i++) { for (int32 k = 0; k < n_frame_offsets; k++) { for (int32 j = 0; j < n_columns; j++) { - Real src_val; + Real src_val; if (i + frame_offsets_vec.at(k) >= n_rows) { src_val = src_copy(n_rows-1, j); } else if (i + frame_offsets_vec.at(k) <= 0) { src_val = src_copy(0, j); } else { - src_val = src_copy(i + frame_offsets_vec.at(k), j); + src_val = src_copy(i + frame_offsets_vec.at(k), j); } Real tgt_val = tgt_copy(i, k * n_columns + j); AssertEqual(src_val, tgt_val); @@ -183,6 +183,113 @@ static void UnitTestCuMathComputeLstmNonlinearity() { } } +void UnitTestLstmNonlinearity() { + for (int32 loop = 0; loop < 100; loop++) { + + // problem dimensions. + int32 num_rows = RandInt(5, 20), + cell_dim = RandInt(2, 200); + + // Pick the (input or params block), and output block, for which we'll + // spot-check the derivative values. This will give us test failures + // that are fine-grained enough to assist debugging. + int32 test_input = RandInt(0, 4), + test_params = RandInt(0, 2), + test_output = RandInt(0, 1); + + // set one of test_input or test_params to -1, meaning we're not testing that + // thing. only test one at a time. + if (RandInt(0, 1) == 0) + test_input = -1; + else + test_params = -1; + + + CuMatrix input(num_rows, cell_dim * 5), + params(3, cell_dim), + output_deriv(num_rows, cell_dim * 2); + input.SetRandn(); + params.SetRandn(); + // set just one block of the output deriv to a random value. + output_deriv.ColRange(test_output * cell_dim, cell_dim).SetRandn(); + + + + CuMatrix output(num_rows, cell_dim * 2); + + cu::ComputeLstmNonlinearity(input, params, &output); + + BaseFloat baseline_objf = TraceMatMat(output, output_deriv, kTrans); + + // not really testing self repair here... will debug it when we actually run + // it, by looking at the diagnostics. + CuMatrix deriv_sum(5, cell_dim), + value_sum(5, cell_dim); + CuVector self_repair_config(10.0); // leave at zero... we don't really test this here. + CuMatrix + self_repair_sum(5, cell_dim), + input_deriv(num_rows, 5 * cell_dim), + params_deriv(3, cell_dim); + + double count_in = 0.0; + + // get derivative w.r.t. input and params, which we are testing. + cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum, + self_repair_config, count_in, + &input_deriv, ¶ms_deriv, + &value_sum, &deriv_sum, &self_repair_sum); + + + int32 test_dim = 5; // number of separate offsets we add while testing the + // derivatives... reduces randomness in test. 
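What follows in the test is a standard first-order (directional finite-difference) check: for a small perturbation delta_x, the measured change f(x + delta_x) - f(x) should agree with the predicted change, i.e. the inner product of delta_x with the analytic derivative, computed here with TraceMatMat. A tiny self-contained version of the same check on a toy objective (nothing Kaldi-specific, names are illustrative):

#include <cmath>
#include <cstdio>
#include <vector>

// Toy objective f(x) = sum_i x_i^3 with analytic gradient 3 * x_i^2.
int main() {
  std::vector<double> x = {0.3, -1.2, 0.7, 2.0};
  std::vector<double> grad(x.size());
  double objf = 0.0;
  for (size_t i = 0; i < x.size(); i++) {
    objf += x[i] * x[i] * x[i];
    grad[i] = 3.0 * x[i] * x[i];
  }
  const double delta = 1.0e-3;
  std::vector<double> dir = {1.0, -0.5, 0.25, 0.75};  // arbitrary direction
  double predicted = 0.0, perturbed_objf = 0.0;
  for (size_t i = 0; i < x.size(); i++) {
    predicted += delta * dir[i] * grad[i];            // <delta_x, grad f(x)>
    double xi = x[i] + delta * dir[i];
    perturbed_objf += xi * xi * xi;
  }
  double measured = perturbed_objf - objf;
  std::printf("predicted = %g, measured = %g\n", predicted, measured);
  // The two agree up to O(delta^2); the LSTM test applies the same criterion
  // (ApproxEqual with a loose tolerance) to predicted_objf_change and
  // measured_objf_change.
  return std::fabs(predicted - measured) < 1.0e-4 ? 0 : 1;
}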
+ BaseFloat delta = 1.0e-03; + Vector predicted_objf_change(test_dim), + measured_objf_change(test_dim); + + for (int32 i = 0; i < test_dim; i++) { + CuMatrix delta_input(num_rows, 5 * cell_dim), + delta_params(3, cell_dim); + if (test_input >= 0) { + delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); + delta_input.Scale(delta); + } + if (test_params >= 0) { + delta_params.Row(test_params).SetRandn(); + delta_params.Scale(delta); + } + + + + predicted_objf_change(i) = TraceMatMat(delta_input, input_deriv, kTrans) + + TraceMatMat(delta_params, params_deriv, kTrans); + + + CuMatrix perturbed_input(input); + perturbed_input.AddMat(1.0, delta_input); + + CuMatrix perturbed_params(params); + perturbed_params.AddMat(1.0, delta_params); + + CuMatrix perturbed_output(num_rows, 2 * cell_dim); + cu::ComputeLstmNonlinearity(perturbed_input, perturbed_params, + &perturbed_output); + BaseFloat new_objf = TraceMatMat(perturbed_output, output_deriv, kTrans), + objf_change = new_objf - baseline_objf; + measured_objf_change(i) = objf_change; + } + KALDI_LOG << "LSTM nonlinearity test: num_rows=" << num_rows + << ", cell_dim=" << cell_dim << ", test_input=" << test_input + << ", test_params=" << test_params + << ", test_output=" << test_output + << ", predicted_objf_change=" << predicted_objf_change + << ", measured_objf_change=" << measured_objf_change; + + if (!ApproxEqual(predicted_objf_change, measured_objf_change, BaseFloat(0.1F))) { + KALDI_ERR << "LSTM nonlinearity test failed."; + } + } +} + template void CudaMathUnitTest() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) @@ -192,6 +299,7 @@ template void CudaMathUnitTest() { UnitTestCuMathRandomize(); UnitTestCuMathSplice(); UnitTestCuMathCopy(); + UnitTestLstmNonlinearity(); } } // namespace kaldi @@ -208,7 +316,7 @@ int main() { #endif srand(time(NULL)); kaldi::CudaMathUnitTest(); - + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaMathUnitTest(); @@ -229,4 +337,3 @@ int main() { #endif return 0; } - diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index f63ce754ece..806f4e309ab 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -345,61 +345,332 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, CuMatrixBase *output); +template +void CpuBackpropLstmNonlinearity(const MatrixBase &input, + const MatrixBase ¶ms, + const MatrixBase &output_deriv, + const MatrixBase &deriv_sum_in, + const VectorBase &self_repair_config, + double count_in, + MatrixBase *input_deriv, + MatrixBase *params_deriv, + MatrixBase *value_sum_out, + MatrixBase *deriv_sum_out, + MatrixBase *self_repair_sum_out) { + int32 num_rows = input.NumRows(); + int32 cell_dim = input.NumCols() / 5; + // Check dimensions. 
+ KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(params.NumRows() == 3); + KALDI_ASSERT(params.NumCols() == cell_dim); + KALDI_ASSERT(output_deriv.NumRows() == num_rows); + KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim); + KALDI_ASSERT(deriv_sum_in.NumRows() == 5); + KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim); + KALDI_ASSERT(self_repair_config.Dim() == 10); + KALDI_ASSERT(count_in >= 0); + if (input_deriv != NULL) { + KALDI_ASSERT(SameDim(input, *input_deriv)); + } + if (params_deriv == NULL) { + KALDI_ASSERT(value_sum_out == NULL); + KALDI_ASSERT(deriv_sum_out == NULL); + KALDI_ASSERT(self_repair_sum_out == NULL); + } else { + KALDI_ASSERT(value_sum_out != NULL); + KALDI_ASSERT(deriv_sum_out != NULL); + KALDI_ASSERT(self_repair_sum_out != NULL); + KALDI_ASSERT(SameDim(params, *params_deriv)); + KALDI_ASSERT(value_sum_out->NumRows() == 5); + KALDI_ASSERT(value_sum_out->NumCols() == cell_dim); + KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out)); + KALDI_ASSERT(self_repair_sum_out->NumRows() == 5); + KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim); + } + + const MatrixBase &input_mat = input; + const MatrixBase ¶ms_mat = params; + const MatrixBase &output_deriv_mat = output_deriv; + const MatrixBase &deriv_sum_in_mat = deriv_sum_in; + const VectorBase &sr_config = self_repair_config; + MatrixBase *input_deriv_mat = ( + input_deriv == NULL ? NULL : input_deriv); + MatrixBase *params_deriv_mat = NULL; + MatrixBase *self_repair_sum_out_mat = NULL; + MatrixBase *value_sum_out_mat = NULL; + MatrixBase *deriv_sum_out_mat = NULL; + if (params_deriv != NULL) { + params_deriv_mat = params_deriv; + value_sum_out_mat = value_sum_out; + deriv_sum_out_mat = deriv_sum_out; + self_repair_sum_out_mat = self_repair_sum_out; + } + + + // We add 1.0 (i.e. a small value) to the count to avoid division by zero. + Real count = 1.0 + count_in; + for (int32 c = 0; c < cell_dim; c++) { + // parameters + Real w_ic = params_mat(0, c); + Real w_fc = params_mat(1, c); + Real w_oc = params_mat(2, c); + // derivative sums w.r.t. parameters. + Real w_ic_deriv_sum = 0.0; + Real w_fc_deriv_sum = 0.0; + Real w_oc_deriv_sum = 0.0; + + // average derivatives, for self-repair. + // The 5 nonlinearities that are subject to self-repair are written as: + // Sigmoid(i_t_input), Sigmoid(f_t_input), + // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t) + Real i_t_self_repair = ( + deriv_sum_in(0, c) / count < sr_config(0) ? sr_config(5) : 0.0); + Real f_t_self_repair = ( + deriv_sum_in(1, c) / count < sr_config(1) ? sr_config(6) : 0.0); + Real c_part_self_repair = ( + deriv_sum_in(2, c) / count < sr_config(2) ? sr_config(7) : 0.0); + Real o_t_self_repair = ( + deriv_sum_in(3, c) / count < sr_config(3) ? sr_config(8) : 0.0); + Real c_t_self_repair = ( + deriv_sum_in(4, c) / count < sr_config(4) ? sr_config(9) : 0.0); + // Note on how we add self-repair for sigmoids/tanh's. If self-repair + // is activated for this unit, then... + // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0) + // ... to the input-deriv; + // For tanh's we'd add -self_repair_scale * tanh(x) + // If self-repair is not activated, the 'self_repair' scales are set to zero. + + // The following variables are for the accumulation of stats on the + // sigmoid and tanh units. 
+ Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0; + Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0; + Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0; + Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0; + Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0; + + + for (int32 r = 0; r < num_rows; r++) { + Real i_part = input_mat(r, c), + f_part = input_mat(r, c + cell_dim), + c_part = input_mat(r, c + 2 * cell_dim), + o_part = input_mat(r, c + 3 * cell_dim), + c_prev = input_mat(r, c + 4 * cell_dim); + // For greater clarity, we give some of the quantities in the + // forward equations their own names. + Real i_t_input = i_part + w_ic * c_prev, + i_t = ScalarSigmoid(i_t_input), + f_t_input = f_part + w_fc * c_prev, + f_t = ScalarSigmoid(f_t_input), + tanh_c_part = ScalarTanh(c_part), + c_t = f_t * c_prev + i_t * tanh_c_part, + o_t_input = o_part + w_oc * c_t, + o_t = ScalarSigmoid(o_t_input), + tanh_c_t = ScalarTanh(c_t); + // we'd also compute, in the forward pass, + // m_t = o_t * tanh_c_t; + // but this variable is not needed. + + // Accumulate nonlinearity value and derivative stats. + // Note: + // tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x) + // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)). + i_t_value_sum += i_t; + i_t_deriv_sum += i_t * (1.0F - i_t); + f_t_value_sum += f_t; + f_t_deriv_sum += f_t * (1.0F - f_t); + c_part_value_sum += tanh_c_part; + c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part; + o_t_value_sum += o_t; + o_t_deriv_sum += o_t * (1.0F - o_t); + c_t_value_sum += tanh_c_t; + c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t; + + + // the derivative of the objective function w.r.t. a particular quantity + // will be written by prepending "d" to the name. + // We compute these derivatives in the reverse of the order in which + // we computed the original quantities. + // dc_t_out is the part of the derivative w.r.t. c_t that + // comes directly from the output of this function. + Real dc_t_out = output_deriv_mat(r, c); + Real dm_t = output_deriv_mat(r, c + cell_dim); + Real dtanh_c_t = o_t * dm_t; + Real do_t = tanh_c_t * dm_t; + Real do_t_input = (o_t * (1.0F - o_t) * do_t + - (2.0F * o_t - 1.0F) * o_t_self_repair); + Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out + + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; + Real dtanh_c_part = i_t * dc_t; + Real df_t = dc_t * c_prev; + Real df_t_input = (df_t * f_t * (1.0F - f_t) + - (2.0F * f_t - 1.0F) * f_t_self_repair); + Real di_t = dc_t * tanh_c_part; + Real di_t_input = (di_t * i_t * (1.0F - i_t) + - (2.0F * i_t - 1.0F) * i_t_self_repair); + + w_ic_deriv_sum += c_prev * di_t_input; + w_fc_deriv_sum += c_prev * df_t_input; + w_oc_deriv_sum += c_t * do_t_input; + + Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + Real do_part = do_t_input; + Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part + - tanh_c_part * c_part_self_repair); + Real df_part = df_t_input; + Real di_part = di_t_input; + + if (input_deriv_mat != NULL) { + (*input_deriv_mat)(r, c) += di_part; + (*input_deriv_mat)(r, c + cell_dim) += df_part; + (*input_deriv_mat)(r, c + 2 * cell_dim) += dc_part; + (*input_deriv_mat)(r, c + 3 * cell_dim) += do_part; + (*input_deriv_mat)(r, c + 4 * cell_dim) += dc_prev; + } + } -void BackpropLstmNonlinearity(const CuMatrixBase &input, - const CuMatrixBase ¶ms, - const CuMatrixBase &output_deriv, + if (params_deriv != NULL) { + // note: for optimizing things you can assume that params_deriv and + // input_deriv_mat are non-NULL (i.e. 
all the output matrices are + // non-NULL). The situations when some of the output matrices are NULL + // does not happen often (mainly only in testing code). + + (*params_deriv_mat)(0, c) = w_ic_deriv_sum; + (*params_deriv_mat)(1, c) = w_fc_deriv_sum; + (*params_deriv_mat)(2, c) = w_oc_deriv_sum; + + (*value_sum_out_mat)(0, c) += i_t_value_sum; + (*value_sum_out_mat)(1, c) += f_t_value_sum; + (*value_sum_out_mat)(2, c) += c_part_value_sum; + (*value_sum_out_mat)(3, c) += o_t_value_sum; + (*value_sum_out_mat)(4, c) += c_t_value_sum; + + // need to update self_repair_sum_out before deriv_sum_out, because + // deriv_sum_out and deriv_sum_in might point to the same memory. + for (int32 i = 0; i < 5; i++) + (*self_repair_sum_out_mat)(i, c) += + (deriv_sum_in(i, c) / count < sr_config(i) ? num_rows : 0); + + (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum; + (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum; + (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum; + (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum; + (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum; + } + } +} + +template +void BackpropLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + const CuMatrixBase &output_deriv, const CuMatrixBase &deriv_sum_in, - const CuVectorBase &self_repair_config, + const CuVectorBase &self_repair_config, double count_in, - CuMatrixBase *input_deriv, - CuMatrixBase *params_deriv, + CuMatrixBase *input_deriv, + CuMatrixBase *params_deriv, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, - CuMatrixBase *self_repair_sum_out) { -// int32 num_rows = input.NumRows(), -// cell_dim = input.NumCols() / 5; -// KALDI_ASSERT(output_deriv.NumRows() == num_rows && -// input.NumCols() % 5 == 0 && -// params.NumRows() == 3 && params.NumCols() == cell_dim && -// output_deriv.NumCols() == 2 * cell_dim && -// deriv_sum_in.NumRows() == 5 && deriv_sum_in.NumCols() == cell_dim -// && self_repair_config.Dim() == 10 && count_in >= 0); -// if (input_deriv != NULL) { -// KALDI_ASSERT(SameDim(input, *input_deriv)); -// } -// if (params_deriv == NULL) { -// KALDI_ASSERT(value_sum_out == NULL && deriv_sum_out == NULL && -// self_repair_sum_out == NULL); -// } else { -// KALDI_ASSERT(value_sum_out != NULL && deriv_sum_out != NULL && -// self_repair_sum_out != NULL && SameDim(params, *params_deriv) && -// value_sum_out->NumRows() == 5 && -// value_sum_out->NumCols() == cell_dim && -// SameDim(* ... -// // HERE -// -// KALDI_ASSERT(input.NumRows() == output->NumRows() && -// input.NumCols() % 5 == 0 && -// output->NumCols() == 2 * (input.NumCols() / 5)); -// int32 num_rows = input.NumRows(), -// cell_dim = input.NumCols() / 5; -// -//#if HAVE_CUDA == 1 -// if (CuDevice::Instantiate().Enabled()) { -// KALDI_ERR << "CUDA version not implemented"; -// // notes for Shiyin: -// // You could do an 'easy' initial version where we have have one thread per dimension, -// // and you can try optimizing this later on. -// // Since the cell-dim is usually quite large, like 1024, this is fairly reasonable. -// // But up to you. -// } else -//#endif -// { -// -// } + CuMatrixBase *self_repair_sum_out) { + int32 num_rows = input.NumRows(); + int32 cell_dim = input.NumCols() / 5; + // Check dimensions. 
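+  // A minimal usage sketch (variable names here are only illustrative): a
+  // caller that needs only the derivative w.r.t. the input can pass NULL for
+  // all of the stats outputs, which the checks below allow:
+  //   CuMatrix<BaseFloat> in_deriv(input.NumRows(), input.NumCols());
+  //   cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in,
+  //                                self_repair_config, count_in, &in_deriv,
+  //                                (CuMatrixBase<BaseFloat>*) NULL,
+  //                                (CuMatrixBase<double>*) NULL,
+  //                                (CuMatrixBase<double>*) NULL,
+  //                                (CuMatrixBase<BaseFloat>*) NULL);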
+ KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(params.NumRows() == 3); + KALDI_ASSERT(params.NumCols() == cell_dim); + KALDI_ASSERT(output_deriv.NumRows() == num_rows); + KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim); + KALDI_ASSERT(deriv_sum_in.NumRows() == 5); + KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim); + KALDI_ASSERT(self_repair_config.Dim() == 10); + KALDI_ASSERT(count_in >= 0); + if (input_deriv != NULL) { + KALDI_ASSERT(SameDim(input, *input_deriv)); + } + if (params_deriv == NULL) { + KALDI_ASSERT(value_sum_out == NULL); + KALDI_ASSERT(deriv_sum_out == NULL); + KALDI_ASSERT(self_repair_sum_out == NULL); + } else { + KALDI_ASSERT(value_sum_out != NULL); + KALDI_ASSERT(deriv_sum_out != NULL); + KALDI_ASSERT(self_repair_sum_out != NULL); + KALDI_ASSERT(SameDim(params, *params_deriv)); + KALDI_ASSERT(value_sum_out->NumRows() == 5); + KALDI_ASSERT(value_sum_out->NumCols() == cell_dim); + KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out)); + KALDI_ASSERT(self_repair_sum_out->NumRows() == 5); + KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim); + } + + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + KALDI_ERR << "CUDA version not implemented"; + // notes for Shiyin: + // You could do an 'easy' initial version where we have have one thread per dimension, + // and you can try optimizing this later on. + // Since the cell-dim is usually quite large, like 1024, this is fairly reasonable. + // But up to you. + } else +#endif + { + CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(), + deriv_sum_in.Mat(), self_repair_config.Vec(), + count_in, &(input_deriv->Mat()), + &(params_deriv->Mat()), &(value_sum_out->Mat()), + &(deriv_sum_out->Mat()), + &(self_repair_sum_out->Mat())); + } } +template +void CpuBackpropLstmNonlinearity(const MatrixBase &input, + const MatrixBase ¶ms, + const MatrixBase &output_deriv, + const MatrixBase &deriv_sum_in, + const VectorBase &self_repair_config, + double count_in, + MatrixBase *input_deriv, + MatrixBase *params_deriv, + MatrixBase *value_sum_out, + MatrixBase *deriv_sum_out, + MatrixBase *self_repair_sum_out); +template +void CpuBackpropLstmNonlinearity(const MatrixBase &input, + const MatrixBase ¶ms, + const MatrixBase &output_deriv, + const MatrixBase &deriv_sum_in, + const VectorBase &self_repair_config, + double count_in, + MatrixBase *input_deriv, + MatrixBase *params_deriv, + MatrixBase *value_sum_out, + MatrixBase *deriv_sum_out, + MatrixBase *self_repair_sum_out); +template +void BackpropLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + const CuMatrixBase &output_deriv, + const CuMatrixBase &deriv_sum_in, + const CuVectorBase &self_repair_config, + double count_in, + CuMatrixBase *input_deriv, + CuMatrixBase *params_deriv, + CuMatrixBase *value_sum_out, + CuMatrixBase *deriv_sum_out, + CuMatrixBase *self_repair_sum_out); +template +void BackpropLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + const CuMatrixBase &output_deriv, + const CuMatrixBase &deriv_sum_in, + const CuVectorBase &self_repair_config, + double count_in, + CuMatrixBase *input_deriv, + CuMatrixBase *params_deriv, + CuMatrixBase *value_sum_out, + CuMatrixBase *deriv_sum_out, + CuMatrixBase *self_repair_sum_out); diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index eb741f7bb8d..33b2c4e6473 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -195,21 +195,30 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, diagnostics. 
*/ -void BackpropLstmNonlinearity(const CuMatrixBase &input, - const CuMatrixBase ¶ms, - const CuMatrixBase &output_deriv, +template +void BackpropLstmNonlinearity(const CuMatrixBase &input, + const CuMatrixBase ¶ms, + const CuMatrixBase &output_deriv, const CuMatrixBase &deriv_sum_in, - const CuVectorBase &self_repair_config, + const CuVectorBase &self_repair_config, double count_in, - CuMatrixBase *input_deriv, - CuMatrixBase *params_deriv, + CuMatrixBase *input_deriv, + CuMatrixBase *params_deriv, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, - CuMatrixBase *self_repair_sum_out); - - - - + CuMatrixBase *self_repair_sum_out); +template +void CpuBackpropLstmNonlinearity(const MatrixBase &input, + const MatrixBase ¶ms, + const MatrixBase &output_deriv, + const MatrixBase &deriv_sum_in, + const VectorBase &self_repair_config, + double count_in, + MatrixBase *input_deriv, + MatrixBase *params_deriv, + MatrixBase *value_sum_out, + MatrixBase *deriv_sum_out, + MatrixBase *self_repair_sum_out); } // namespace cu } // namespace kaldi diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index afe884b2b76..f16b7f0bf52 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1542,8 +1542,7 @@ void CuMatrixBase::ApplyLogSoftMaxPerRow(const CuMatrixBase &src) { } } -// DiffSigmoid(Ein, Y, Eout) -> Eout.DiffSigmoid(Y, Ein). -template // Eout -> *this, Ein -> diff, Y -> value +template void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, const CuMatrixBase &diff) { KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff)); @@ -1588,7 +1587,7 @@ void CuMatrixBase::Tanh(const CuMatrixBase &src) { -template // Ein -> diff, Y -> value +template void CuMatrixBase::DiffTanh(const CuMatrixBase &value, const CuMatrixBase &diff) { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 54c1ac0ad4f..cff5270e6cf 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -62,20 +62,20 @@ class CuVectorBase { const CuArray &frame_offsets, CuMatrixBase *tgt); friend class CuRand; - + /// Dimensions - MatrixIndexT Dim() const { return dim_; } + MatrixIndexT Dim() const { return dim_; } /// Returns a pointer to the start of the vector's data. inline Real* Data() { return data_; } /// Returns a pointer to the start of the vector's data (const). inline const Real* Data() const { return data_; } - + /// Copy functions; these will crash if the dimension /// do not match. The operator = in class CuVector will /// also change the sizes for you. void CopyFromVec(const CuVectorBase &src); - + template void CopyFromVec(const CuVectorBase &M); @@ -84,17 +84,17 @@ class CuVectorBase { template void CopyToVec(VectorBase *dst) const; - + void CopyRowsFromMat(const CuMatrixBase &M); void CopyRowsFromMat(const MatrixBase &M); - + /// Math operations void SetZero(); void Set(Real value); void Add(Real value); void Scale(Real value); - + void AddVec(Real alpha, const CuVectorBase &vec, Real beta = 1.0); template @@ -103,20 +103,20 @@ class CuVectorBase { /// Sum the rows of the matrix, add to vector void AddRowSumMat(Real alpha, const CuMatrixBase &mat, Real beta = 1.0); /// Sum the columns of the matrix, add to vector - void AddColSumMat(Real alpha, const CuMatrixBase &mat, Real beta = 1.0); + void AddColSumMat(Real alpha, const CuMatrixBase &mat, Real beta = 1.0); /// Add triangular matrix times vector: this <-- beta*this + alpha*M*v. /// Works even if rv == *this. 
void AddTpVec(const Real alpha, const CuTpMatrix&M, const MatrixTransposeType trans, const CuVectorBase &v, const Real beta); // **beta previously defaulted to 0.0** - + /// Multiplies this vector by lower-triangular marix: *this <-- *this *M void MulTp(const CuTpMatrix &M, const MatrixTransposeType trans); bool ApproxEqual(const CuVectorBase &other, float tol = 0.01) const; - - void InvertElements(); + + void InvertElements(); void ApplySoftMax(); void ApplyExp(); @@ -126,7 +126,7 @@ class CuVectorBase { void ApplyPow(Real power); Real Sum() const; void SetRandn(); - + CuSubVector Range(const MatrixIndexT o, const MatrixIndexT l) { return CuSubVector(*this, o, l); } @@ -161,7 +161,7 @@ class CuVectorBase { /// as you would expect. void AddDiagMatMat(Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, const CuMatrixBase &N, MatrixTransposeType transN, - Real beta = 1.0); + Real beta = 1.0); inline CuValue operator() (MatrixIndexT i) { KALDI_PARANOID_ASSERT(static_cast(i) < @@ -170,7 +170,7 @@ class CuVectorBase { } Real Norm(Real p); // Only works for p = 1 and p = 2. - + inline Real operator() (MatrixIndexT i) const { KALDI_PARANOID_ASSERT(static_cast(i) < static_cast(dim_)); @@ -183,32 +183,33 @@ class CuVectorBase { /// Extracts the diagonal of a matrix. void CopyDiagFromMat(const CuMatrix &M); - /// Returns the maximum value of any element, or -infinity for the empty vector. + /// Returns the maximum value of any element, or -infinity for the empty vector. Real Max() const; - /// Returns the minimum value of any element, or +infinity for the empty vector. + /// Returns the minimum value of any element, or +infinity for the empty vector. Real Min() const; - + // Set each element to y = (x == orig ? changed : x). void ReplaceValue(Real orig, Real changed); - - void MulElements(const CuVectorBase &v); - protected: + void MulElements(const CuVectorBase &v); // The following two functions should only be called if we did not compile // with CUDA or could not get a CUDA card; in that case the contents are // interpreted the same as a regular vector. + // Do not use the following functions unless you know what you are doing! inline const VectorBase &Vec() const { return *(reinterpret_cast* >(this)); } inline VectorBase &Vec() { return *(reinterpret_cast* >(this)); } - + + protected: + /// Default constructor: make it protected so the user cannot /// instantiate this class. CuVectorBase(): data_(NULL), dim_(0) { } - + Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). 
MatrixIndexT dim_; ///< dimension of the vector @@ -225,14 +226,14 @@ class CuVector: public CuVectorBase { friend class CuPackedMatrix; friend class CuSpMatrix; friend class CuTpMatrix; - + public: CuVector() { } CuVector(MatrixIndexT dim, MatrixResizeType t = kSetZero) { Resize(dim, t); } - + CuVector(const CuVectorBase &v); - CuVector(const VectorBase &v); + CuVector(const VectorBase &v); explicit CuVector(const CuVector &v) : CuVectorBase() { Resize(v.Dim(), kUndefined); this->CopyFromVec(v); @@ -252,7 +253,7 @@ class CuVector: public CuVectorBase { /// Allocate the memory void Resize(MatrixIndexT dim, MatrixResizeType t = kSetZero); - + ~CuVector() { Destroy(); } CuVector &operator = (const CuVectorBase &other) { @@ -271,9 +272,9 @@ class CuVector: public CuVectorBase { this->CopyFromVec(other); return *this; } - - /// I/O + + /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; @@ -286,7 +287,7 @@ class CuVector: public CuVectorBase { // We'll fill out the following class if it's needed. template class CuSubVector: public CuVectorBase { - public: + public: CuSubVector(const CuVectorBase &t, const MatrixIndexT origin, const MatrixIndexT length) : CuVectorBase() { KALDI_ASSERT(static_cast(origin)+ @@ -308,20 +309,20 @@ class CuSubVector: public CuVectorBase { CuVectorBase::data_ = const_cast(data); CuVectorBase::dim_ = length; } - + /// This operation does not preserve const-ness, so be careful. CuSubVector(const CuMatrixBase &matrix, MatrixIndexT row) { CuVectorBase::data_ = const_cast(matrix.RowData(row)); CuVectorBase::dim_ = matrix.NumCols(); } - + }; /// I/O template std::ostream &operator << (std::ostream &out, const CuVectorBase &vec); - + template bool ApproxEqual(const CuVectorBase &a, @@ -330,7 +331,7 @@ bool ApproxEqual(const CuVectorBase &a, } template -inline void AssertEqual(const CuVectorBase &a, +inline void AssertEqual(const CuVectorBase &a, const CuVectorBase &b, Real tol = 0.01) { KALDI_ASSERT(a.ApproxEqual(b, tol)); } @@ -349,7 +350,7 @@ void VectorBase::CopyFromVec(const CuVectorBase &cu) { // declare template specializations. template <> -template <> +template <> void CuVectorBase::CopyFromVec(const CuVectorBase &src); template<> From 15bb1951048ac1a2a534b0e05811a82f74742ff0 Mon Sep 17 00:00:00 2001 From: Shiyin Kang Date: Wed, 23 Nov 2016 22:05:31 +0800 Subject: [PATCH 42/71] cuda kernel for backprop of lstm working on kernel code compilable kernel code fix bug pass unit test and deriv test make nnet3 compilable. 
speed test for backprop lstm --- src/cudamatrix/cu-kernels-ansi.h | 40 +++ src/cudamatrix/cu-kernels.cu | 455 ++++++++++++++++++++++++++++++ src/cudamatrix/cu-kernels.h | 60 ++++ src/cudamatrix/cu-math-test.cc | 158 ++++++++++- src/cudamatrix/cu-math.cc | 92 +++++- src/cudamatrix/cu-math.h | 2 +- src/nnet3/nnet-simple-component.h | 8 +- 7 files changed, 803 insertions(+), 12 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index b7571383193..bf504347872 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -651,6 +651,46 @@ void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int params_stride, const int out_stride, const int cell_dim, const int num_rows, float* out); +void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int num_rows, const double* input, + const int in_stride, const double* params, + const int params_stride, + const double* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const double* self_repair_config, + double count, double* input_deriv, + const int input_deriv_stride, + double* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + double* deriv_sum_out, + const int deriv_sum_out_stride, + double* self_repair_sum_out, + const int self_repair_sum_out_stride); +void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int num_rows, const float* input, + const int in_stride, const float* params, + const int params_stride, + const float* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const float* self_repair_config, double count, + float* input_deriv, + const int input_deriv_stride, + float* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + double* deriv_sum_out, + const int deriv_sum_out_stride, + float* self_repair_sum_out, + const int self_repair_sum_out_stride); + + } // extern "C" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 614f8ec4cc6..3fb38b54c20 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2688,6 +2688,409 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, } +/** + This function does the 'backward' pass corresponding to the function + ComputeLstmNonlinearity. It's a little more complicated than you might + expect because of the 'self-repair' mechanism that we use to prevent the + sigmoid and tanh nonlinearities oversaturating, and because of the + average-activation and average-derivative stats that we store for these + nonlinearites (these stats are used both to control the self-repair + mechanism, and for diagnostic purposes). + + Because the forward pass computes various intermediate values that are not + output, this function actually has to do the same computations as the + forward pass before it actually does the backprop. + + In the following description, `C` is for `cell_dim`, `N` is for `num_rows`. + + @param [in] input The same as in ComputeLstmNonlinearity(). + A matrix, of dimension N by 5C (i.e. its num-cols must be + a multiple of 5). The column-space is interpreted as 5 + consecutive blocks, each of dimension C, which we name: + (i_part, f_part, c_part, o_part, c_{t-1}). + @param [in] params The same as in ComputeLstmNonlinearity(). 
+ A matrix, of dimension 3 by C, with rows containing the + three diagonal parameter matrices used in LSTMs, namely + w_{ic}, w_{fc} and w_{oc}. + @param [in] output_deriv + A matrix, of dimension N by 2C, containing the derivative + of the objective function we're backpropagating, + w.r.t. the quantities c_t and m_t (in two blocks of + column-dimension C). + @param [in] deriv_sum_in + This is used in the self-repair code to identify + oversaturated nonlinearities. + It is a matrix, of dimension 5 by C, corresponding to + the totals of the derivatives of the 5 sigmoid and tanh + nonlinearities, in they order they appear in the equations + in the documentation of ComputeLstmNonlinearity() + respectively, + they appear in the equations for (i_t, f_t, c_t, o_t, m_t). + This will be divided by 'count_in' to get the average + derivative value so far, for each of the nonlinearities. + @param [in] self_repair_config + A vector of dimension 10, containing the configuration of + the self-repair to be used for the 5 nonlinearities. + The first 5 elements are the self_repair_lower_threshold + values (typically 0.05 for sigmoid and 0.2 for tanh), + and the next 5 elements are the corresponding + self-repair-scales (typically 10^-5). + @param [in] count_in The data-count that corresponds to the stats in + 'deriv_sum_in' at entry to the function. + This function should tolerate the count being zero + (in that case, it is free to do the self-repair or not, + as this should only happen on the 1st minibatch of each + training job). + @param [out] input_deriv + May be NULL; if not, this function writes, to this + location, the backpropagated derivative of the objective + function w.r.t. the 'input' matrix. This matrix should + have the same dimension as 'input' i.e. N by 5C. In + addition to the regular backpropagated derivative, the + output will include small values relating to 'self-repair'. + @param [out] params_deriv + May be NULL; if not, this is where this function *writes* + [not adds] the backpropagated derivative of the objective + function w.r.t. 'params'; it should have the same dimension + as 'params' (3 by C). (This matrix will then be processed + by the natural gradient code and added to the appropriate + copy of the parameter matrix, outside this function). + @param [out] value_sum_out + Must be NULL if params_deriv is NULL; if not, a matrix of + dimension 5 by C. This function *adds* to this location + the total value of each of the sigmoid/tanh nonlinearities + that it computes (this is for diagnostic purposes). + @param [out] deriv_sum_out + Must be NULL if params_deriv is NULL; if not, a matrix of + dimension 5 by C; this function *adds* to this location the + total of the derivative of each of the sigmoid/tanh + nonlinearities that it computes (this is for diagnostic + purposes and to control the self-repair). This function + should tolerate the case when 'deriv_sum_out' points to the + same data as 'deriv_sum_in'. + @param [out] self_repair_sum_out + Must be NULL if params_deriv is NULL; if not, a matrix of + dimension 5 by C; this function *writes* to this location + the sum of the number of times the self-repair code was + activated (integer values 0 <= k <= N). This will be + processed outside this function into self-repair stats for + diagnostics. +// Use 2D block (8x32 threads) as we need to compute column sum. +// Use 1D grid to cover the data matrix `cell_dim`. 
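+// As a sketch only (the actual launch configuration is chosen by the host
+// wrapper in cu-math.cc), a typical invocation looks like:
+//   dim3 dimBlock(32, CU1DBLOCK / 32);
+//   dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
+//   cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, ...);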
+*/ +template +__global__ +static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, + const Real* input, const int input_stride, + const Real* params, const int params_stride, + const Real* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const Real* self_repair_config, + double count, Real* input_deriv, + const int input_deriv_stride, + Real* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + double* deriv_sum_out, + const int deriv_sum_out_stride, + Real* self_repair_sum_out, + const int self_repair_sum_out_stride) { + __shared__ Real smem[CU1DBLOCK]; + + const int j = blockIdx.x * blockDim.x + threadIdx.x; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int grid_stride = gridDim.y * blockDim.y; + const int i0 = blockIdx.y * blockDim.y + threadIdx.y; + + Real w_ic_deriv_sum = 0; + Real w_fc_deriv_sum = 0; + Real w_oc_deriv_sum = 0; + + Real i_t_value_sum = 0, i_t_deriv_sum = 0; + Real f_t_value_sum = 0, f_t_deriv_sum = 0; + Real c_part_value_sum = 0, c_part_deriv_sum = 0; + Real o_t_value_sum = 0, o_t_deriv_sum = 0; + Real c_t_value_sum = 0, c_t_deriv_sum = 0; + + bool update_sr[5]; + + if (j < cell_dim) { + const Real w_ic = params[j]; + const Real w_fc = params[params_stride + j]; + const Real w_oc = params[2 * params_stride + j]; + + const Real* sr_config = self_repair_config; +# pragma unroll + for (int i = 0; i < 5; i++) { + update_sr[i] = deriv_sum_in[i * deriv_sum_in_stride + j] / count + < sr_config[i]; + } + const Real i_t_self_repair = (update_sr[0] ? sr_config[5] : 0); + const Real f_t_self_repair = (update_sr[1] ? sr_config[6] : 0); + const Real c_part_self_repair = (update_sr[2] ? sr_config[7] : 0); + const Real o_t_self_repair = (update_sr[3] ? sr_config[8] : 0); + const Real c_t_self_repair = (update_sr[4] ? 
sr_config[9] : 0); + + for (int i = i0; i < num_rows; i += grid_stride) { + const Real i_part = input[i * input_stride + j]; + const Real f_part = input[i * input_stride + j + cell_dim]; + const Real c_part = input[i * input_stride + j + 2 * cell_dim]; + const Real o_part = input[i * input_stride + j + 3 * cell_dim]; + const Real c_prev = input[i * input_stride + j + 4 * cell_dim]; + + const Real i_t = 1 / (1 + exp(-i_part - w_ic * c_prev)); + const Real f_t = 1 / (1 + exp(-f_part - w_fc * c_prev)); + const Real tanh_c_part = tanh(c_part); + const Real c_t = f_t * c_prev + i_t * tanh_c_part; + const Real o_t = 1 / (1 + exp(-o_part - w_oc * c_t)); + const Real tanh_c_t = tanh(c_t); + + const Real i_t_deriv = i_t * (1 - i_t); + const Real f_t_deriv = f_t * (1 - f_t); + const Real c_part_deriv = 1 - tanh_c_part * tanh_c_part; + const Real o_t_deriv = o_t * (1 - o_t); + const Real c_t_deriv = 1 - tanh_c_t * tanh_c_t; + + if (params_deriv) { + i_t_value_sum += i_t; + f_t_value_sum += f_t; + c_part_value_sum += tanh_c_part; + o_t_value_sum += o_t; + c_t_value_sum += tanh_c_t; + + i_t_deriv_sum += i_t_deriv; + f_t_deriv_sum += f_t_deriv; + c_part_deriv_sum += c_part_deriv; + o_t_deriv_sum += o_t_deriv; + c_t_deriv_sum += c_t_deriv; + } + + const Real dc_t_out = output_deriv[i * output_deriv_stride + j]; + const Real dm_t = output_deriv[i * output_deriv_stride + j + cell_dim]; + + const Real dtanh_c_t = o_t * dm_t; + const Real do_t = tanh_c_t * dm_t; + const Real do_t_input = (o_t_deriv * do_t + - (2 * o_t - 1) * o_t_self_repair); + + const Real dc_t = (c_t_deriv * dtanh_c_t + dc_t_out + do_t_input * w_oc) + - tanh_c_t * c_t_self_repair; + const Real dtanh_c_part = i_t * dc_t; + const Real df_t = dc_t * c_prev; + const Real df_t_input = (df_t * f_t_deriv + - (2 * f_t - 1) * f_t_self_repair); + const Real di_t = dc_t * tanh_c_part; + const Real di_t_input = (di_t * i_t_deriv + - (2 * i_t - 1) * i_t_self_repair); + + if (params_deriv) { + w_ic_deriv_sum += c_prev * di_t_input; + w_fc_deriv_sum += c_prev * df_t_input; + w_oc_deriv_sum += c_t * do_t_input; + } + + const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + const Real do_part = do_t_input; + const Real dc_part = (c_part_deriv * dtanh_c_part + - tanh_c_part * c_part_self_repair); + const Real df_part = df_t_input; + const Real di_part = di_t_input; + + if (input_deriv) { + input_deriv[i * input_deriv_stride + j] += di_part; + input_deriv[i * input_deriv_stride + j + cell_dim] += df_part; + input_deriv[i * input_deriv_stride + j + cell_dim * 2] += dc_part; + input_deriv[i * input_deriv_stride + j + cell_dim * 3] += do_part; + input_deriv[i * input_deriv_stride + j + cell_dim * 4] += dc_prev; + } + } + } + + if (params_deriv) { + // compute params_deriv + smem[tid] = w_ic_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + params_deriv[j] = smem[tid]; + } + + __syncthreads(); + smem[tid] = w_fc_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + params_deriv[params_deriv_stride + j] = smem[tid]; + } + + __syncthreads(); + smem[tid] = w_oc_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += 
smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + params_deriv[2 * params_deriv_stride + j] = smem[tid]; + } + + // compute value_sum_out + __syncthreads(); + smem[tid] = i_t_value_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + value_sum_out[j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = f_t_value_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + value_sum_out[value_sum_out_stride + j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = c_part_value_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + value_sum_out[2 * value_sum_out_stride + j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = o_t_value_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + value_sum_out[3 * value_sum_out_stride + j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = c_t_value_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + value_sum_out[4 * value_sum_out_stride + j] += smem[tid]; + } + + // need to update self_repair_sum_out before deriv_sum_out, because + // deriv_sum_out and deriv_sum_in might point to the same memory. + if (i0 < 5 && j < cell_dim) { + self_repair_sum_out[i0 * self_repair_sum_out_stride + j] += + update_sr[i0] ? 
num_rows : 0; + } + + // compute derive_sum_out + __syncthreads(); + smem[tid] = i_t_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + deriv_sum_out[j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = f_t_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + deriv_sum_out[deriv_sum_out_stride + j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = c_part_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + deriv_sum_out[2 * deriv_sum_out_stride + j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = o_t_deriv_sum; +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + deriv_sum_out[3 * deriv_sum_out_stride + j] += smem[tid]; + } + + __syncthreads(); + smem[tid] = c_t_deriv_sum; + __syncthreads(); +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift >= warpSize; shift >>= 1) { + __syncthreads(); + if (tid < shift) { + smem[tid] += smem[tid + shift]; + } + } + if (tid < warpSize && j < cell_dim) { + deriv_sum_out[4 * deriv_sum_out_stride + j] += smem[tid]; + } + } +} + /*********************************************************************** * ANSI-C wrappers of CUDA kernels */ @@ -4105,4 +4508,56 @@ void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, out_stride, cell_dim, num_rows, out); } +void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int num_rows, const double* input, + const int input_stride, const double* params, + const int params_stride, + const double* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const double* self_repair_config, + double count, double* input_deriv, + const int input_deriv_stride, + double* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + double* deriv_sum_out, + const int deriv_sum_out_stride, + double* self_repair_sum_out, + const int self_repair_sum_out_stride) { + _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + input_stride, params, params_stride, output_deriv, output_deriv_stride, + deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, + input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, + value_sum_out_stride, deriv_sum_out, deriv_sum_out_stride, + self_repair_sum_out, self_repair_sum_out_stride); +} +void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int num_rows, const float* input, + const int input_stride, const float* params, + const int params_stride, + const float* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const float* self_repair_config, double count, + float* input_deriv, + const int input_deriv_stride, + float* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + 
double* deriv_sum_out, + const int deriv_sum_out_stride, + float* self_repair_sum_out, + const int self_repair_sum_out_stride) { + _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + input_stride, params, params_stride, output_deriv, output_deriv_stride, + deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, + input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, + value_sum_out_stride, deriv_sum_out, deriv_sum_out_stride, + self_repair_sum_out, self_repair_sum_out_stride); +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index c8912b4ebfc..656b82326a0 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1258,6 +1258,66 @@ inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, out_stride, cell_dim, num_rows, out); } +inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int num_rows, const double* input, + const int input_stride, + const double* params, + const int params_stride, + const double* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const double* self_repair_config, + double count, double* input_deriv, + const int input_deriv_stride, + double* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + double* deriv_sum_out, + const int deriv_sum_out_stride, + double* self_repair_sum_out, + const int self_repair_sum_out_stride) { + cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + params, params_stride, output_deriv, + output_deriv_stride, deriv_sum_in, + deriv_sum_in_stride, self_repair_config, count, + input_deriv, input_deriv_stride, params_deriv, + params_deriv_stride, value_sum_out, + value_sum_out_stride, deriv_sum_out, + deriv_sum_out_stride, self_repair_sum_out, + self_repair_sum_out_stride); +} +inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int num_rows, const float* input, + const int input_stride, + const float* params, + const int params_stride, + const float* output_deriv, + const int output_deriv_stride, + const double* deriv_sum_in, + const int deriv_sum_in_stride, + const float* self_repair_config, + double count, float* input_deriv, + const int input_deriv_stride, + float* params_deriv, + const int params_deriv_stride, + double* value_sum_out, + const int value_sum_out_stride, + double* deriv_sum_out, + const int deriv_sum_out_stride, + float* self_repair_sum_out, + const int self_repair_sum_out_stride) { + cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + params, params_stride, output_deriv, + output_deriv_stride, deriv_sum_in, + deriv_sum_in_stride, self_repair_config, count, + input_deriv, input_deriv_stride, params_deriv, + params_deriv_stride, value_sum_out, + value_sum_out_stride, deriv_sum_out, + deriv_sum_out_stride, self_repair_sum_out, + self_repair_sum_out_stride); +} } // namespace kaldi diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 243e696187b..027eb4fd2a7 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -161,7 +161,7 @@ static void UnitTestCuMathComputeLstmNonlinearity() { AssertEqual(Houtput, HDoutput); } - for (int i = 16; i <= 1024; i *= 2) { + for (int i = 16; i <= 2048; i *= 2) { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; @@ 
-290,6 +290,161 @@ void UnitTestLstmNonlinearity() { } } +template +static void UnitTestBackpropLstmNonlinearity() { + for (int i = 0; i < 3; i++) { + int32 num_rows = 1 + Rand() % 200; + int32 cell_dim = 1 + Rand() % 2000; +// KALDI_LOG << num_rows << ", " << cell_dim; + + Matrix hinput(num_rows, 5 * cell_dim); + Matrix hparams(3, cell_dim); + Matrix houtput_deriv(num_rows, 2 * cell_dim); + Matrix hderiv_sum_in(5, cell_dim); + Vector hself_repair_config(10); + double count_in; + Matrix hinput_deriv(num_rows, 5 * cell_dim); + Matrix hparams_deriv(3, cell_dim); + Matrix hvalue_sum_out(5, cell_dim); + Matrix hderiv_sum_out(5, cell_dim); + Matrix hself_repair_sum_out(5, cell_dim); + + hinput.SetRandn(); + hparams.SetRandn(); + houtput_deriv.SetRandn(); + hderiv_sum_in.SetRandn(); + hself_repair_config.SetRandn(); + count_in = Rand() % num_rows; + + hinput_deriv.SetRandn(); + hparams_deriv.SetRandn(); + hvalue_sum_out.SetRandn(); + hderiv_sum_out.SetRandn(); + hself_repair_sum_out.SetRandn(); + + CuMatrix dinput(hinput); + CuMatrix dparams(hparams); + CuMatrix doutput_deriv(houtput_deriv); + CuMatrix dderiv_sum_in(hderiv_sum_in); + CuVector dself_repair_config(hself_repair_config); + + CuMatrix dinput_deriv(hinput_deriv); + CuMatrix dparams_deriv(hparams_deriv); + CuMatrix dvalue_sum_out(hvalue_sum_out); + CuMatrix dderiv_sum_out(hderiv_sum_out); + CuMatrix dself_repair_sum_out(hself_repair_sum_out); + + cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv, + hderiv_sum_in, hself_repair_config, + count_in, (MatrixBase*) NULL, + (MatrixBase*) NULL, + (MatrixBase*) NULL, + (MatrixBase*) NULL, + (MatrixBase*) NULL); + cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in, + dself_repair_config, count_in, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL); + + cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv, + hderiv_sum_in, hself_repair_config, + count_in, (MatrixBase*) NULL, + &hparams_deriv, &hvalue_sum_out, + &hderiv_sum_out, &hself_repair_sum_out); + cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in, + dself_repair_config, count_in, + (CuMatrixBase*) NULL, &dparams_deriv, + &dvalue_sum_out, &dderiv_sum_out, + &dself_repair_sum_out); + + cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv, + hderiv_sum_in, hself_repair_config, + count_in, &hinput_deriv, + (MatrixBase*) NULL, + (MatrixBase*) NULL, + (MatrixBase*) NULL, + (MatrixBase*) NULL); + cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in, + dself_repair_config, count_in, &dinput_deriv, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL); + + cu::CpuBackpropLstmNonlinearity(hinput, hparams, houtput_deriv, + hderiv_sum_in, hself_repair_config, + count_in, &hinput_deriv, &hparams_deriv, + &hvalue_sum_out, &hderiv_sum_out, + &hself_repair_sum_out); + cu::BackpropLstmNonlinearity(dinput, dparams, doutput_deriv, dderiv_sum_in, + dself_repair_config, count_in, &dinput_deriv, + &dparams_deriv, &dvalue_sum_out, + &dderiv_sum_out, &dself_repair_sum_out); + + Matrix hdinput_deriv(dinput_deriv); + Matrix hdparams_deriv(dparams_deriv); + Matrix hdvalue_sum_out(dvalue_sum_out); + Matrix hdderiv_sum_out(dderiv_sum_out); + Matrix hdself_repair_sum_out(dself_repair_sum_out); + +// KALDI_LOG<< "input_deriv" << hinput_deriv << "d" << hdinput_deriv; +// KALDI_LOG<< "hparams_deriv" << hparams_deriv << "d" << hdparams_deriv; +// KALDI_LOG<< 
"hvalue_sum_out" << hvalue_sum_out << "d" << hdvalue_sum_out; +// KALDI_LOG<< "hderiv_sum_out" << hderiv_sum_out << "d" << hdderiv_sum_out; +// KALDI_LOG<< "hself_repair_sum_out" << hself_repair_sum_out << "d" << hdself_repair_sum_out; + + AssertEqual(hinput_deriv, hdinput_deriv); + AssertEqual(hparams_deriv, hdparams_deriv); + AssertEqual(hvalue_sum_out, hdvalue_sum_out); + AssertEqual(hderiv_sum_out, hdderiv_sum_out); + AssertEqual(hself_repair_sum_out, hdself_repair_sum_out); + } + + for (int i = 16; i <= 2048; i *= 2) { + BaseFloat time_in_secs = 0.025; + int32 num_rows = i; + int32 cell_dim = i; + + CuMatrix input(num_rows, 5 * cell_dim); + CuMatrix params(3, cell_dim); + CuMatrix output_deriv(num_rows, 2 * cell_dim); + CuMatrix deriv_sum_in(5, cell_dim); + CuVector self_repair_config(10); + double count_in; + + CuMatrix input_deriv(num_rows, 5 * cell_dim); + CuMatrix params_deriv(3, cell_dim); + CuMatrix value_sum_out(5, cell_dim); + CuMatrix deriv_sum_out(5, cell_dim); + CuMatrix self_repair_sum_out(5, cell_dim); + + input.SetRandn(); + params.SetRandn(); + output_deriv.SetRandn(); + deriv_sum_in.SetRandn(); + self_repair_config.SetRandn(); + count_in = Rand() % num_rows; + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) + cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in, + self_repair_config, count_in, &input_deriv, + ¶ms_deriv, &value_sum_out, + &deriv_sum_out, &self_repair_sum_out); + + + BaseFloat gflops = ((BaseFloat) i * i * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For BackpropLstmNonlinearity" + << (sizeof(Real) == 8 ? "" : "") << ", for dim = " + << i << ", speed was " << gflops << " gigaflops"; + } +} + + template void CudaMathUnitTest() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) @@ -300,6 +455,7 @@ template void CudaMathUnitTest() { UnitTestCuMathSplice(); UnitTestCuMathCopy(); UnitTestLstmNonlinearity(); + UnitTestBackpropLstmNonlinearity(); } } // namespace kaldi diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 806f4e309ab..9d667207914 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -559,6 +559,8 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, } } + + template void BackpropLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, @@ -605,12 +607,90 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - KALDI_ERR << "CUDA version not implemented"; - // notes for Shiyin: - // You could do an 'easy' initial version where we have have one thread per dimension, - // and you can try optimizing this later on. - // Since the cell-dim is usually quite large, like 1024, this is fairly reasonable. - // But up to you. + Timer tim; + // Each thread block is working on 1 row of the data. + // It's best that cell dim is a multiple fo CU1DBLOCK + + + // Use 2D block (8x32 threads) as we need to compute column sum. + // Use 1D grid to cover the data matrix width `cell_dim`. 
+ const int kWarpSize = 32; + dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); +// dim3 dimGrid(n_blocks(cell_dim, dimBlock.x), +// n_blocks(num_rows, dimBlock.y)); +// if (dimGrid.x * dimGrid.y > 1024) { +// dimGrid.y = std::max(1024 / dimGrid.x, 1); +// } + dim3 dimGrid(n_blocks(cell_dim, dimBlock.x)); + if (input_deriv == NULL) { + if (params_deriv == NULL) { + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + input.Data(), input.Stride(), params.Data(), + params.Stride(), output_deriv.Data(), + output_deriv.Stride(), deriv_sum_in.Data(), + deriv_sum_in.Stride(), + self_repair_config.Data(), count_in + 1, + NULL, + 0, + NULL, + 0, + NULL, + 0, + NULL, + 0, + NULL, + 0); + + } else { + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + input.Data(), input.Stride(), params.Data(), + params.Stride(), output_deriv.Data(), + output_deriv.Stride(), deriv_sum_in.Data(), + deriv_sum_in.Stride(), + self_repair_config.Data(), count_in + 1, + NULL, + 0, params_deriv->Data(), + params_deriv->Stride(), + value_sum_out->Data(), + value_sum_out->Stride(), + deriv_sum_out->Data(), + deriv_sum_out->Stride(), + self_repair_sum_out->Data(), + self_repair_sum_out->Stride()); + } + } else { + if (params_deriv == NULL) { + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + input.Data(), input.Stride(), params.Data(), + params.Stride(), output_deriv.Data(), + output_deriv.Stride(), deriv_sum_in.Data(), + deriv_sum_in.Stride(), + self_repair_config.Data(), count_in + 1, + input_deriv->Data(), input_deriv->Stride(), + NULL, + 0, NULL, 0, NULL, 0, NULL, 0); + } else { + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + input.Data(), input.Stride(), params.Data(), + params.Stride(), output_deriv.Data(), + output_deriv.Stride(), deriv_sum_in.Data(), + deriv_sum_in.Stride(), + self_repair_config.Data(), count_in + 1, + input_deriv->Data(), input_deriv->Stride(), + params_deriv->Data(), + params_deriv->Stride(), + value_sum_out->Data(), + value_sum_out->Stride(), + deriv_sum_out->Data(), + deriv_sum_out->Stride(), + self_repair_sum_out->Data(), + self_repair_sum_out->Stride()); + } + } + + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 33b2c4e6473..c4b442e12b1 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -136,7 +136,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}. - @param [out] output_deriv + @param [in] output_deriv A matrix, of dimension N by 2C, containing the derivative of the objective function we're backpropagating, w.r.t. the quantities c_t and m_t (in two blocks of column-dimension C). diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index da45791a065..85c91c5eb0e 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1712,9 +1712,9 @@ class LstmNonlinearityComponent: public UpdatableComponent { virtual void InitFromConfig(ConfigLine *cfl); - NaturalGradientPerElementScaleComponent() { } // use Init to really initialize. + LstmNonlinearityComponent() { } // use Init to really initialize. 
virtual std::string Type() const { - return "NaturalGradientPerElementScaleComponent"; + return "LstmNonlinearityComponent"; } virtual void Read(std::istream &is, bool binary); @@ -1723,8 +1723,8 @@ class LstmNonlinearityComponent: public UpdatableComponent { virtual Component* Copy() const; // Some functions that are specific to this class: - explicit NaturalGradientPerElementScaleComponent( - const NaturalGradientPerElementScaleComponent &other); + explicit LstmNonlinearityComponent( + const LstmNonlinearityComponent &other); void Init(int32 dim, BaseFloat param_mean, BaseFloat param_stddev, int32 rank, int32 update_period, From ce7a32464b423712390d053c32fbf18f1a223d4c Mon Sep 17 00:00:00 2001 From: Shiyin Kang Date: Fri, 25 Nov 2016 04:27:02 +0800 Subject: [PATCH 43/71] Finish LstmNonlinearityComponent; tests now succeed. Conflicts: src/cudamatrix/cu-math.cc src/nnet3/nnet-simple-component.cc src/nnet3/nnet-simple-component.h compilable in gcc5.2 match the cpu impl add comments for the cpu version api --- src/cudamatrix/cu-kernels.cu | 10 +- src/cudamatrix/cu-math.cc | 10 +- src/cudamatrix/cu-math.h | 12 +- src/nnet3/nnet-component-itf.cc | 8 +- src/nnet3/nnet-parse.cc | 2 +- src/nnet3/nnet-parse.h | 2 +- src/nnet3/nnet-simple-component.cc | 356 ++++++++++++++++++++++++++++- src/nnet3/nnet-simple-component.h | 65 ++++-- src/nnet3/nnet-test-utils.cc | 9 +- 9 files changed, 430 insertions(+), 44 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 3fb38b54c20..9da59f92eaa 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2900,11 +2900,11 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real di_part = di_t_input; if (input_deriv) { - input_deriv[i * input_deriv_stride + j] += di_part; - input_deriv[i * input_deriv_stride + j + cell_dim] += df_part; - input_deriv[i * input_deriv_stride + j + cell_dim * 2] += dc_part; - input_deriv[i * input_deriv_stride + j + cell_dim * 3] += do_part; - input_deriv[i * input_deriv_stride + j + cell_dim * 4] += dc_prev; + input_deriv[i * input_deriv_stride + j] = di_part; + input_deriv[i * input_deriv_stride + j + cell_dim] = df_part; + input_deriv[i * input_deriv_stride + j + cell_dim * 2] = dc_part; + input_deriv[i * input_deriv_stride + j + cell_dim * 3] = do_part; + input_deriv[i * input_deriv_stride + j + cell_dim * 4] = dc_prev; } } } diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 9d667207914..2d05ee1cfdc 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -520,11 +520,11 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, Real di_part = di_t_input; if (input_deriv_mat != NULL) { - (*input_deriv_mat)(r, c) += di_part; - (*input_deriv_mat)(r, c + cell_dim) += df_part; - (*input_deriv_mat)(r, c + 2 * cell_dim) += dc_part; - (*input_deriv_mat)(r, c + 3 * cell_dim) += do_part; - (*input_deriv_mat)(r, c + 4 * cell_dim) += dc_prev; + (*input_deriv_mat)(r, c) = di_part; + (*input_deriv_mat)(r, c + cell_dim) = df_part; + (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part; + (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part; + (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev; } } diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index c4b442e12b1..109f18349da 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -104,13 +104,15 @@ void Group2norm(const CuMatrixBase &src, */ template -void CpuComputeLstmNonlinearity(const MatrixBase &input, - const MatrixBase ¶ms, - 
MatrixBase *output); -template void ComputeLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, CuMatrixBase *output); +// This is a version of ComputeLstmNonlinearity that only uses the CPU +// even if a GPU is available. It's made available for testing purposes. +template +void CpuComputeLstmNonlinearity(const MatrixBase &input, + const MatrixBase ¶ms, + MatrixBase *output); /** @@ -207,6 +209,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, CuMatrixBase *self_repair_sum_out); +// This is a version of BackpropLstmNonlinearity that only uses the CPU +// even if a GPU is available. It's made available for testing purposes. template void CpuBackpropLstmNonlinearity(const MatrixBase &input, const MatrixBase ¶ms, diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index b40670407c8..00dd802e091 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -147,6 +147,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new DropoutComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); + } else if (component_type == "LstmNonlinearityComponent") { + ans = new LstmNonlinearityComponent(); } if (ans != NULL) { KALDI_ASSERT(component_type == ans->Type()); @@ -306,12 +308,14 @@ void NonlinearComponent::ZeroStats() { std::string NonlinearComponent::Info() const { std::stringstream stream; - if (InputDim() == OutputDim()) + if (InputDim() == OutputDim()) { stream << Type() << ", dim=" << InputDim(); - else + } else { + // Note: this is a very special case tailored for class NormalizeComponent. stream << Type() << ", input-dim=" << InputDim() << ", output-dim=" << OutputDim() << ", add-log-stddev=true"; + } if (self_repair_lower_threshold_ != BaseFloat(kUnsetThreshold)) stream << ", self-repair-lower-threshold=" << self_repair_lower_threshold_; diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 3bacf455f3b..353b32189d8 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -519,7 +519,7 @@ std::string SummarizeVector(const Vector &vec) { void PrintParameterStats(std::ostringstream &os, const std::string &name, - const CuVector ¶ms, + const CuVectorBase ¶ms, bool include_mean) { os << std::setprecision(4); os << ", " << name << '-'; diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index cd0c3da0654..da3199454da 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -204,7 +204,7 @@ std::string SummarizeVector(const Vector &vec); */ void PrintParameterStats(std::ostringstream &os, const std::string &name, - const CuVector ¶ms, + const CuVectorBase ¶ms, bool include_mean = false); /** Print to 'os' some information about the mean and standard deviation of diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f48885175b4..a745f0cded0 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -25,6 +25,7 @@ #include #include "nnet3/nnet-simple-component.h" #include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" namespace kaldi { namespace nnet3 { @@ -105,8 +106,8 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) { std::string DropoutComponent::Info() const { std::ostringstream stream; - stream << Type() << ", dim = " << dim_ - << ", dropout-proportion = " << dropout_proportion_; + stream << Type() << ", dim=" << dim_ + << ", 
dropout-proportion=" << dropout_proportion_; return stream.str(); } @@ -4366,15 +4367,15 @@ void MaxpoolingComponent::Write(std::ostream &os, bool binary) const { std::string MaxpoolingComponent::Info() const { std::ostringstream stream; stream << Type() - << ", input-x-dim = " << input_x_dim_ - << ", input-y-dim = " << input_y_dim_ - << ", input-z-dim = " << input_z_dim_ - << ", pool-x-size = " << pool_x_size_ - << ", pool-y-size = " << pool_y_size_ - << ", pool-z-size = " << pool_z_size_ - << ", pool-x-step = " << pool_x_step_ - << ", pool-y-step = " << pool_y_step_ - << ", pool-z-step = " << pool_z_step_; + << ", input-x-dim=" << input_x_dim_ + << ", input-y-dim=" << input_y_dim_ + << ", input-z-dim=" << input_z_dim_ + << ", pool-x-size=" << pool_x_size_ + << ", pool-y-size=" << pool_y_size_ + << ", pool-z-size=" << pool_z_size_ + << ", pool-x-step=" << pool_x_step_ + << ", pool-y-step=" << pool_y_step_ + << ", pool-z-step=" << pool_z_step_; return stream.str(); } @@ -5001,5 +5002,338 @@ void CompositeComponent::SetComponent(int32 i, Component *component) { components_[i] = component; } + +int32 LstmNonlinearityComponent::InputDim() const { + int32 cell_dim = value_sum_.NumCols(); + return cell_dim * 5; +} + +int32 LstmNonlinearityComponent::OutputDim() const { + int32 cell_dim = value_sum_.NumCols(); + return cell_dim * 2; +} + + +void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { + ReadUpdatableCommon(is, binary); // Read opening tag and learning rate. + ExpectToken(is, binary, ""); + params_.Read(is, binary); + ExpectToken(is, binary, ""); + value_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + self_repair_config_.Read(is, binary); + ExpectToken(is, binary, ""); + self_repair_total_.Read(is, binary); + + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + + // For the on-disk format, we normalze value_sum_, deriv_sum_ and + // self_repair_total_ by dividing by the count, but in memory they are scaled + // by the count. [for self_repair_total_, the scaling factor is count_ * + // cell_dim]. + value_sum_.Scale(count_); + deriv_sum_.Scale(count_); + int32 cell_dim = params_.NumCols(); + self_repair_total_.Scale(count_ * cell_dim); + + InitNaturalGradient(); + + ExpectToken(is, binary, ""); + +} + +void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // Read opening tag and learning rate. 
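
The Read() comments above describe the convention for the diagnostic statistics: averages on disk, count-scaled sums in memory. A small numpy sketch of that round trip (made-up numbers, purely illustrative and not part of the patch):

    import numpy as np

    # In memory the stats are sums scaled by the frame count; Write() divides
    # by the count before storing, and Read() multiplies it back in.
    count, cell_dim = 250.0, 4
    value_sum = count * np.random.rand(5, cell_dim)     # in-memory representation

    value_avg_on_disk = value_sum / count                # what Write() emits
    value_sum_restored = count * value_avg_on_disk       # what Read() rebuilds
    assert np.allclose(value_sum, value_sum_restored)

    # self_repair_total_ is normalized by count * cell_dim instead, so the
    # stored quantity is a per-dimension self-repair proportion.
    self_repair_total = 0.05 * count * cell_dim * np.ones(5)
    self_repair_prob = self_repair_total / (count * cell_dim)   # -> all 0.05
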
+ + WriteToken(os, binary, ""); + params_.Write(os, binary); + WriteToken(os, binary, ""); + { + Matrix value_avg(value_sum_); + if (count_ != 0.0) + value_avg.Scale(1.0 / count_); + value_avg.Write(os, binary); + } + WriteToken(os, binary, ""); + { + Matrix deriv_avg(deriv_sum_); + if (count_ != 0.0) + deriv_avg.Scale(1.0 / count_); + deriv_avg.Write(os, binary); + } + WriteToken(os, binary, ""); + self_repair_config_.Write(os, binary); + WriteToken(os, binary, ""); + { + int32 cell_dim = params_.NumCols(); + Vector self_repair_prob(self_repair_total_); + if (count_ != 0.0) + self_repair_prob.Scale(1.0 / (count_ * cell_dim)); + self_repair_prob.Write(os, binary); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); +} + + + +std::string LstmNonlinearityComponent::Info() const { + std::ostringstream stream; + int32 cell_dim = params_.NumCols(); + stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim; + PrintParameterStats(stream, "w_ic", params_.Row(0)); + PrintParameterStats(stream, "w_fc", params_.Row(1)); + PrintParameterStats(stream, "w_oc", params_.Row(2)); + + // Note: some of the following code mirrors the code in + // UpdatableComponent::Info(), in nnet-component-itf.cc. + if (count_ > 0) { + stream << ", count=" << std::setprecision(3) << count_ + << std::setprecision(6); + } + static const char *nonlin_names[] = { "i_t_sigmoid", "f_t_sigmoid", "c_t_tanh", + "o_t_sigmoid", "m_t_tanh" }; + for (int32 i = 0; i < 5; i++) { + stream << ", " << nonlin_names[i] << "={"; + stream << " self-repair-lower-threshold=" << self_repair_config_(i) + << ", self-repair-scale=" << self_repair_config_(i + 5); + + if (count_ != 0) { + BaseFloat self_repaired_proportion = + self_repair_total_(i) / (count_ * cell_dim); + stream << ", self-repaired-proportion=" << self_repaired_proportion; + Vector value_sum(value_sum_.Row(i)), + deriv_sum(deriv_sum_.Row(i)); + Vector value_avg(value_sum), deriv_avg(deriv_sum); + value_avg.Scale(1.0 / count_); + deriv_avg.Scale(1.0 / count_); + stream << ", value-avg=" << SummarizeVector(value_avg) + << ", deriv-avg=" << SummarizeVector(deriv_avg); + } + stream << " }"; + } + return stream.str(); +} + + +Component* LstmNonlinearityComponent::Copy() const { + return new LstmNonlinearityComponent(*this); +} + +void LstmNonlinearityComponent::Scale(BaseFloat scale) { + params_.Scale(scale); + value_sum_.Scale(scale); + deriv_sum_.Scale(scale); + self_repair_total_.Scale(scale); + count_ *= scale; +} + +void LstmNonlinearityComponent::Add(BaseFloat alpha, + const Component &other_in) { + const LstmNonlinearityComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + params_.AddMat(alpha, other->params_); + value_sum_.AddMat(alpha, other->value_sum_); + deriv_sum_.AddMat(alpha, other->deriv_sum_); + self_repair_total_.AddVec(alpha, other->self_repair_total_); + count_ += alpha * other->count_; +} + +void LstmNonlinearityComponent::SetZero(bool treat_as_gradient) { + if (treat_as_gradient) { + SetActualLearningRate(1.0); + is_gradient_ = true; + } + params_.SetZero(); + value_sum_.SetZero(); + deriv_sum_.SetZero(); + self_repair_total_.SetZero(); + count_ = 0.0; +} + +void LstmNonlinearityComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_params(params_.NumRows(), params_.NumCols()); + temp_params.SetRandn(); + params_.AddMat(stddev, temp_params); +} + +BaseFloat LstmNonlinearityComponent::DotProduct( + const UpdatableComponent &other_in) const { + const LstmNonlinearityComponent 
*other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + return TraceMatMat(params_, other->params_, kTrans); +} + +int32 LstmNonlinearityComponent::NumParameters() const { + return params_.NumRows() * params_.NumCols(); +} + +void LstmNonlinearityComponent::Vectorize(VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + params->CopyRowsFromMat(params_); +} + + +void LstmNonlinearityComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + params_.CopyRowsFromVec(params); +} + + +void LstmNonlinearityComponent::Propagate( + const ComponentPrecomputedIndexes *, // indexes + const CuMatrixBase &in, + CuMatrixBase *out) const { + cu::ComputeLstmNonlinearity(in, params_, out); +} + + +void LstmNonlinearityComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + + if (to_update_in == NULL) { + cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, + deriv_sum_, self_repair_config_, + count_, in_deriv, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL, + (CuMatrixBase*) NULL); + } else { + LstmNonlinearityComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(to_update != NULL); + + int32 cell_dim = params_.NumCols(); + CuMatrix params_deriv(3, cell_dim, kUndefined); + CuMatrix self_repair_total(5, cell_dim, kUndefined); + + cu::BackpropLstmNonlinearity(in_value, params_, out_deriv, + deriv_sum_, self_repair_config_, + count_, in_deriv, ¶ms_deriv, + &(to_update->value_sum_), + &(to_update->deriv_sum_), + &self_repair_total); + + CuVector self_repair_total_sum(5); + self_repair_total_sum.AddColSumMat(1.0, self_repair_total, 0.0); + to_update->self_repair_total_.AddVec(1.0, self_repair_total_sum); + to_update->count_ += static_cast(in_value.NumRows()); + + BaseFloat scale = 1.0; + if (!to_update->is_gradient_) { + to_update->preconditioner_.PreconditionDirections( + ¶ms_deriv, NULL, &scale); + } + to_update->params_.AddMat(to_update->learning_rate_ * scale, + params_deriv); + } +} + +LstmNonlinearityComponent::LstmNonlinearityComponent( + const LstmNonlinearityComponent &other): + UpdatableComponent(other), + params_(other.params_), + value_sum_(other.value_sum_), + deriv_sum_(other.deriv_sum_), + self_repair_config_(other.self_repair_config_), + self_repair_total_(other.self_repair_total_), + count_(other.count_), + preconditioner_(other.preconditioner_) { } + +void LstmNonlinearityComponent::Init( + int32 cell_dim, BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale) { + KALDI_ASSERT(cell_dim > 0 && param_stddev >= 0.0 && + tanh_self_repair_threshold >= 0.0 && + tanh_self_repair_threshold <= 1.0 && + sigmoid_self_repair_threshold >= 0.0 && + sigmoid_self_repair_threshold <= 0.25 && + self_repair_scale >= 0.0 && self_repair_scale <= 0.1); + params_.Resize(3, cell_dim); + params_.SetRandn(); + params_.Scale(param_stddev); + value_sum_.Resize(5, cell_dim); + deriv_sum_.Resize(5, cell_dim); + self_repair_config_.Resize(10); + self_repair_config_.Range(0, 5).Set(sigmoid_self_repair_threshold); + self_repair_config_(2) = tanh_self_repair_threshold; + self_repair_config_(4) = tanh_self_repair_threshold; + self_repair_config_.Range(5, 5).Set(self_repair_scale); + self_repair_total_.Resize(5); + 
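
The Init() code just above lays out the 10-dimensional self-repair configuration vector. A short Python sketch of the resulting layout, using the default thresholds and scale that InitFromConfig() supplies further below (illustrative only):

    import numpy as np

    # Five self-repair thresholds (order: i_t_sigmoid, f_t_sigmoid, c_t_tanh,
    # o_t_sigmoid, m_t_tanh) followed by five self-repair scales.
    sigmoid_threshold, tanh_threshold, self_repair_scale = 0.05, 0.2, 1.0e-05
    cfg = np.empty(10)
    cfg[0:5] = sigmoid_threshold
    cfg[2] = tanh_threshold      # c_t_tanh
    cfg[4] = tanh_threshold      # m_t_tanh
    cfg[5:10] = self_repair_scale
    print(cfg.tolist())   # [0.05, 0.05, 0.2, 0.05, 0.2, 1e-05, 1e-05, 1e-05, 1e-05, 1e-05]
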
count_ = 0.0; + InitNaturalGradient(); + +} + +void LstmNonlinearityComponent::InitNaturalGradient() { + // As regards the configuration for the natural-gradient preconditioner, we + // don't make it configurable from the command line-- it's unlikely that any + // differences from changing this would be substantial enough to effectively + // tune the configuration. Because the preconditioning code doesn't 'see' the + // derivatives from individual frames, but only averages over the minibatch, + // there is a fairly small amount of data available to estimate the Fisher + // information matrix, so we set the rank, update period and + // num-samples-history to smaller values than normal. + preconditioner_.SetRank(20); + preconditioner_.SetUpdatePeriod(2); + preconditioner_.SetNumSamplesHistory(1000.0); +} + + +void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { + InitLearningRatesFromConfig(cfl); + bool ok = true; + int32 cell_dim; + // these self-repair thresholds are the normal defaults for tanh and sigmoid + // respectively. If, later on, we decide that we want to support different + // self-repair config values for the individual sigmoid and tanh + // nonlinearities, we can modify this code then. + BaseFloat tanh_self_repair_threshold = 0.2, + sigmoid_self_repair_threshold = 0.05, + self_repair_scale = 1.0e-05; + // param_stddev is the stddev of the parameters. it may be better to + // use a smaller value but this was the default in the python scripts + // for a while. + BaseFloat param_stddev = 1.0; + ok = ok && cfl->GetValue("cell-dim", &cell_dim); + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("tanh-self-repair-threshold", + &tanh_self_repair_threshold); + cfl->GetValue("sigmoid-self-repair-threshold", + &sigmoid_self_repair_threshold); + cfl->GetValue("self-repair-scale", &self_repair_scale); + + // We may later on want to make it possible to initialize the different + // parameters w_ic, w_fc and w_oc with different biases. We'll implement + // that when and if it's needed. + + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + if (!ok) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + Init(cell_dim, param_stddev, tanh_self_repair_threshold, + sigmoid_self_repair_threshold, self_repair_scale); +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 85c91c5eb0e..f09a989759a 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -180,6 +180,8 @@ class NormalizeComponent: public Component { Init(input_dim, target_rms, add_log_stddev); } explicit NormalizeComponent(const NormalizeComponent &other); + // note: there is some special code in NonlinerComponent::Info() that + // specifically caters to this class. virtual int32 Properties() const { return (add_log_stddev_ ? 
kSimpleComponent|kBackpropNeedsInput|kBackpropAdds : @@ -1560,7 +1562,6 @@ class ConvolutionComponent: public UpdatableComponent { const std::vector *>& out_deriv_batch); - virtual void Read(std::istream &is, bool binary); virtual void Write(std::ostream &os, bool binary) const; @@ -1708,34 +1709,61 @@ class ConvolutionComponent: public UpdatableComponent { class LstmNonlinearityComponent: public UpdatableComponent { public: + virtual int32 InputDim() const; + virtual int32 OutputDim() const; virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - LstmNonlinearityComponent() { } // use Init to really initialize. - virtual std::string Type() const { - return "LstmNonlinearityComponent"; + virtual std::string Type() const { return "LstmNonlinearityComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; } + virtual void Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + Component *to_update_in, + CuMatrixBase *in_deriv) const; + virtual void Read(std::istream &is, bool binary); virtual void Write(std::ostream &os, bool binary) const; virtual Component* Copy() const; + // Some functions from base-class UpdatableComponent. + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void SetZero(bool treat_as_gradient); + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + // Some functions that are specific to this class: explicit LstmNonlinearityComponent( const LstmNonlinearityComponent &other); - void Init(int32 dim, BaseFloat param_mean, - BaseFloat param_stddev, int32 rank, int32 update_period, - BaseFloat num_samples_history, BaseFloat alpha, - BaseFloat max_change_per_minibatch); + void Init(int32 cell_dim, BaseFloat param_stddev, + BaseFloat tanh_self_repair_threshold, + BaseFloat sigmoid_self_repair_threshold, + BaseFloat self_repair_scale); + void Init(std::string vector_filename, int32 rank, int32 update_period, BaseFloat num_samples_history, BaseFloat alpha, BaseFloat max_change_per_minibatch); private: + // Initializes the natural-gradient object with the configuration we + // use for this object, which for now is hardcoded at the C++ level. + void InitNaturalGradient(); + // Notation: C is the cell dimension; it equals params_.NumCols(). @@ -1747,8 +1775,6 @@ class LstmNonlinearityComponent: public UpdatableComponent { // equations (1) through (5), this is the sum of the values of the nonliearities // (used for diagnostics only). It is comparable to value_sum_ vector // in base-class NonlinearComponent. - // Note: to save time and simplify the code, when using GPU we don't always - // store stats for all of the members of the minibatch, just a subset. CuMatrix value_sum_; // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in @@ -1756,15 +1782,26 @@ class LstmNonlinearityComponent: public UpdatableComponent { // nonliearities (used for diagnostics and to control self-repair). It is // comparable to the deriv_sum_ vector in base-class // NonlinearComponent. 
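
Based on the comments in this patch (input columns laid out as i_part, f_part, c_part, o_part and c_{t-1}; params rows w_ic, w_fc, w_oc; output columns c_t and m_t), the following numpy sketch shows the per-row computation that Propagate()/ComputeLstmNonlinearity() performs. This is a reference illustration only; it ignores the self-repair terms and is not the Kaldi implementation:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_nonlinearity_ref(inp, params):
        # inp: (num_rows, 5*C); params: (3, C) with rows w_ic, w_fc, w_oc.
        i_part, f_part, c_part, o_part, c_prev = np.split(inp, 5, axis=1)
        w_ic, w_fc, w_oc = params
        i_t = sigmoid(i_part + w_ic * c_prev)          # input gate
        f_t = sigmoid(f_part + w_fc * c_prev)          # forget gate
        c_t = f_t * c_prev + i_t * np.tanh(c_part)     # new cell state
        o_t = sigmoid(o_part + w_oc * c_t)             # output gate
        m_t = o_t * np.tanh(c_t)                       # cell output
        return np.hstack([c_t, m_t])                   # (num_rows, 2*C)

    out = lstm_nonlinearity_ref(np.random.randn(7, 15), 0.1 * np.random.randn(3, 3))
    assert out.shape == (7, 6)    # InputDim() = 5*C, OutputDim() = 2*C, here C = 3
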
- // Note: to save time and simplify the code, when using GPU we don't always - // store stats for all of the members of the minibatch, just a subset. CuMatrix deriv_sum_; + // This matrix has dimension 10. The contents are a block of 5 self-repair + // thresholds (typically "0.05 0.05 0.2 0.05 0.2"), then a block of 5 + // self-repair scales (typically all 0.00001). These are for each of the 5 + // nonlinearities in the LSTM component in turn (see comments in cu-math.h for + // more info). + CuVector self_repair_config_; + + // This matrix has dimension 5. For each of the 5 nonlinearities in the LSTM + // component (see comments in cu-math.h for more info), it contains the total, + // over all frames represented in count_, of the number of dimensions that + // were subject to self_repair. To get the self-repair proportion you should + // divide by (count_ times cell_dim_). + CuVector self_repair_total_; + // The total count (number of frames) corresponding to the stats in value_sum_ // and deriv_sum_. double count_; - // Preconditioner for the parameters of this component [operates in the space // of dimension C]. // The preconditioner stores its own configuration values; we write and read diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 0b000b5b4ef..170ea51ca8f 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1104,7 +1104,7 @@ void ComputeExampleComputationRequestSimple( static void GenerateRandomComponentConfig(std::string *component_type, std::string *config) { - int32 n = RandInt(0, 29); + int32 n = RandInt(0, 30); BaseFloat learning_rate = 0.001 * RandInt(1, 3); std::ostringstream os; @@ -1403,6 +1403,13 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " dropout-proportion=" << RandUniform(); break; } + case 30: { + *component_type = "LstmNonlinearityComponent"; + // set self-repair scale to zero so the derivative tests will pass. 
+ os << "cell-dim=" << RandInt(1, 200) + << " self-repair-scale=0.0"; + break; + } default: KALDI_ERR << "Error generating random component"; } From ed6880cb0d689f637257f415f84a8a31c4c8e62b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 27 Nov 2016 01:02:46 -0500 Subject: [PATCH 44/71] Part of the way towards testing the 'new' LSTM layer (first building xconfig-based setups on tedlium s5_r2) --- egs/tedlium/s5_r2/local/chain/run_tdnn.sh | 200 +------------- .../s5_r2/local/chain/tuning/run_tdnn_1a.sh | 199 ++++++++++++++ .../s5_r2/local/chain/tuning/run_tdnn_1b.sh | 248 ++++++++++++++++++ egs/tedlium/s5_r2/local/nnet3/run_lstm.sh | 178 +------------ .../s5_r2/local/nnet3/tuning/run_lstm_1a.sh | 177 +++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 143 +++++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 5 +- 7 files changed, 767 insertions(+), 383 deletions(-) mode change 100755 => 120000 egs/tedlium/s5_r2/local/chain/run_tdnn.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh mode change 100755 => 120000 egs/tedlium/s5_r2/local/nnet3/run_lstm.sh create mode 100755 egs/tedlium/s5_r2/local/nnet3/tuning/run_lstm_1a.sh diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn.sh deleted file mode 100755 index 82647b81767..00000000000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn.sh +++ /dev/null @@ -1,199 +0,0 @@ -#!/bin/bash - - -# by default, with cleanup: -# local/chain/run_tdnn.sh - -# without cleanup: -# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run the corresponding non-chain nnet3 system -# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 450 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..82647b81767 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,199 @@ +#!/bin/bash + + +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..72070551ec8 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,248 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). 
+# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize "$xent_regularize" \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/nnet3/run_lstm.sh b/egs/tedlium/s5_r2/local/nnet3/run_lstm.sh deleted file mode 100755 index 5fbeb79991c..00000000000 --- a/egs/tedlium/s5_r2/local/nnet3/run_lstm.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash - -# This is the standard "lstm" system, built in nnet3; this script -# is the version that's meant to run with data-cleanup, that doesn't -# support parallel alignments. - - -# by default, with cleanup: -# local/nnet3/run_lstm.sh - -# without cleanup: -# local/nnet3/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - - -set -e -o pipefail -u - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. 
_cleaned - -# Options which are not passed through to run_ivector_common.sh -affix= -common_egs_dir= -reporting_email= - -# LSTM options -train_stage=-10 -splice_indexes="-2,-1,0,1,2 0 0" -lstm_delay=" -1 -2 -3 " -label_delay=5 -num_lstm_layers=3 -cell_dim=1024 -hidden_dim=1024 -recurrent_projection_dim=256 -non_recurrent_projection_dim=256 -chunk_width=20 -chunk_left_context=40 -chunk_right_context=0 -max_param_change=2.0 - -# training options -srand=0 -num_epochs=6 -initial_effective_lrate=0.0003 -final_effective_lrate=0.00003 -num_jobs_initial=3 -num_jobs_final=15 -momentum=0.5 -num_chunk_per_minibatch=100 -samples_per_iter=20000 -remove_egs=true - -#decode options -extra_left_context= -extra_right_context= -frames_per_chunk= - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat </dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - - -exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/run_lstm.sh b/egs/tedlium/s5_r2/local/nnet3/run_lstm.sh new file mode 120000 index 00000000000..c53740399ce --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_lstm.sh @@ -0,0 +1 @@ +tuning/run_lstm_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_lstm_1a.sh new file mode 100755 index 00000000000..5fbeb79991c --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_lstm_1a.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# This is the standard "lstm" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# by default, with cleanup: +# local/nnet3/run_lstm.sh + +# without cleanup: +# local/nnet3/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. 
_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 +max_param_change=2.0 + +# training options +srand=0 +num_epochs=6 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=15 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 7b37958f81b..db7424e3f0f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -24,7 +24,6 @@ # cell-dim=-1 [Dimension of the cell] # delay=-1 [Delay in the recurrent connections of the LSTM ] # clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] -# norm-based-clipping=True [specifies if the gradient clipping has to activated based on total norm or based on per-element magnitude] # self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] @@ -38,7 +37,6 @@ def set_default_configs(self): self.config = {'input':'[-1]', 'cell-dim' : -1, # this is a compulsory argument 'clipping-threshold' : 30.0, - 'norm-based-clipping' : True, 'delay' : -1, 'ng-per-element-scale-options' : ' max-change=0.75', 'ng-affine-options' : ' max-change=0.75 ', @@ -220,7 +218,6 @@ def generate_lstm_config(self): # non_recurrent_projection_dim [Dimension of the projection in non-recurrent connections] # delay=-1 [Delay in the recurrent connections of the LSTM ] # clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. 
This is the threshold used to decide if clipping has to be activated ] -# norm-based-clipping=True [specifies if the gradient clipping has to activated based on total norm or based on per-element magnitude] # self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] @@ -237,7 +234,6 @@ def set_default_configs(self): 'recurrent-projection-dim' : -1, 'non-recurrent-projection-dim' : -1, 'clipping-threshold' : 30.0, - 'norm-based-clipping' : True, 'delay' : -1, 'ng-per-element-scale-options' : ' max-change=0.75 ', 'ng-affine-options' : ' max-change=0.75 ', @@ -530,3 +526,142 @@ def generate_lstm_config(self): configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) return configs + + +# This class is for lines like +# 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph without output projections. +# Unlike 'lstm-layer', the core nonlinearities of the LSTM are done in a special-purpose +# component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined +# into one. +# +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] +# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +class XconfigFastLstmLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-lstm-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'clipping-threshold' : 30.0, + 'delay' : -1, + # if you want to set 'self-repair-scale' (c.f. the + # self-repair-scale-nonlinearity config value in older LSTM layers), you can + # add 'self-repair-scale=xxx' to + # lstm-nonlinearity-options. + 'lstm-nonlinearity-options' : ' max-change=0.75', + # the affine layer contains 4 of our old layers -> use a + # larger max-change than the normal value of 0.75. 
+ 'ng-affine-options' : ' max-change=1.5', + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0 + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.InputDim() + + def check_configs(self): + key = 'cell-dim' + if self.config['cell-dim'] <= 0: + raise xparser_error("cell-dim has invalid value {0}.".format(self.config[key]), self.str()) + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'm' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) + + return self.config['cell-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_lstm_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + delay = self.config['delay'] + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + lstm_str = self.config['lstm-nonlinearity-options'] + + + configs = [] + + # the equations implemented here are + # TODO: write these + # naming convention + # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("### Components for the LTSM layer named '{0}'".format(name)) + configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim * 4, affine_str)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") + configs.append("Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") + configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") + configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, lstm_str)) + configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") + configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + configs.append("### Nodes for the components above.") + configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " + "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + configs.append("dim-range-node name={0}.c input={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m input={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.c_trunc input={0}.c".format(name)) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 4976084a977..111abd4cb89 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -27,7 +27,8 @@ 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, - 'lstmpc-layer' : xlayers.XconfigLstmpcLayer + 'lstmpc-layer' : xlayers.XconfigLstmpcLayer, + 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer } # Converts a line as parsed by ParseConfigLine() into a first @@ -90,5 +91,3 @@ def read_xconfig_file(xconfig_filename): sys.argv[0], xconfig_filename)) f.close() return all_layers - - From 0c6c424555cea6a5fac7aefeb71432d8307f4c86 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 27 Nov 2016 15:27:59 -0500 Subject: [PATCH 45/71] raw_python_script: generate_plots --- .../s5/steps/libs/nnet3/report/__init__.py | 2 + .../s5/steps/libs/nnet3/report/log_parse.py | 15 +- .../s5/steps/nnet3/report/generate_plots.py | 555 ++++++++++++------ 3 files changed, 377 insertions(+), 195 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/__init__.py b/egs/wsj/s5/steps/libs/nnet3/report/__init__.py index 2c94aa7e20b..0566735d709 100644 --- a/egs/wsj/s5/steps/libs/nnet3/report/__init__.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/__init__.py @@ -3,4 +3,6 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. 
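
Stepping back to the fast-lstm-layer expansion added above: a rough Python sketch of the dimension bookkeeping implied by the generated config lines (the keys mirror the generated node names; the concrete input_dim/cell_dim values are made up for illustration):

    def fast_lstm_dims(input_dim, cell_dim):
        # Shapes/dimensions of the nodes created by generate_lstm_config().
        return {
            'W_all':       (input_dim + cell_dim, 4 * cell_dim),  # affine on Append(input, c_{t-delay})
            'lstm_nonlin': (5 * cell_dim, 2 * cell_dim),          # (i, f, c, o parts, c_prev) -> (c_t, m_t)
            'c':           cell_dim,                              # dim-range-node at offset 0
            'm':           cell_dim,                              # dim-range-node at offset cell_dim
        }

    print(fast_lstm_dims(input_dim=296, cell_dim=512))
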
+import log_parse + __all__ = ["log_parse"] diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 7c34de6d752..88a77d4d2d0 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -6,11 +6,14 @@ from __future__ import division import datetime +import logging import re -import sys import libs.common as common_lib +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + def parse_progress_logs_for_nonlinearity_stats(exp_dir): """ Parse progress logs for mean and std stats for non-linearities. @@ -158,7 +161,7 @@ def parse_progress_logs_for_clipped_proportion(exp_dir): 'cp_per_iter_per_component': cp_per_iter_per_component} -def parse_progress_logs_for_param_diffP(exp_dir, pattern, logger=None): +def parse_progress_logs_for_param_diff(exp_dir, pattern): """ Parse progress logs for per-component parameter differences. e.g. for a line that is parsed from progress.*.log: @@ -256,7 +259,7 @@ def parse_train_logs(exp_dir): return train_times -def parse_prob_logs(exp_dir, key='accuracy'): +def parse_prob_logs(exp_dir, key='accuracy', output="output"): train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) train_prob_strings = common_lib.run_kaldi_command( @@ -278,7 +281,7 @@ def parse_prob_logs(exp_dir, key='accuracy'): ".*compute_prob_.*\.([0-9]+).log:LOG " ".nnet3.*compute-prob:PrintTotalStats..:" "nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for " - "'output'.*is ([0-9.\-e]+) .*per frame") + "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) train_loss = {} valid_loss = {} @@ -301,9 +304,9 @@ def parse_prob_logs(exp_dir, key='accuracy'): float(valid_loss[x])), iters) -def generate_accuracy_report(exp_dir, key="accuracy"): +def generate_accuracy_report(exp_dir, key="accuracy", output="output"): times = parse_train_logs(exp_dir) - data = parse_prob_logs(exp_dir, key) + data = parse_prob_logs(exp_dir, key, output) report = [] report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") for x in data: diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ba1bb6c8b01..b06cfc03e5c 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -1,80 +1,107 @@ #!/usr/bin/env python - -# Copyright 2016 Vijayaditya Peddinti. +# Copyright 2016 Vijayaditya Peddinti +# 2016 Vimal Manohar # Apache 2.0. -import warnings -import imp import argparse -import os import errno import logging +import os import re -import subprocess -train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +import sys +import warnings + +sys.path.insert(0, 'steps') +import libs.nnet3.report.log_parse as log_parse +import libs.common as common_lib try: import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt - from matplotlib.backends.backend_pdf import PdfPages import numpy as np - plot = True + g_plot = True except ImportError: - warnings.warn(""" -This script requires matplotlib and numpy. Please install them to generate plots. Proceeding with generation of tables. -If you are on a cluster where you do not have admin rights you could try using virtualenv.""") - plot = False + warnings.warn( + """This script requires matplotlib and numpy. + Please install them to generate plots. + Proceeding with generation of tables. 
+        If you are on a cluster where you do not have admin rights you could
+        try using virtualenv.""")
+    g_plot = False

-nlp = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py')
-logger = logging.getLogger(__name__)
+logger = logging.getLogger('libs')
 logger.setLevel(logging.INFO)
 handler = logging.StreamHandler()
 handler.setLevel(logging.INFO)
-formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s')
+formatter = logging.Formatter("%(asctime)s [%(filename)s:%(lineno)s - "
+                              "%(funcName)s - %(levelname)s ] %(message)s")
 handler.setFormatter(formatter)
 logger.addHandler(handler)
 logger.info('Generating plots')

-def GetArgs():
-    parser = argparse.ArgumentParser(description="""
-Parses the training logs and generates a variety of plots.
-example : steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 exp/nnet3/tdnn exp/nnet3/tdnn/report
-""")
-    parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables")
-    parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1)
-    parser.add_argument("--objective-type", type=str, default="linear", choices=["linear","quadratic","chain"], help="Objective function used during training -- determines which plots are to be plotted.");
-    parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn")
-    parser.add_argument("output_dir", help="experiment directory, e.g. exp/nnet3/tdnn/report")
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""Parses the training logs and generates a variety of
+        plots.
+        e.g.: steps/nnet3/report/generate_plots.py \\
+        --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 \\
+        exp/nnet3/tdnn exp/nnet3/tdnn/report""")
+
+    parser.add_argument("--comparison-dir", type=str, action='append',
+                        help="other experiment directories for comparison. "
+                        "These will only be used for plots, not tables")
+    parser.add_argument("--start-iter", type=int,
+                        help="Iteration from which plotting will start",
+                        default=1)
+    parser.add_argument("--is-chain", type=str, default=False,
+                        action=common_lib.StrToBoolAction,
+                        help="Set to true if the model was trained with the "
+                        "'chain' objective; determines which plots are "
+                        "generated")
+    parser.add_argument("--output-nodes", type=str, default=None,
+                        action=common_lib.NullstrToNoneAction,
+                        help="""List of space separated
+                        <output-node>:<objective-type> entities,
+                        one for each output node""")
+    parser.add_argument("exp_dir",
+                        help="experiment directory, e.g. exp/nnet3/tdnn")
+    parser.add_argument("output_dir",
+                        help="experiment directory, "
+                        "e.g. exp/nnet3/tdnn/report")

     args = parser.parse_args()

     if args.comparison_dir is not None and len(args.comparison_dir) > 6:
-        raise Exception("max 6 --comparison-dir options can be specified. If you want to compare with more comparison_dir, you would have to carefully tune the plot_colors variable which specified colors used for plotting.")
-    assert(args.start_iter >= 1)
+        raise Exception(
+            """max 6 --comparison-dir options can be specified.
+ If you want to compare with more comparison_dir, you would have to + carefully tune the plot_colors variable which specified colors used + for plotting.""") + assert args.start_iter >= 1 return args -plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan' ] +g_plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan'] class LatexReport: + """Class for writing a Latex report""" + def __init__(self, pdf_file): self.pdf_file = pdf_file - self.document=[] + self.document = [] self.document.append(""" \documentclass[prl,10pt,twocolumn]{revtex4} \usepackage{graphicx} % Used to import the graphics \\begin{document} """) - def AddFigure(self, figure_pdf, title): - # we will have keep extending this replacement list based on errors during compilation - # escaping underscores in the title - title = "\\texttt{"+re.sub("_","\_", title)+"}" + def add_figure(self, figure_pdf, title): + """we will have keep extending this replacement list based on errors + during compilation escaping underscores in the title""" + title = "\\texttt{"+re.sub("_", "\_", title)+"}" fig_latex = """ %... \\newpage @@ -89,11 +116,11 @@ def AddFigure(self, figure_pdf, title): """ self.document.append(fig_latex) - def Close(self): + def close(self): self.document.append("\end{document}") - return self.Compile() + return self.compile() - def Compile(self): + def compile(self): root, ext = os.path.splitext(self.pdf_file) dir_name = os.path.dirname(self.pdf_file) latex_file = root + ".tex" @@ -102,25 +129,33 @@ def Compile(self): lat_file.close() logger.info("Compiling the latex report.") try: - proc = subprocess.Popen(['pdflatex', '-interaction=batchmode', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - proc.communicate() + common_lib.run_kaldi_command( + "pdflatex -interaction=batchmode " + "-output-directory={0} {1}".format(dir_name, latex_file)) except Exception as e: - logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) + logger.warning("There was an error compiling the latex file {0}, " + "please do it manually: {1}".format(latex_file, + e.errstr)) return False return True -def LatexCompliantName(name_string): - # this function is required as latex does not allow all the component names - # allowed by nnet3. - # Identified incompatibilities : - # 1. latex does not allow dot(.) in file names - # + +def latex_compliant_name(name_string): + """this function is required as latex does not allow all the component names + allowed by nnet3. + Identified incompatibilities : + 1. latex does not allow dot(.) 
in file names + """ node_name_string = re.sub("\.", "_dot_", name_string) return node_name_string -def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): - assert(start_iter >= 1) + +def generate_accuracy_plots(exp_dir, output_dir, plot, key='accuracy', + file_basename='accuracy', comparison_dir=None, + start_iter=1, + latex_report=None, output_name='output'): + assert start_iter >= 1 if plot: fig = plt.figure() @@ -130,37 +165,51 @@ def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_base dirs = [exp_dir] + comparison_dir index = 0 for dir in dirs: - [accuracy_report, accuracy_times, accuracy_data] = nlp.GenerateAccuracyReport(dir, key) + [accuracy_report, accuracy_times, + accuracy_data] = log_parse.generate_accuracy_report(dir, key, + output_name) if index == 0: # this is the main experiment directory - acc_file = open("{0}/{1}.log".format(output_dir, file_basename), "w") - acc_file.write(accuracy_report) - acc_file.close() + with open("{0}/{1}.log".format(output_dir, + file_basename), "w") as f: + f.write(accuracy_report) if plot: - color_val = plot_colors[index] + color_val = g_plot_colors[index] data = np.array(accuracy_data) if data.shape[0] == 0: raise Exception("Couldn't find any rows for the accuracy plot") - data = data[data[:,0]>=start_iter, :] - plot_handle, = plt.plot(data[:, 0], data[:, 1], color = color_val, linestyle = "--", label = "train {0}".format(dir)) + data = data[data[:, 0] >= start_iter, :] + plot_handle, = plt.plot(data[:, 0], data[:, 1], color=color_val, + linestyle="--", + label="train {0}".format(dir)) plots.append(plot_handle) - plot_handle, = plt.plot(data[:, 0], data[:, 2], color = color_val, label = "valid {0}".format(dir)) + plot_handle, = plt.plot(data[:, 0], data[:, 2], color=color_val, + label="valid {0}".format(dir)) plots.append(plot_handle) index += 1 if plot: plt.xlabel('Iteration') plt.ylabel(key) - lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1 ), ncol=1, borderaxespad=0.) + lgd = plt.legend(handles=plots, loc='lower center', + bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1), + ncol=1, borderaxespad=0.) 
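+        # Each row of 'data' is [iteration, train objective, valid objective];
+        # train curves are drawn dashed and valid curves solid.  The legend is
+        # anchored below the axes and pushed further down as more comparison
+        # directories are plotted, so that it does not overlap the curves.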
plt.grid(True) - fig.suptitle("{0} plot".format(key)) - figfile_name = '{0}/{1}.pdf'.format(output_dir, file_basename) - plt.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + fig.suptitle("{0} plot for {1}".format(key, output_name)) + figfile_name = '{0}/{1}_{2}.pdf'.format( + output_dir, file_basename, + latex_compliant_name(output_name)) + plt.savefig(figfile_name, bbox_extra_artists=(lgd,), + bbox_inches='tight') if latex_report is not None: - latex_report.AddFigure(figfile_name, "Plot of {0} vs iterations".format(key)) + latex_report.add_figure( + figfile_name, + "Plot of {0} vs iterations for {1}".format(key, output_name)) -def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): - assert(start_iter >= 1) + +def generate_nonlin_stats_plots(exp_dir, output_dir, plot, comparison_dir=None, + start_iter=1, latex_report=None): + assert start_iter >= 1 comparison_dir = [] if comparison_dir is None else comparison_dir dirs = [exp_dir] + comparison_dir @@ -168,7 +217,8 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s stats_per_dir = {} for dir in dirs: - stats_per_component_per_iter = nlp.ParseProgressLogsForNonlinearityStats(dir) + stats_per_component_per_iter = ( + log_parse.parse_progress_logs_for_nonlinearity_stats(dir)) stats_per_dir[dir] = stats_per_component_per_iter # convert the nonlin stats into tables @@ -192,14 +242,16 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s main_stat_tables = stat_tables_per_component_per_dir[exp_dir] for component_name in main_stat_tables.keys(): # this is the main experiment directory - file = open("{dir}/nonlinstats_{comp_name}.log".format(dir = output_dir, comp_name = component_name), "w") - file.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\n") - iter_stat_report = "" - iter_stats = main_stat_tables[component_name] - for row in iter_stats: - iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" - file.write(iter_stat_report) - file.close() + with open("{dir}/nonlinstats_{comp_name}.log".format( + dir=output_dir, comp_name=component_name), "w") as f: + f.write( + "Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\n") + iter_stat_report = [] + iter_stats = main_stat_tables[component_name] + for row in iter_stats: + iter_stat_report.append("\t".join([str(x) for x in row])) + f.write("\n".join(iter_stat_report)) + f.close() if plot: main_component_names = main_stat_tables.keys() @@ -208,11 +260,15 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s plot_component_names = set(main_component_names) for dir in dirs: component_names = set(stats_per_dir[dir].keys()) - plot_component_names = plot_component_names.intersection(component_names) + plot_component_names = plot_component_names.intersection( + component_names) plot_component_names = list(plot_component_names) plot_component_names.sort() if plot_component_names != main_component_names: - logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. Make sure that these are comparable experiments before analyzing these plots.") + logger.warning("""The components in all the neural networks in the + given experiment dirs are not the same, so comparison plots are + provided only for common component names. 
Make sure that these are + comparable experiments before analyzing these plots.""") fig = plt.figure() for component_name in main_component_names: @@ -220,43 +276,62 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s index = 0 plots = [] for dir in dirs: - color_val = plot_colors[index] + color_val = g_plot_colors[index] index += 1 try: - iter_stats = stat_tables_per_component_per_dir[dir][component_name] + iter_stats = ( + stat_tables_per_component_per_dir[dir][component_name]) except KeyError: - # this component is not available in this network so lets not just plot it + # this component is not available in this network so lets + # not just plot it continue data = np.array(iter_stats) - data = data[data[:,0] >=start_iter, :] + data = data[data[:, 0] >= start_iter, :] ax = plt.subplot(211) - mp, = ax.plot(data[:,0], data[:,1], color=color_val, label="Mean {0}".format(dir)) - msph, = ax.plot(data[:,0], data[:,1] + data[:,2], color=color_val, linestyle='--', label = "Mean+-Stddev {0}".format(dir)) - mspl, = ax.plot(data[:,0], data[:,1] - data[:,2], color=color_val, linestyle='--') + mp, = ax.plot(data[:, 0], data[:, 1], color=color_val, + label="Mean {0}".format(dir)) + msph, = ax.plot(data[:, 0], data[:, 1] + data[:, 2], + color=color_val, linestyle='--', + label="Mean+-Stddev {0}".format(dir)) + mspl, = ax.plot(data[:, 0], data[:, 1] - data[:, 2], + color=color_val, linestyle='--') plots.append(mp) plots.append(msph) ax.set_ylabel('Value-{0}'.format(comp_type)) ax.grid(True) ax = plt.subplot(212) - mp, = ax.plot(data[:,0], data[:,3], color=color_val) - msph, = ax.plot(data[:,0], data[:,3] + data[:,4], color=color_val, linestyle='--') - mspl, = ax.plot(data[:,0], data[:,3] - data[:,4], color=color_val, linestyle='--') + mp, = ax.plot(data[:, 0], data[:, 3], color=color_val) + msph, = ax.plot(data[:, 0], data[:, 3] + data[:, 4], + color=color_val, linestyle='--') + mspl, = ax.plot(data[:, 0], data[:, 3] - data[:, 4], + color=color_val, linestyle='--') ax.set_xlabel('Iteration') ax.set_ylabel('Derivative-{0}'.format(comp_type)) ax.grid(True) - lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + lgd = plt.legend(handles=plots, loc='lower center', + bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2), + ncol=1, borderaxespad=0.) 
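+            # Subplot 211 shows the mean of the nonlinearity's output value
+            # with dashed mean+/-stddev bands; subplot 212 shows the same
+            # statistics for its derivative, both against the iteration.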
plt.grid(True) - fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) - comp_name = LatexCompliantName(component_name) - figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) - fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + fig.suptitle("Mean and stddev of the value and derivative at " + "{comp_name}".format(comp_name=component_name)) + comp_name = latex_compliant_name(component_name) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format( + dir=output_dir, comp_name=comp_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), + bbox_inches='tight') if latex_report is not None: - latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) + latex_report.add_figure( + figfile_name, + "Mean and stddev of the value and derivative " + "at {0}".format(component_name)) + -def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): +def generate_clipped_proportion_plots(exp_dir, output_dir, plot, + comparison_dir=None, start_iter=1, + latex_report=None): assert(start_iter >= 1) comparison_dir = [] if comparison_dir is None else comparison_dir @@ -265,10 +340,11 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N stats_per_dir = {} for dir in dirs: try: - stats_per_dir[dir] = nlp.ParseProgressLogsForClippedProportion(dir) - except nlp.MalformedClippedProportionLineException as e: + stats_per_dir[dir] = ( + log_parse.parse_progress_logs_for_clipped_proportion(dir)) + except log_parse.MalformedClippedProportionLineException as e: raise e - except train_lib.KaldiCommandException as e: + except common_lib.KaldiCommandException as e: warnings.warn("Could not extract the clipped proportions for {0}," " this might be because there are no " "ClipGradientComponents.".format(dir)) @@ -276,12 +352,13 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N try: main_cp_stats = stats_per_dir[exp_dir]['table'] except KeyError: - warnings.warn("The main experiment directory {0} does not have clipped" - " proportions. So not generating clipped proportion plots.".format(exp_dir)) + warnings.warn("The main experiment directory {0} does not have " + "clipped proportions. 
So not generating clipped " + "proportion plots.".format(exp_dir)) return # this is the main experiment directory - file = open("{dir}/clipped_proportion.log".format(dir = output_dir), "w") + file = open("{dir}/clipped_proportion.log".format(dir=output_dir), "w") iter_stat_report = "" for row in main_cp_stats: iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" @@ -289,19 +366,26 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N file.close() if plot: - main_component_names = stats_per_dir[exp_dir]['cp_per_iter_per_component'].keys() + main_component_names = ( + stats_per_dir[exp_dir]['cp_per_iter_per_component'].keys()) main_component_names.sort() plot_component_names = set(main_component_names) for dir in dirs: try: - component_names = set(stats_per_dir[dir]['cp_per_iter_per_component'].keys()) - plot_component_names = plot_component_names.intersection(component_names) + component_names = set( + stats_per_dir[dir]['cp_per_iter_per_component'].keys()) + plot_component_names = ( + plot_component_names.intersection(component_names)) except KeyError: continue plot_component_names = list(plot_component_names) plot_component_names.sort() if plot_component_names != main_component_names: - logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. Make sure that these are comparable experiments before analyzing these plots.") + logger.warning( + """The components in all the neural networks in the given + experiment dirs are not the same, so comparison plots are + provided only for common component names. Make sure that these + are comparable experiments before analyzing these plots.""") fig = plt.figure() for component_name in main_component_names: @@ -309,133 +393,187 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N index = 0 plots = [] for dir in dirs: - color_val = plot_colors[index] + color_val = g_plot_colors[index] index += 1 try: - iter_stats = stats_per_dir[dir]['cp_per_iter_per_component'][component_name] + iter_stats = stats_per_dir[dir][ + 'cp_per_iter_per_component'][component_name] except KeyError: - # this component is not available in this network so lets not just plot it + # this component is not available in this network so lets + # not just plot it continue data = np.array(iter_stats) - data = data[data[:,0] >=start_iter, :] + data = data[data[:, 0] >= start_iter, :] ax = plt.subplot(111) - mp, = ax.plot(data[:,0], data[:,1], color=color_val, label="Clipped Proportion {0}".format(dir)) + mp, = ax.plot(data[:, 0], data[:, 1], color=color_val, + label="Clipped Proportion {0}".format(dir)) plots.append(mp) ax.set_ylabel('Clipped Proportion') ax.set_ylim([0, 1.2]) ax.grid(True) - lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + lgd = plt.legend(handles=plots, loc='lower center', + bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2), + ncol=1, borderaxespad=0.) 
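+            # The clipped proportion is at most 1.0; the y-axis limit of 1.2
+            # simply leaves a margin above curves that saturate at 1.0.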
plt.grid(True) - fig.suptitle("Clipped-proportion value at {comp_name}".format(comp_name = component_name)) - comp_name = LatexCompliantName(component_name) - figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) - fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + fig.suptitle("Clipped-proportion value at {comp_name}".format( + comp_name=component_name)) + comp_name = latex_compliant_name(component_name) + figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format( + dir=output_dir, comp_name=comp_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), + bbox_inches='tight') if latex_report is not None: - latex_report.AddFigure(figfile_name, "Clipped proportion at {0}".format(component_name)) + latex_report.add_figure( + figfile_name, + "Clipped proportion at {0}".format(component_name)) -def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): + +def generate_parameter_diff_plots(exp_dir, output_dir, plot, + comparison_dir=None, start_iter=1, + latex_report=None): # Parameter changes - assert(start_iter >= 1) + assert start_iter >= 1 comparison_dir = [] if comparison_dir is None else comparison_dir dirs = [exp_dir] + comparison_dir index = 0 stats_per_dir = {} - key_file = {"Parameter differences" : "parameter.diff", - "Relative parameter differences" : "relative_parameter.diff"} + key_file = {"Parameter differences": "parameter.diff", + "Relative parameter differences": "relative_parameter.diff"} stats_per_dir = {} for dir in dirs: stats_per_dir[dir] = {} - for key in key_file.keys(): - stats_per_dir[dir][key] = nlp.ParseProgressLogsForParamDiff(dir, key, logger) + for key in key_file: + stats_per_dir[dir][key] = ( + log_parse.parse_progress_logs_for_param_diff(dir, key)) # write down the stats for the main experiment directory - for diff_type in key_file.keys(): - file = open("{0}/{1}".format(output_dir, key_file[diff_type]), "w") - diff_per_component_per_iter = stats_per_dir[exp_dir][diff_type]['progress_per_component'] - component_names = stats_per_dir[exp_dir][diff_type]['component_names'] - max_iter = stats_per_dir[exp_dir][diff_type]['max_iter'] - file.write(" ".join(["Iteration"] + component_names)+"\n") - total_missing_iterations = 0 - gave_user_warning = False - for iter in range(max_iter + 1): - iter_data = [str(iter)] - for c in component_names: - try: - iter_data.append(str(diff_per_component_per_iter[c][iter])) - except KeyError: - total_missing_iterations += 1 - iter_data.append("NA") - if (total_missing_iterations/len(component_names) > 20) and not gave_user_warning : - logger.warning("There are more than {0} missing iterations per component. 
Something might be wrong.".format(total_missing_iterations/len(component_names))) - gave_user_warning = True - - file.write(" ".join(iter_data)+"\n") - file.close() + for diff_type in key_file: + with open("{0}/{1}".format(output_dir, key_file[diff_type]), "w") as f: + diff_per_component_per_iter = ( + stats_per_dir[exp_dir][diff_type]['progress_per_component']) + component_names = ( + stats_per_dir[exp_dir][diff_type]['component_names']) + max_iter = stats_per_dir[exp_dir][diff_type]['max_iter'] + f.write(" ".join(["Iteration"] + component_names)+"\n") + total_missing_iterations = 0 + gave_user_warning = False + for iter in range(max_iter + 1): + iter_data = [str(iter)] + for c in component_names: + try: + iter_data.append( + str(diff_per_component_per_iter[c][iter])) + except KeyError: + total_missing_iterations += 1 + iter_data.append("NA") + if (total_missing_iterations/len(component_names) > 20 + and not gave_user_warning): + logger.warning("There are more than {0} missing " + "iterations per component. " + "Something might be wrong.".format( + total_missing_iterations + / len(component_names))) + gave_user_warning = True + + f.write(" ".join(iter_data)+"\n") if plot: # get the component names diff_type = key_file.keys()[0] - main_component_names = stats_per_dir[exp_dir][diff_type]['progress_per_component'].keys() + main_component_names = stats_per_dir[exp_dir][diff_type][ + 'progress_per_component'].keys() main_component_names.sort() plot_component_names = set(main_component_names) for dir in dirs: try: - component_names = set(stats_per_dir[dir][diff_type]['progress_per_component'].keys()) - plot_component_names = plot_component_names.intersection(component_names) + component_names = set(stats_per_dir[dir][diff_type][ + 'progress_per_component'].keys()) + plot_component_names = plot_component_names.intersection( + component_names) except KeyError: continue plot_component_names = list(plot_component_names) plot_component_names.sort() if plot_component_names != main_component_names: - logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. Make sure that these are comparable experiments before analyzing these plots.") + logger.warning("The components in all the neural networks in the " + "given experiment dirs are not the same, " + "so comparison plots are provided only for common " + "component names. 
" + "Make sure that these are comparable experiments " + "before analyzing these plots.") - assert(main_component_names) + assert main_component_names fig = plt.figure() - logger.info("Generating parameter-difference plots for the following components:{0}".format(', '.join(main_component_names))) - + logger.info("Generating parameter-difference plots for the " + "following components:{0}".format( + ', '.join(main_component_names))) for component_name in main_component_names: fig.clf() index = 0 plots = [] for dir in dirs: - color_val = plot_colors[index] + color_val = g_plot_colors[index] index += 1 iter_stats = [] try: - for diff_type in ['Parameter differences', 'Relative parameter differences']: - iter_stats.append(np.array(sorted(stats_per_dir[dir][diff_type]['progress_per_component'][component_name].items()))) + for diff_type in ['Parameter differences', + 'Relative parameter differences']: + iter_stats.append(np.array( + sorted(stats_per_dir[dir][diff_type][ + 'progress_per_component'][ + component_name].items()))) except KeyError as e: - # this component is not available in this network so lets not just plot it - if dir==exp_dir: - raise Exception("No parameter differences were available even in the main experiment dir for the component {0}. Something went wrong.".format(component_name)) + # this component is not available in this network so lets + # not just plot it + if dir == exp_dir: + raise Exception("No parameter differences were " + "available even in the main " + "experiment dir for the component " + "{0}. Something went wrong: " + "{1}.".format( + component_name, str(e))) continue ax = plt.subplot(211) - mp, = ax.plot(iter_stats[0][:,0], iter_stats[0][:,1], color=color_val, label="Parameter Differences {0}".format(dir)) + mp, = ax.plot(iter_stats[0][:, 0], iter_stats[0][:, 1], + color=color_val, + label="Parameter Differences {0}".format(dir)) plots.append(mp) ax.set_ylabel('Parameter Differences') ax.grid(True) ax = plt.subplot(212) - mp, = ax.plot(iter_stats[1][:,0], iter_stats[1][:,1], color=color_val, label="Relative Parameter Differences {0}".format(dir)) + mp, = ax.plot(iter_stats[1][:, 0], iter_stats[1][:, 1], + color=color_val, + label="Relative Parameter " + "Differences {0}".format(dir)) ax.set_xlabel('Iteration') ax.set_ylabel('Relative Parameter Differences') ax.grid(True) - lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + lgd = plt.legend(handles=plots, loc='lower center', + bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2), + ncol=1, borderaxespad=0.) 
plt.grid(True) - fig.suptitle("Parameter differences at {comp_name}".format(comp_name = component_name)) - comp_name = LatexCompliantName(component_name) - figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) - fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + fig.suptitle("Parameter differences at {comp_name}".format( + comp_name=component_name)) + comp_name = latex_compliant_name(component_name) + figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format( + dir=output_dir, comp_name=comp_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), + bbox_inches='tight') if latex_report is not None: - latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) + latex_report.add_figure( + figfile_name, + "Parameter differences at {0}".format(component_name)) -def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, objective_type = "linear"): + +def generate_plots(exp_dir, output_dir, output_names, comparison_dir=None, + start_iter=1): try: os.makedirs(output_dir) except OSError as e: @@ -443,45 +581,84 @@ def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, ob pass else: raise e - if plot: + if g_plot: latex_report = LatexReport("{0}/report.pdf".format(output_dir)) else: latex_report = None - if objective_type == "chain": - logger.info("Generating log-probability plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - elif objective_type == "linear": - logger.info("Generating accuracy plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - - logger.info("Generating log-likelihood plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - else: - logger.info("Generating " + objective_type + " objective plots") - GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'objective', file_basename = 'objective', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + for (output_name, objective_type) in output_names: + if objective_type == "linear": + logger.info("Generating accuracy plots") + generate_accuracy_plots( + exp_dir, output_dir, g_plot, key='accuracy', + file_basename='accuracy', comparison_dir=comparison_dir, + start_iter=start_iter, + latex_report=latex_report, output_name=output_name) + + logger.info("Generating log-likelihood plots") + generate_accuracy_plots( + exp_dir, output_dir, g_plot, key='log-likelihood', + file_basename='loglikelihood', comparison_dir=comparison_dir, + start_iter=start_iter, + latex_report=latex_report, output_name=output_name) + elif objective_type == "chain": + logger.info("Generating log-probability plots") + generate_accuracy_plots( + exp_dir, output_dir, g_plot, + key='log-probability', file_basename='log_probability', + comparison_dir=comparison_dir, start_iter=start_iter, + latex_report=latex_report, output_name=output_name) + else: + logger.info("Generating " + objective_type + " objective plots") + generate_accuracy_plots( + exp_dir, output_dir, g_plot, key='objective', + file_basename='objective', comparison_dir=comparison_dir, + 
start_iter=start_iter, + latex_report=latex_report, output_name=output_name) logger.info("Generating non-linearity stats plots") - GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + generate_nonlin_stats_plots( + exp_dir, output_dir, g_plot, comparison_dir=comparison_dir, + start_iter=start_iter, latex_report=latex_report) logger.info("Generating clipped-proportion plots") - GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + generate_clipped_proportion_plots( + exp_dir, output_dir, g_plot, comparison_dir=comparison_dir, + start_iter=start_iter, latex_report=latex_report) logger.info("Generating parameter difference plots") - GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - + generate_parameter_diff_plots( + exp_dir, output_dir, g_plot, comparison_dir=comparison_dir, + start_iter=start_iter, latex_report=latex_report) - if plot and latex_report is not None: - has_compiled = latex_report.Close() + if g_plot and latex_report is not None: + has_compiled = latex_report.close() if has_compiled: - logger.info("Report has been generated. You can find it at the location {0}".format("{0}/report.pdf".format(output_dir))) + logger.info("Report has been generated. " + "You can find it at the location " + "{0}".format("{0}/report.pdf".format(output_dir))) + + +def main(): + args = get_args() + + output_nodes = [] + + if args.output_nodes is not None: + nodes = args.output_nodes.split(' ') + for n in nodes: + parts = n.split(':') + assert len(parts) == 2 + output_nodes.append(tuple(parts)) + elif args.is_chain: + output_nodes.append(('output', 'chain')) + else: + output_nodes.append(('output', 'linear')) + + generate_plots(args.exp_dir, args.output_dir, output_nodes, + comparison_dir=args.comparison_dir, + start_iter=args.start_iter) -def Main(): - args = GetArgs() - GeneratePlots(args.exp_dir, args.output_dir, - comparison_dir = args.comparison_dir, - start_iter = args.start_iter, - objective_type = args.objective_type) if __name__ == "__main__": - Main() + main() From 09186531a78f295361ba5fbda3ed211c847b7c52 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sun, 27 Nov 2016 16:29:18 -0500 Subject: [PATCH 46/71] raw_python_script: Fixing name conflicts and moving print statement to logger --- .../nnet3/train/chain_objf/acoustic_model.py | 2 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 22 +++++++++---------- .../nnet3/train/frame_level_objf/common.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index c2378b90c1c..0c871f07c2e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -517,7 +517,7 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch, factors (max_models_combine) is moved into the nnet3-combine. 
""" raw_model_strings = [] - print len(models_to_combine) + logger.info("Combining {0} models.".format(models_to_combine)) models_to_combine.add(num_iters) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index dcf07fa8af3..25dbeed5e4d 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -51,7 +51,7 @@ def get_successful_models(num_models, log_file_pattern, model_num = i + 1 logfile = re.sub('%', str(model_num), log_file_pattern) lines = open(logfile, 'r').readlines() - this_objf = -100000 + this_objf = -100000.0 for line_num in range(1, len(lines) + 1): # we search from the end as this would result in # lesser number of regex searches. Python regex is slow ! @@ -174,13 +174,13 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, left_context, right_context): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format( - egs_dir)).readline()) + egs_dir)).readline()) egs_ivector_dim = int(open('{0}/info/ivector_dim'.format( - egs_dir)).readline()) + egs_dir)).readline()) egs_left_context = int(open('{0}/info/left_context'.format( - egs_dir)).readline()) + egs_dir)).readline()) egs_right_context = int(open('{0}/info/right_context'.format( - egs_dir)).readline()) + egs_dir)).readline()) if (feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): raise Exception("There is mismatch between featdim/ivector_dim of " "the current experiment and the provided " @@ -191,15 +191,15 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, raise Exception('The egs have insufficient context') frames_per_eg = int(open('{0}/info/frames_per_eg'.format( - egs_dir)).readline()) + egs_dir)).readline()) num_archives = int(open('{0}/info/num_archives'.format( - egs_dir)).readline()) + egs_dir)).readline()) return [egs_left_context, egs_right_context, frames_per_eg, num_archives] except (IOError, ValueError) as e: raise Exception("The egs dir {0} has missing or " - "malformed files: {1}".format(egs_dir, e.str())) + "malformed files: {1}".format(egs_dir, e.strerr)) def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts, @@ -218,7 +218,7 @@ def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts, common_lib.run_job( """{command} {dir}/log/sum_pdf_counts.log \ vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts \ - """.format(command=run_opts.command, dir=dir)) + """.format(command=run_opts.command, dir=dir)) for file in glob.glob('{0}/pdf_counts.*'.format(dir)): os.remove(file) @@ -364,7 +364,7 @@ def do_shrinkage(iter, model_file, non_linearity, shrink_threshold, return False -def remove_egs(egs_dir): +def remove_nnet_egs(egs_dir): common_lib.run_job("steps/nnet2/remove_egs.sh {egs_dir}".format( egs_dir=egs_dir)) @@ -375,7 +375,7 @@ def clean_nnet_dir(nnet_dir, num_iters, egs_dir, get_raw_nnet_from_am=True): try: if remove_egs: - remove_egs(egs_dir) + remove_nnet_egs(egs_dir) for iter in range(num_iters): remove_model(nnet_dir, iter, num_iters, None, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 9f29cebb0d2..87cae801e90 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -461,7 +461,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, factors (max_models_combine) is moved into the nnet3-combine. 
""" raw_model_strings = [] - print len(models_to_combine) + logger.info("Combining {0} models.".format(models_to_combine)) models_to_combine.add(num_iters) From 51339f4f33a8a040f7dfc596bf9db4669891be12 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 28 Nov 2016 17:54:43 -0500 Subject: [PATCH 47/71] Adding and debugging some scripts (in the process of integrating fast LSTMs. --- .../s5_r2/local/chain/tuning/run_lstm_1a.sh | 243 +++++++++++++++++ .../s5_r2/local/chain/tuning/run_lstm_1b.sh | 243 +++++++++++++++++ .../s5_r2/local/chain/tuning/run_lstm_1c.sh | 243 +++++++++++++++++ .../s5_r2/local/chain/tuning/run_tdnn_1b.sh | 16 -- egs/wsj/s5/steps/libs/nnet3/train/common.py | 65 ++--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 252 ++++++++++++++++-- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 5 +- egs/wsj/s5/steps/nnet3/chain/train.py | 29 +- egs/wsj/s5/steps/nnet3/get_saturation.pl | 98 +++++++ egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 20 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 20 +- egs/wsj/s5/utils/data/get_frame_shift.sh | 2 +- 12 files changed, 1115 insertions(+), 121 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh create mode 100755 egs/wsj/s5/steps/nnet3/get_saturation.pl diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh new file mode 100755 index 00000000000..7027e8ffa8e --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh @@ -0,0 +1,243 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_lstm.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script (run_lstm_1a) is like run_tdnn_1b.sh except modified to use an LSTM +# configuration (some aspects borrowed from egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh). + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1a #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh new file mode 100755 index 00000000000..3fdabd651e0 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh @@ -0,0 +1,243 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_lstm_1b.sh is like run_lstm_1a.sh, but using a regular LSTM, not LSTMP, +# layer, as a closer baseline for the 'fast' LSTM layer. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
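+# A note on the recurrence-related settings below: chunk_left_context is the
+# amount of extra left acoustic context supplied with each training chunk so
+# that the LSTM state can warm up, and label_delay delays the network output
+# relative to the input (via output-delay on the output layers), letting the
+# recurrent layers see a few future frames before each label is predicted.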
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1b #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstm-layer name=lstm1 cell-dim=512 delay=-3 + lstm-layer name=lstm2 cell-dim=512 delay=-3 + lstm-layer name=lstm3 cell-dim=512 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh new file mode 100755 index 00000000000..b495f49f6a3 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh @@ -0,0 +1,243 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. 
local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_lstm_1c.sh is like run_lstm1b.sh, but using 'fast-lstm-layer' instead of +# 'lstm-layer'. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1c #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstm-layer name=lstm1 cell-dim=512 delay=-3 + fast-lstm-layer name=lstm2 cell-dim=512 delay=-3 + fast-lstm-layer name=lstm3 cell-dim=512 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
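+  # Note also that the --self-loop-scale 1.0 here (and the --acwt 1.0
+  # --post-decode-acwt 10.0 used at decode time in the next stage) are the
+  # values expected for 'chain' models.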
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh index 72070551ec8..09dec79ee60 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh @@ -166,22 +166,6 @@ if [ $stage -le 17 ]; then EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 450 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize "$xent_regularize" \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; fi if [ $stage -le 18 ]; then diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index dcf07fa8af3..cd788ca7328 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -318,53 +318,34 @@ def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def do_shrinkage(iter, model_file, non_linearity, shrink_threshold, +def do_shrinkage(iter, model_file, shrink_saturation_threshold, get_raw_nnet_from_am=True): if iter == 0: return True + if get_raw_nnet_from_am: + output, error = common_lib.run_kaldi_command( + "nnet3-am-info --print-args=false {0} | " + "steps/nnet3/get_saturation.pl".format(model_file)) + else: + output, error = common_lib.run_kaldi_command( + "nnet3-info --print-args=false {0} | " + "steps/nnet3/get_saturation.pl".format(model_file)) + output = output.strip().split("\n") try: - if get_raw_nnet_from_am: - output, error = common_lib.run_kaldi_command( - "nnet3-am-info --print-args=false {model_file} | " - "grep {non_linearity}".format( - non_linearity=non_linearity, model_file=model_file)) - else: - output, error = common_lib.run_kaldi_command( - "nnet3-info --print-args=false {model_file} | " - "grep {non_linearity}".format( - non_linearity=non_linearity, model_file=model_file)) - output = output.strip().split("\n") - # eg. 
- # component name=Lstm1_f type=SigmoidComponent, dim=1280, - # count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 - # 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 - # 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], - # deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 - # 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 - # 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] - - mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") - total_mean_deriv = 0 - num_derivs = 0 - for line in output: - mat_obj = mean_pattern.search(line) - if mat_obj is None: - raise Exception("Something went wrong, unable to find " - "deriv-avg in the line \n{0}".format(line)) - mean_deriv = float(mat_obj.groups()[0]) - total_mean_deriv += mean_deriv - num_derivs += 1 - if total_mean_deriv / num_derivs < shrink_threshold: - return True - except ValueError: - raise Exception("Error while parsing the model info output") - - return False - - -def remove_egs(egs_dir): + assert len(output) == 1 + saturation = float(output[0]) + assert saturation >= 0 and saturation <= 1 + except: + raise Exception("Something went wrong, could not get " + "saturation from the output '{0}' of " + "get_saturation.pl on the info of " + "model {1}".format(output, model_file)) + return (saturation > shrink_saturation_threshold) + + +def remove_nnet_egs(egs_dir): common_lib.run_job("steps/nnet2/remove_egs.sh {egs_dir}".format( egs_dir=egs_dir)) @@ -375,7 +356,7 @@ def clean_nnet_dir(nnet_dir, num_iters, egs_dir, get_raw_nnet_from_am=True): try: if remove_egs: - remove_egs(egs_dir) + remove_nnet_egs(egs_dir) for iter in range(num_iters): remove_model(nnet_dir, iter, num_iters, None, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index db7424e3f0f..fba314c4972 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -117,6 +117,7 @@ def generate_lstm_config(self): affine_str = self.config['ng-affine-options'] # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options + ng_per_element_scale_options = self.config['ng-per-element-scale-options'] if re.search('param-mean', ng_per_element_scale_options) is None and \ re.search('param-stddev', ng_per_element_scale_options) is None: ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " @@ -130,6 +131,7 @@ def generate_lstm_config(self): # TODO: write these # naming convention # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("# Input gate control : W_i* matrices") configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) configs.append("# note : the cell outputs pass through a diagonal matrix") @@ -198,7 +200,7 @@ def generate_lstm_config(self): # add the recurrent connections configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name)) - + configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -231,8 +233,9 @@ def __init__(self, first_token, key_to_value, prev_names = None): def set_default_configs(self): self.config = {'input' : '[-1]', 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, - 'non-recurrent-projection-dim' : -1, + 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 + 'non-recurrent-projection-dim' : -1, # defaults to + # recurrent-projection-dim 'clipping-threshold' : 30.0, 'delay' : -1, 'ng-per-element-scale-options' : ' max-change=0.75 ', @@ -246,15 +249,25 @@ def set_derived_configs(self): if self.config['cell-dim'] <= 0: self.config['cell-dim'] = self.InputDim() - for key in ['recurrent-projection-dim', 'non-recurrent-projection-dim']: - if self.config[key] <= 0: - self.config[key] = self.config['cell-dim'] / 2 + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim']) def check_configs(self): - for key in ['cell-dim', 'recurrent-projection-dim', 'non-recurrent-projection-dim']: + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: if self.config[key] <= 0: - raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str()) - + raise xparser_error("{0} has invalid value {1}.".format( + key, self.config[key]), self.str()) + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise xparser_error("recurrent+non-recurrent projection dim exceeds " + "cell dim: {0}".format(self.str())) for key in ['self-repair-scale-nonlinearity']: if self.config[key] < 0.0 or self.config[key] > 1.0: raise xparser_error("{0} has invalid value {2}.".format(self.layer_type, @@ -449,6 +462,7 @@ def generate_lstm_config(self): configs = [] # naming convention # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("# Full W_ifoc* matrix") configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) configs.append("# note : the cell outputs pass through a diagonal matrix") @@ -517,13 +531,14 @@ def generate_lstm_config(self): # add the recurrent connections configs.append("# projection matrices : Wrm and Wpm") - configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, affine_str)) - configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, recurrent_projection_dim, bptrunc_str)) + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) configs.append("# r_t and p_t : rp_t will be the output") configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) - configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -569,6 +584,7 @@ def set_default_configs(self): 'zeroing-interval' : 20, 'zeroing-threshold' : 3.0 } + self.c_needed = False # keep track of whether the 'c' output is needed. def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -579,9 +595,168 @@ def check_configs(self): if self.config['cell-dim'] <= 0: raise xparser_error("cell-dim has invalid value {0}.".format(self.config[key]), self.str()) - for key in ['self-repair-scale-nonlinearity']: - if self.config[key] < 0.0 or self.config[key] > 1.0: - raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key])) + + + def auxiliary_outputs(self): + return ['c'] + + def output_name(self, auxiliary_output = None): + node_name = 'm' + if auxiliary_output is not None: + if auxiliary_output == 'c': + node_name = 'c' + self.c_needed = True + else: + raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output == 'c': + self.c_needed = True + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) + return self.config['cell-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_lstm_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + delay = self.config['delay'] + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + lstm_str = self.config['lstm-nonlinearity-options'] + + + configs = [] + + # the equations implemented here are + # TODO: write these + # naming convention + # .W_. e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("### Begin LTSM layer '{0}'".format(name)) + configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim * 4, affine_str)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") + configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") + configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") + configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, lstm_str)) + configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + + configs.append("### Nodes for the components above.") + configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " + "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead + #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("### End LTSM layer '{0}'".format(name)) + return configs + + + + +# This class is for lines like +# 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3' +# or: +# 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512' +# It generates an LSTM sub-graph with output projections (i.e. a projected LSTM, AKA LSTMP). 
+# Unlike 'lstmp-layer', the core nonlinearities of the LSTM are done in a special-purpose +# component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined +# into one. +# +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] +# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +class XconfigFastLstmpLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "fast-lstm-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, + 'non-recurrent-projection-dim' : -1, + 'clipping-threshold' : 30.0, + 'delay' : -1, + # if you want to set 'self-repair-scale' (c.f. the + # self-repair-scale-nonlinearity config value in older LSTM layers), you can + # add 'self-repair-scale=xxx' to + # lstm-nonlinearity-options. + 'lstm-nonlinearity-options' : ' max-change=0.75', + # the affine layer contains 4 of our old layers -> use a + # larger max-change than the normal value of 0.75. 
+ 'ng-affine-options' : ' max-change=1.5', + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0 + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.InputDim() + + if self.config['recurrent-projection-dim'] <= 0: + self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + + if self.config['non-recurrent-projection-dim'] <= 0: + self.config['non-recurrent-projection-dim'] = \ + self.config['recurrent-projection-dim']) + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', + 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise xparser_error("{0} has invalid value {1}.".format( + key, self.config[key]), self.str()) + + if (self.config['recurrent-projection-dim'] + + self.config['non-recurrent-projection-dim'] > + self.config['cell-dim']): + raise xparser_error("recurrent+non-recurrent projection dim exceeds " + "cell dim: {0}".format(self.str())) + def auxiliary_outputs(self): return ['c_t'] @@ -599,13 +774,13 @@ def output_name(self, auxiliary_output = None): def output_dim(self, auxiliary_output = None): if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): - if node_name == 'c_t': + if node_name == 'c': return self.config['cell-dim'] # add code for other auxiliary_outputs here when we decide to expose them else: raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) - - return self.config['cell-dim'] + return self.config['recurrent-projection-dim'] + \ + self.config['non-recurrent-projection-dim'] def get_full_config(self): ans = [] @@ -627,6 +802,8 @@ def generate_lstm_config(self): input_dim = self.descriptors['input']['dim'] input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] delay = self.config['delay'] bptrunc_str = ("clipping-threshold={0}" " zeroing-threshold={1}" @@ -646,22 +823,43 @@ def generate_lstm_config(self): # TODO: write these # naming convention # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] - configs.append("### Components for the LTSM layer named '{0}'".format(name)) + configs.append("## Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim * 4, affine_str)) + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim * 4, affine_str)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") - configs.append("Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") + configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, lstm_str)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) - + configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " + "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) + configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); + configs.append("# and non-recurrent projections") + configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) + "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) - configs.append("dim-range-node name={0}.c input={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.c_trunc input={0}.c".format(name)) + configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " + "dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("# {0}.rp is the output node of this layer:".format(name)) + configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name)) + configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " + "dim={1}".format(name, rec_proj_dim)) + configs.append("# Note: it's not 100% efficient that we have to stitch the c") + configs.append("# and r back together to truncate them but it probably"); + configs.append("# makes the deriv truncation more accurate .") + configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " + "input=Append({0}.c, {0}.r") + configs.append("dim-range-node name={0}.c_trunc 
input-node={0}.cr_trunc " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index f560c5a9b27..c7263f41698 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -233,7 +233,10 @@ if [ $stage -le 1 ]; then echo $num_frames > $dir/info/num_frames echo "$0: working out feature dim" feats_one="$(echo $feats | sed s/JOB/1/g)" - feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + if ! feat_dim=$(feat-to-dim "$feats_one" - 2>/dev/null); then + echo "Command failed (getting feature dim): feat-to-dim \"$feats_one\"" + exit 1 + fi echo $feat_dim > $dir/info/feat_dim else num_frames=$(cat $dir/info/num_frames) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 2acfe56727d..2879f2740fc 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -127,23 +127,17 @@ def get_args(): dest='shrink_value', default=1.0, help="""Scaling factor used for scaling the parameter matrices when the derivative averages are below the - shrink-threshold at the non-linearities""") - parser.add_argument("--trainer.optimization.shrink-threshold", type=float, - dest='shrink_threshold', default=0.15, - help="""If the derivative averages are below this + shrink-threshold at the non-linearities. E.g. 0.99. + Only applicable when the neural net contains sigmoid or + tanh units.""") + parser.add_argument("--trainer.optimization.shrink-saturation-threshold", type=float, + dest='shrink_saturation_threshold', default=0.40, + help="""Threshold that controls when we apply the 'shrinkage' + (i.e. scaling by shrink-value). If the saturation of the + sigmoid and tanh nonlinearities in the neural net (as + measured by steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the - shrink-value. It is less than 0.25 for sigmoid - non-linearities.""") - parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, - dest='shrink_nonlinearity', default="SigmoidComponent", - choices=["TanhComponent", "SigmoidComponent"], - help="""The non-linear component from which the - deriv-avg values are going to used to compute - mean-deriv-avg. The mean-deriv-avg is going to be - compared with shrink-threshold. 
Be careful to specify a - shrink-threshold which is dependent on the - shrink-nonlinearity type""") - + shrink-value.""") # RNN specific trainer options parser.add_argument("--trainer.num-chunk-per-minibatch", "--trainer.rnn.num-chunk-per-minibatch", @@ -424,8 +418,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): shrinkage_value = (args.shrink_value if common_train_lib.do_shrinkage( iter, model_file, - args.shrink_nonlinearity, - args.shrink_threshold) + args.shrink_saturation_threshold) else 1 ) logger.info("On iteration {0}, learning rate is {1} and " diff --git a/egs/wsj/s5/steps/nnet3/get_saturation.pl b/egs/wsj/s5/steps/nnet3/get_saturation.pl new file mode 100755 index 00000000000..5c59d9e5520 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_saturation.pl @@ -0,0 +1,98 @@ +#!/usr/bin/env perl + +# This program parses the output of nnet3-am-info or nnet3-info, +# and prints out a number between zero and one that reflects +# how saturated the (sigmoid and tanh) nonlinearities are, on average +# over the model. +# +# This is based on the 'avg-deriv' (average-derivative) values printed +# out for the sigmoid and tanh components. The 'saturation' of such a component +# is defined as (1.0 - its avg-deriv / the maximum possible derivative of that nonlinearity), +# where the denominator is 1.0 for tanh and 0.25 for sigmoid. +# This component averages the saturation over all the sigmoid/tanh units in +# the network. +# +# It parses the Info() output of components of type SigmoidComponent, +# TanhComponent, and LstmNonlinearityComponent. It prints an error message to +# stderr and returns with status 1 if it could not find the info for any such components +# in the input stream. + +# Usage: nnet3-am-info 10.mdl | steps/nnet3/get_saturation.pl +# or: nnet3-info 10.raw | steps/nnet3/get_saturation.pl + +use warnings; + +my $num_nonlinearities = 0; +my $total_saturation = 0.0; + +while () { + if (m/type=SigmoidComponent/) { + # a line like: + # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, + # value-avg=[percentiles(0,1,2,5 10,20,50,80,90 + # 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 + # 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], + # deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 + # 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 + # 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + if (m/deriv-avg=.+mean=([^,]+),/) { + $num_nonlinearities += 1; + my $this_saturation = 1.0 - ($1 / 0.25); + $total_saturation += $this_saturation; + } else { + print STDERR "$0: could not make sense of line (no deriv-avg?): $_"; + } + } elsif (m/type=TanhComponent/) { + if (m/deriv-avg=.+mean=([^,]+),/) { + $num_nonlinearities += 1; + my $this_saturation = 1.0 - ($1 / 1.0); + $total_saturation += $this_saturation; + } else { + print STDERR "$0: could not make sense of line (no deriv-avg?): $_"; + } + } elsif (m/type=LstmNonlinearityComponent/) { + # An example of a line like this is right at the bottom of this program, it's extremely long. 
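+    # The component bundles three sigmoid gates (i_t, f_t, o_t) and two tanh
+    # nonlinearities (c_t, m_t); we pull the deriv-avg mean of each of them out
+    # of this single line and add its saturation to the running total.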
+ my $ok = 1; + foreach my $sigmoid_name ( ("i_t", "f_t", "o_t") ) { + if (m/${sigmoid_name}_sigmoid={[^}]+deriv-avg=[^}]+mean=([^,]+),/) { + $num_nonlinearities += 1; + my $this_saturation = 1.0 - ($1 / 0.25); + $total_saturation += $this_saturation; + } else { + $ok = 0; + } + } + foreach my $tanh_name ( ("c_t", "m_t") ) { + if (m/${tanh_name}_tanh={[^}]+deriv-avg=[^}]+mean=([^,]+),/) { + $num_nonlinearities += 1; + my $this_saturation = 1.0 - ($1 / 1.0); + $total_saturation += $this_saturation; + } else { + $ok = 0; + } + } + if (! $ok) { + print STDERR "Could not parse at least one of the avg-deriv values in the following info line: $_"; + } + } +} + + +if ($num_nonlinearities == 0) { + print "0.0\n"; + exit(1); +} else { + my $saturation = $total_saturation / $num_nonlinearities; + if ($saturation < 0.0 || $saturation > 1.0) { + print STDERR "Bad saturation value: $saturation\n"; + exit(1); + } else { + print "$saturation\n"; + } +} + + + + +# example line with LstmNonlinearityComponent that we parse: +# component name=lstm2.lstm_nonlin type=LstmNonlinearityComponent, input-dim=2560, output-dim=1024, learning-rate=0.002, max-change=0.75, cell-dim=512, w_ic-rms=0.9941, w_fc-rms=0.8901, w_oc-rms=0.9794, count=3.53e+05, i_t_sigmoid={ self-repair-lower-threshold=0.05, self-repair-scale=1e-05, self-repaired-proportion=0.0722299, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.04,0.08,0.09,0.12 0.17,0.25,0.46,0.76,0.87 0.91,0.96,0.96,1.0), mean=0.494, stddev=0.253], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0007,0.03,0.04,0.06 0.09,0.12,0.19,0.23,0.24 0.25,0.25,0.25,0.25), mean=0.179, stddev=0.0595] }, f_t_sigmoid={ self-repair-lower-threshold=0.05, self-repair-scale=1e-05, self-repaired-proportion=0.0688061, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.11,0.13,0.17 0.22,0.30,0.51,0.70,0.82 0.90,0.96,0.98,1.0), mean=0.509, stddev=0.219], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.01,0.03,0.07 0.11,0.15,0.21,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.194, stddev=0.0561] }, c_t_tanh={ self-repair-lower-threshold=0.2, self-repair-scale=1e-05, self-repaired-proportion=0.178459, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(-1.0,-0.98,-0.97,-0.92 -0.82,-0.65,-0.01,0.66,0.87 0.94,0.95,0.97,0.99), mean=0.00447, stddev=0.612], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.02,0.04,0.10 0.14,0.25,0.65,0.84,0.90 0.94,0.97,0.97,0.98), mean=0.58, stddev=0.281] }, o_t_sigmoid={ self-repair-lower-threshold=0.05, self-repair-scale=1e-05, self-repaired-proportion=0.0608838, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.07,0.09,0.12 0.17,0.25,0.52,0.77,0.86 0.90,0.94,0.96,0.99), mean=0.514, stddev=0.256], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.007,0.04,0.04,0.07 0.09,0.12,0.19,0.23,0.24 0.25,0.25,0.25,0.25), mean=0.175, stddev=0.0579] }, m_t_tanh={ self-repair-lower-threshold=0.2, self-repair-scale=1e-05, self-repaired-proportion=0.134653, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(-0.99,-0.95,-0.92,-0.85 -0.73,-0.51,0.02,0.48,0.73 0.86,0.96,0.98,1.0), mean=0.00581, stddev=0.522], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.03,0.04,0.13 0.26,0.41,0.75,0.93,0.97 0.99,1.0,1.0,1.0), mean=0.672, stddev=0.272] } diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 1e448ddde98..26ff1acf9b9 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ 
b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -98,13 +98,17 @@ def get_args(): dest='shrink_value', default=0.99, help="""Scaling factor used for scaling the parameter matrices when the derivative averages are below the - shrink-threshold at the non-linearities""") - parser.add_argument("--trainer.optimization.shrink-threshold", type=float, - dest='shrink_threshold', default=0.15, - help="""If the derivative averages are below this + shrink-threshold at the non-linearities. E.g. 0.99. + Only applicable when the neural net contains sigmoid or + tanh units.""") + parser.add_argument("--trainer.optimization.shrink-saturation-threshold", type=float, + dest='shrink_saturation_threshold', default=0.40, + help="""Threshold that controls when we apply the 'shrinkage' + (i.e. scaling by shrink-value). If the saturation of the + sigmoid and tanh nonlinearities in the neural net (as + measured by steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the - shrink-value. It is less than 0.25 for sigmoid - non-linearities.""") + shrink-value.""") parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', default=256, help="""Size of the minibatch to be used in diagnostic @@ -372,8 +376,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value if common_train_lib.do_shrinkage( - iter, model_file, "SigmoidComponent", - args.shrink_threshold, + iter, model_file, + args.shrink_saturation_threshold, get_raw_nnet_from_am=False) else 1 ) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index c735e9f27f6..11508f663a6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -97,13 +97,17 @@ def get_args(): dest='shrink_value', default=0.99, help="""Scaling factor used for scaling the parameter matrices when the derivative averages are below the - shrink-threshold at the non-linearities""") - parser.add_argument("--trainer.optimization.shrink-threshold", type=float, - dest='shrink_threshold', default=0.15, - help="""If the derivative averages are below this + shrink-threshold at the non-linearities. E.g. 0.99. + Only applicable when the neural net contains sigmoid or + tanh units.""") + parser.add_argument("--trainer.optimization.shrink-saturation-threshold", type=float, + dest='shrink_saturation_threshold', default=0.40, + help="""Threshold that controls when we apply the 'shrinkage' + (i.e. scaling by shrink-value). If the saturation of the + sigmoid and tanh nonlinearities in the neural net (as + measured by steps/nnet3/get_saturation.pl) exceeds this threshold we scale the parameter matrices with the - shrink-value. 
It is less than 0.25 for sigmoid - non-linearities.""") + shrink-value.""") parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', default=256, help="""Size of the minibatch to be used in diagnostic @@ -371,8 +375,8 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value if common_train_lib.do_shrinkage( - iter, model_file, "SigmoidComponent", - args.shrink_threshold) + iter, model_file, + args.shrink_saturation_threshold) else 1 ) logger.info("On iteration {0}, learning rate is {1} and " diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh index 47e56b1e8fd..d032c9c17fa 100755 --- a/egs/wsj/s5/utils/data/get_frame_shift.sh +++ b/egs/wsj/s5/utils/data/get_frame_shift.sh @@ -45,7 +45,7 @@ fi temp=$(mktemp /tmp/tmp.XXXX) -feat-to-len scp:$dir/feats.scp ark,t:- | head -n 10 > $temp +feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp if [ -z $temp ]; then echo "$0: error running feat-to-len" 1>&2 From 9cf85ef748b3d765a0188f62d3d90d767daa99bb Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 28 Nov 2016 17:55:05 -0500 Subject: [PATCH 48/71] Fixes to fast LSTM code, regarding self-repair sum. --- src/cudamatrix/cu-kernels.cu | 3 +-- src/cudamatrix/cu-math.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 9da59f92eaa..42aa412f1f6 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3018,7 +3018,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, // need to update self_repair_sum_out before deriv_sum_out, because // deriv_sum_out and deriv_sum_in might point to the same memory. if (i0 < 5 && j < cell_dim) { - self_repair_sum_out[i0 * self_repair_sum_out_stride + j] += + self_repair_sum_out[i0 * self_repair_sum_out_stride + j] = update_sr[i0] ? num_rows : 0; } @@ -4560,4 +4560,3 @@ void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, value_sum_out_stride, deriv_sum_out, deriv_sum_out_stride, self_repair_sum_out, self_repair_sum_out_stride); } - diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 2d05ee1cfdc..acc3e0ba36d 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -547,7 +547,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, // need to update self_repair_sum_out before deriv_sum_out, because // deriv_sum_out and deriv_sum_in might point to the same memory. for (int32 i = 0; i < 5; i++) - (*self_repair_sum_out_mat)(i, c) += + (*self_repair_sum_out_mat)(i, c) = (deriv_sum_in(i, c) / count < sr_config(i) ? num_rows : 0); (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum; From 12924d29f3cf95c951d1d2a1f1824d8cfa126926 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 28 Nov 2016 19:40:12 -0500 Subject: [PATCH 49/71] Adding more example scripts. Some python script fixes. 
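Among the python fixes below is the logic by which the projected-LSTM xconfig
layers fill in their projection dimensions when these are not given.  As a rough
sketch of the resulting behaviour (illustrative only; 'resolve_lstmp_dims' is a
made-up name, the real logic lives in set_derived_configs()/check_configs() of
the layer classes):

    def resolve_lstmp_dims(cell_dim, rec_proj_dim=-1, nonrec_proj_dim=-1):
        # recurrent projection defaults to a quarter of the cell dimension
        if rec_proj_dim <= 0:
            rec_proj_dim = cell_dim // 4
        # non-recurrent projection defaults to the recurrent projection dim
        if nonrec_proj_dim <= 0:
            nonrec_proj_dim = rec_proj_dim
        # together the projections must not exceed the cell dimension
        if rec_proj_dim + nonrec_proj_dim > cell_dim:
            raise ValueError("recurrent + non-recurrent projection dim exceeds cell dim")
        # the layer output appends r_t and p_t, so its dimension is the sum
        return rec_proj_dim, nonrec_proj_dim, rec_proj_dim + nonrec_proj_dim

    # e.g. cell-dim=512 with both projection dims left unset gives (128, 128, 256),
    # which matches the explicit 128/128 values used in the run_lstm_1e recipe.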
--- .../s5_r2/local/chain/tuning/run_lstm_1d.sh | 248 +++++++++++++++++ .../s5_r2/local/chain/tuning/run_lstm_1e.sh | 252 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 58 ++-- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- 4 files changed, 538 insertions(+), 23 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh new file mode 100755 index 00000000000..afacf7ec8e3 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh @@ -0,0 +1,248 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_lstm_1d.sh is like run_lstm_1c.sh, but switching back to projected +# LSTM (LSTMP)... the configuration is like 1a, which is a little broken +# in that its non-recurrent-projection-dim is twice the recurrent-projection-dim, +# but it's better for comparison purposes to have it the same. + +# run_lstm_1c.sh is like run_lstm1b.sh, but using 'fast-lstm-layer' instead of +# 'lstm-layer'. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1d #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh new file mode 100755 index 00000000000..6a8a122b6ff --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh @@ -0,0 +1,252 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +# run_lstm_1e.sh is like run_lstm_1d.sh, but reducing non-recurrent-projection-dim +# from 256 to 128 (fixes an earlier mistake). + +# run_lstm_1d.sh is like run_lstm_1c.sh, but switching back to projected +# LSTM (LSTMP)... 
the configuration is like 1a, which is a little broken +# in that its non-recurrent-projection-dim is twice the recurrent-projection-dim, +# but it's better for comparison purposes to have it the same. + +# run_lstm_1c.sh is like run_lstm1b.sh, but using 'fast-lstm-layer' instead of +# 'lstm-layer'. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +lstm_affix=1e #affix for LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index fba314c4972..6ee1e1256f3 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -23,7 +23,10 @@ # input='[-1]' [Descriptor giving the input of the layer.] # cell-dim=-1 [Dimension of the cell] # delay=-1 [Delay in the recurrent connections of the LSTM ] -# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] @@ -216,10 +219,14 @@ def generate_lstm_config(self): # Parameters of the class, and their defaults: # input='[-1]' [Descriptor giving the input of the layer.] # cell-dim=-1 [Dimension of the cell] -# recurrent_projection_dim [Dimension of the projection used in recurrent connections] -# non_recurrent_projection_dim [Dimension of the projection in non-recurrent connections] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] # delay=-1 [Delay in the recurrent connections of the LSTM ] -# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] # self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] @@ -254,7 +261,7 @@ def set_derived_configs(self): if self.config['non-recurrent-projection-dim'] <= 0: self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim']) + self.config['recurrent-projection-dim'] def check_configs(self): for key in ['cell-dim', 'recurrent-projection-dim', @@ -558,11 +565,13 @@ def generate_lstm_config(self): # input='[-1]' [Descriptor giving the input of the layer.] # cell-dim=-1 [Dimension of the cell] # delay=-1 [Delay in the recurrent connections of the LSTM ] -# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] -# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] -# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] -# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] -# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] 
+# ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to +# do things like set biases to initialize to 1] class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): assert first_token == "fast-lstm-layer" @@ -572,6 +581,8 @@ def set_default_configs(self): self.config = {'input':'[-1]', 'cell-dim' : -1, # this is a compulsory argument 'clipping-threshold' : 30.0, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0, 'delay' : -1, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can @@ -580,9 +591,7 @@ def set_default_configs(self): 'lstm-nonlinearity-options' : ' max-change=0.75', # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. - 'ng-affine-options' : ' max-change=1.5', - 'zeroing-interval' : 20, - 'zeroing-threshold' : 3.0 + 'ng-affine-options' : ' max-change=1.5' } self.c_needed = False # keep track of whether the 'c' output is needed. @@ -703,15 +712,20 @@ def generate_lstm_config(self): # Parameters of the class, and their defaults: # input='[-1]' [Descriptor giving the input of the layer.] # cell-dim=-1 [Dimension of the cell] +# recurrent-projection_dim [Dimension of the projection used in recurrent connections, e.g. cell-dim/4] +# non-recurrent-projection-dim [Dimension of the projection in non-recurrent connections, +# in addition to recurrent-projection-dim, e.g. cell-dim/4] # delay=-1 [Delay in the recurrent connections of the LSTM ] -# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] -# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] -# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] -# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] -# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. +# This is the threshold used to decide if clipping has to be activated ] +# zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] +# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] 
+# ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to +# do things like set biases to initialize to 1] class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstm-layer" + assert first_token == "fast-lstmp-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -742,7 +756,7 @@ def set_derived_configs(self): if self.config['non-recurrent-projection-dim'] <= 0: self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim']) + self.config['recurrent-projection-dim'] def check_configs(self): for key in ['cell-dim', 'recurrent-projection-dim', @@ -762,7 +776,7 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = 'rp' if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -856,7 +870,7 @@ def generate_lstm_config(self): configs.append("# and r back together to truncate them but it probably"); configs.append("# makes the deriv truncation more accurate .") configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " - "input=Append({0}.c, {0}.r") + "input=Append({0}.c, {0}.r)".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 111abd4cb89..7ccab2f6c6f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -28,7 +28,8 @@ 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, 'lstmpc-layer' : xlayers.XconfigLstmpcLayer, - 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer + 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, + 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer } # Converts a line as parsed by ParseConfigLine() into a first From 32d41679d512cb484dd6a37705b0bfae9bcda420 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 28 Nov 2016 22:38:01 -0500 Subject: [PATCH 50/71] raw_python_script: Updating to google standards --- egs/wsj/s5/steps/nnet3/chain/train.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index ba8ea8af3d3..dbac8054258 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -113,6 +113,10 @@ def get_args(): [input] frames per job. This option is passed to get_egs.sh. 
Aim for about a minute of training time""") + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, + dest='num_chunk_per_minibatch', default=512, + help="Number of sequences to be processed in parallel " + "every minibatch") # Parameters for the optimization parser.add_argument("--trainer.optimization.initial-effective-lrate", @@ -144,13 +148,9 @@ def get_args(): shrink-threshold which is dependent on the shrink-nonlinearity type""") - # Chunk training options - parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, - dest='num_chunk_per_minibatch', default=512, - help="Number of sequences to be processed in parallel " - "every minibatch") + # RNN-specific training options parser.add_argument("--trainer.deriv-truncate-margin", type=int, - dest='deriv_truncate_margin', default = None, + dest='deriv_truncate_margin', default=None, help="""(Relevant only for recurrent models). If specified, gives the margin (in input frames) around the 'required' part of each chunk that the derivatives @@ -164,7 +164,7 @@ def get_args(): parser.add_argument("--feat-dir", type=str, required=True, help="Directory with features used for training " "the neural network.") - parser.add_argument("--tree-dir", type=str, required = True, + parser.add_argument("--tree-dir", type=str, required=True, help="""Directory containing the tree to use for this model (we also expect final.mdl and ali.*.gz in that directory""") @@ -198,7 +198,7 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if not args.left_deriv_truncate is None: + if args.left_deriv_truncate is not None: args.deriv_truncate_margin = -args.left_deriv_truncate logger.warning( "--chain.left-deriv-truncate (deprecated) is set by user, and " @@ -400,9 +400,10 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): min_deriv_time = None max_deriv_time = None - if not args.deriv_truncate_margin is None: + if args.deriv_truncate_margin is not None: min_deriv_time = -args.deriv_truncate_margin - model_left_context - max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin + model_right_context + max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin + + model_right_context) logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) From e8698a919366dacb51b6f115b1467673a71c4996 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 28 Nov 2016 23:10:31 -0500 Subject: [PATCH 51/71] various minor script and code fixes; add TDNN+LSTM example script (tedlium s5_r2) --- .../s5_r2/local/chain/tuning/run_lstm_1b.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1c.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1d.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1e.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1a.sh | 251 ++++++++++++++++++ .../steps/libs/nnet3/xconfig/basic_layers.py | 4 + src/nnet3/nnet-chain-training.cc | 12 +- 7 files changed, 265 insertions(+), 10 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh index 3fdabd651e0..e79f7a4395c 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh @@ -4,7 +4,7 @@ ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). 
# by default, with cleanup: -# local/chain/run_tdnn.sh +# local/chain/run_lstm.sh # without cleanup: # local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh index b495f49f6a3..690ef0c8053 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh @@ -4,7 +4,7 @@ ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: -# local/chain/run_tdnn.sh +# local/chain/run_lstm.sh # without cleanup: # local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh index afacf7ec8e3..b72a69fdc2f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh @@ -4,7 +4,7 @@ ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: -# local/chain/run_tdnn.sh +# local/chain/run_lstm.sh # without cleanup: # local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh index 6a8a122b6ff..3a3255514a2 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh @@ -4,7 +4,7 @@ ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: -# local/chain/run_tdnn.sh +# local/chain/run_lstm.sh # without cleanup: # local/chain/run_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..88a124b1343 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,251 @@ +#!/bin/bash + + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 52f366b4cc2..6bfd905d097 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -37,6 +37,10 @@ def __init__(self, first_token, key_to_value, all_layers): if not xutils.is_valid_line_name(self.name): raise xparser_error("Invalid value: name={0}".format( key_to_value['name']), self.str()) + for prev_layer in all_layers: + if self.name == prev_layer.name: + raise xparser_error("Name '{0}' is used for more than one " + "layer.".format(self.name)) # the following, which should be overridden in the child class, sets # default config parameters in self.config. diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 3f08710fd38..d9d43006601 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -41,7 +41,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, // natural-gradient updates. SetZero(is_gradient, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); + num_max_change_per_component_applied_.resize(num_updatable, 0); num_max_change_global_applied_ = 0; if (opts.nnet_config.read_cache != "") { @@ -49,12 +49,12 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, try { Input ki(opts.nnet_config.read_cache, &binary); compiler_.ReadCache(ki.Stream(), binary); - KALDI_LOG << "Read computation cache from " << opts.nnet_config.write_cache; + KALDI_LOG << "Read computation cache from " << opts.nnet_config.read_cache; } catch (...) { KALDI_WARN << "Could not open cached computation. " "Probably this is the first training iteration."; } - } + } } @@ -186,7 +186,7 @@ void NnetChainTrainer::UpdateParamsWithMaxChange() { component_name_with_min_scale = delta_nnet_->GetComponentName(c); max_change_with_min_scale = max_param_change_per_comp; } - param_delta_squared += std::pow(scale_factors(i), + param_delta_squared += std::pow(scale_factors(i), static_cast(2.0)) * dot_prod; i++; } @@ -217,7 +217,7 @@ void NnetChainTrainer::UpdateParamsWithMaxChange() { << " / " << num_updatable << " Updatable Components." << "(smallest factor=" << min_scale << " on " << component_name_with_min_scale - << " with max-change=" << max_change_with_min_scale <<"). "; + << " with max-change=" << max_change_with_min_scale <<"). 
"; if (param_delta > nnet_config.max_param_change) ostr << "Global max-change factor was " << nnet_config.max_param_change / param_delta @@ -273,7 +273,7 @@ NnetChainTrainer::~NnetChainTrainer() { Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); compiler_.WriteCache(ko.Stream(), opts_.nnet_config.binary_write_cache); KALDI_LOG << "Wrote computation cache to " << opts_.nnet_config.write_cache; - } + } delete delta_nnet_; } From fc16bde8aab9defb8ca2ef72c4a4072472fe60d1 Mon Sep 17 00:00:00 2001 From: freewym Date: Mon, 28 Nov 2016 23:59:05 -0500 Subject: [PATCH 52/71] added the option trainer.deriv-truncate-margin to train_rnn.py and train_raw_rnn.py; deprecated trainer.rnn.num-bptt-steps --- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 37 +++++++++++++++++-------- egs/wsj/s5/steps/nnet3/train_rnn.py | 37 +++++++++++++++++-------- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 1e448ddde98..6a22f6b4fa5 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -118,9 +118,14 @@ def get_args(): "parallel every minibatch") parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, - help="""The number of time steps to back-propagate from - the last label in the chunk. By default it is same as - the (chunk-width + 10).""") + help="""Deprecated. Kept for back compatibility.""") + parser.add_argument("--trainer.deriv-truncate-margin", type=int, + dest='deriv_truncate_margin', default=8, + help="""Margin (in input frames) around the 'required' + part of each chunk that the derivatives are + backpropagated to. E.g., 8 is a reasonable setting. + Note: the 'required' part of the chunk is defined by + the model's {left,right}-context.""") # General options parser.add_argument("--nj", type=int, default=4, @@ -161,6 +166,17 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if args.num_bptt_steps is not None: + # -2 is used to compensate for the splicing of the input frame, assuming + # that splicing spans from -2 to 2 + args.deriv_truncate_margin = args.num_bptt_steps - args.chunk_width - 2 + logger.warning( + "--trainer.rnn.num-bptt-steps (deprecated) is set by user, and " + "--trainer.deriv-truncate-margin is set to (num-bptt-steps - " + "chunk-width - 2) = {0}. 
We recommend using the option " + "--trainer.deriv-truncate-margin.".format( + args.deriv_truncate_margin)) + if (not os.path.exists(args.dir) or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs " @@ -344,15 +360,12 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): args.initial_effective_lrate, args.final_effective_lrate) - if args.num_bptt_steps is None: - # num_bptt_steps is set to (chunk_width + 10) by default - num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, - args.chunk_right_context) - else: - num_bptt_steps = args.num_bptt_steps - - min_deriv_time = args.chunk_width - num_bptt_steps - max_deriv_time = num_bptt_steps - 1 + min_deriv_time = None + max_deriv_time = None + if args.deriv_truncate_margin is not None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin + + model_right_context) logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index c735e9f27f6..50ff057a5cc 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -117,9 +117,14 @@ def get_args(): "parallel every minibatch") parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, - help="""The number of time steps to back-propagate from - the last label in the chunk. By default it is same as - the (chunk-width + 10).""") + help="""Deprecated. Kept for back compatibility.""") + parser.add_argument("--trainer.deriv-truncate-margin", type=int, + dest='deriv_truncate_margin', default=8, + help="""Margin (in input frames) around the 'required' + part of each chunk that the derivatives are + backpropagated to. E.g., 8 is a reasonable setting. + Note: the 'required' part of the chunk is defined by + the model's {left,right}-context.""") # General options parser.add_argument("--feat-dir", type=str, required=True, @@ -157,6 +162,17 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if args.num_bptt_steps is not None: + # -2 is used to compensate for the splicing of the input frame, assuming + # that splicing spans from -2 to 2 + args.deriv_truncate_margin = args.num_bptt_steps - args.chunk_width - 2 + logger.warning( + "--trainer.rnn.num-bptt-steps (deprecated) is set by user, and " + "--trainer.deriv-truncate-margin is set to (num-bptt-steps - " + "chunk-width - 2) = {0}. 
We recommend using the option " + "--trainer.deriv-truncate-margin.".format( + args.deriv_truncate_margin)) + if (not os.path.exists(args.dir) or not os.path.exists(args.dir+"/configs")): raise Exception("This scripts expects {0} to exist and have a configs " @@ -343,15 +359,12 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): args.initial_effective_lrate, args.final_effective_lrate) - if args.num_bptt_steps is None: - # num_bptt_steps is set to (chunk_width + 10) by default - num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, - args.chunk_right_context) - else: - num_bptt_steps = args.num_bptt_steps - - min_deriv_time = args.chunk_width - num_bptt_steps - max_deriv_time = num_bptt_steps - 1 + min_deriv_time = None + max_deriv_time = None + if args.deriv_truncate_margin is not None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time = (args.chunk_width - 1 + args.deriv_truncate_margin + + model_right_context) logger.info("Training will run for {0} epochs = " "{1} iterations".format(args.num_epochs, num_iters)) From 3a62393cb0657c1f320290051f14fda542532031 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 29 Nov 2016 16:25:40 -0500 Subject: [PATCH 53/71] raw_python_script: Minor cosmetic changes --- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 4 ++-- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 6a22f6b4fa5..b78fc6bb162 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -167,8 +167,8 @@ def process_args(args): raise Exception("--egs.chunk-right-context should be non-negative") if args.num_bptt_steps is not None: - # -2 is used to compensate for the splicing of the input frame, assuming - # that splicing spans from -2 to 2 + # -2 is used to compensate for the splicing of the input frame, + # assuming that splicing spans from -2 to 2 args.deriv_truncate_margin = args.num_bptt_steps - args.chunk_width - 2 logger.warning( "--trainer.rnn.num-bptt-steps (deprecated) is set by user, and " diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 50ff057a5cc..ba6b0df7466 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -163,8 +163,8 @@ def process_args(args): raise Exception("--egs.chunk-right-context should be non-negative") if args.num_bptt_steps is not None: - # -2 is used to compensate for the splicing of the input frame, assuming - # that splicing spans from -2 to 2 + # -2 is used to compensate for the splicing of the input frame, + # assuming that splicing spans from -2 to 2 args.deriv_truncate_margin = args.num_bptt_steps - args.chunk_width - 2 logger.warning( "--trainer.rnn.num-bptt-steps (deprecated) is set by user, and " From 725cd385b06b8a0cec4ce6928dd4771f7864227d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 30 Nov 2016 15:51:27 -0500 Subject: [PATCH 54/71] Adding results and TDNN+LSTM recipe for chain models in tedlium s5_r2. 
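The WER comparisons quoted in the tuning scripts below were produced with the
new compare_wer_general.sh, which takes a list of chain experiment directories
and prints their WERs (before and after const-arpa LM rescoring) together with
the final train/valid probabilities.  A typical invocation, using directory
names from this commit purely as an example, is:

  local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi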
--- .../s5_r2/local/chain/compare_wer_general.sh | 64 +++++++++++++++++++ egs/tedlium/s5_r2/local/chain/run_tdnn.sh | 2 +- .../s5_r2/local/chain/run_tdnn_lstm.sh | 1 + .../s5_r2/local/chain/tuning/run_lstm_1a.sh | 17 +++++ .../s5_r2/local/chain/tuning/run_lstm_1b.sh | 24 ++++++- .../s5_r2/local/chain/tuning/run_lstm_1c.sh | 24 +++++-- .../s5_r2/local/chain/tuning/run_lstm_1d.sh | 38 +++++++++-- .../s5_r2/local/chain/tuning/run_lstm_1e.sh | 32 ++++++---- .../s5_r2/local/chain/tuning/run_tdnn_1a.sh | 5 +- .../s5_r2/local/chain/tuning/run_tdnn_1b.sh | 16 +++++ .../local/chain/tuning/run_tdnn_lstm_1a.sh | 19 ++++++ 11 files changed, 215 insertions(+), 27 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/compare_wer_general.sh create mode 120000 egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..aebbd66349a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +echo $0 $* + +echo -n "System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "WER on dev(orig) " +for x in $*; do + wer=$(grep Sum $x/decode_dev/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on dev(rescored)" +for x in $*; do + wer=$(grep Sum $x/decode_dev_rescore/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on test(orig) " +for x in $*; do + wer=$(grep Sum $x/decode_test/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on test(rescored)" +for x in $*; do + wer=$(grep Sum $x/decode_test_rescore/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn.sh index 34499362831..61f8f499182 120000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn.sh +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh index 7027e8ffa8e..68587ffe0aa 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh @@ -1,5 +1,22 
@@ #!/bin/bash +# run_lstm_1a.sh is a first attempt at an LSTM system, based on xconfigs-- it's +# probably not very well configured, e.g. the num-params might be too small. +# recurrent-projection-dim is less than non-recurrent-projection-dim due to an +# oversight. + +# comparison with TDNN system (WER is worse): +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1b_sp_bi exp/chain_cleaned/lstm1a_sp_bi +# System tdnn1b_sp_bi lstm1a_sp_bi +# WER on dev(orig) 10.2 10.8 +# WER on dev(rescored) 9.6 10.2 +# WER on test(orig) 9.7 10.0 +# WER on test(rescored) 9.2 9.6 +# Final train prob -0.0928 -0.0848 +# Final valid prob -0.1178 -0.1098 +# Final train prob (xent) -1.4666 -1.1692 +# Final valid prob (xent) -1.5473 -1.2520 + ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh index e79f7a4395c..490c86d087f 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh @@ -1,5 +1,26 @@ #!/bin/bash +# run_lstm_1b.sh is as run_lstm_1a.sh but replacing the projected LSTM +# with a regular LSTM. This is done in order to have an LSTM-only baseline +# for the 'fast lstm', where we need to test the regular as well as projected +# LSTM layers. + +# It's worse than the LSTMP, as expected, due to more overtraining. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1b_sp_bi +# exp/chain_cleaned/lstm1b_sp_bi: num-iters=253 nj=2..12 num-params=9.6M dim=40+100->3607 combine=-0.09->-0.09 xent:train/valid[167,252,final]=(-1.24,-1.14,-1.14/-1.35,-1.28,-1.28) logprob:train/valid[167,252,final]=(-0.092,-0.079,-0.079/-0.119,-0.110,-0.110) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1a_sp_bi exp/chain_cleaned/lstm1b_sp_bi +# System lstm1a_sp_bi lstm1b_sp_bi +# WER on dev(orig) 10.8 11.3 +# WER on dev(rescored) 10.2 10.7 +# WER on test(orig) 10.0 10.6 +# WER on test(rescored) 9.6 10.0 +# Final train prob -0.0848 -0.0787 +# Final valid prob -0.1098 -0.1104 +# Final train prob (xent) -1.1692 -1.1442 +# Final valid prob (xent) -1.2520 -1.2782 + ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). @@ -12,9 +33,6 @@ # note, if you have already run one of the non-chain nnet3 systems # (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. -# run_lstm_1b.sh is like run_lstm_1a.sh, but using a regular LSTM, not LSTMP, -# layer, as a closer baseline for the 'fast' LSTM layer. - set -e -o pipefail diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh index 690ef0c8053..e93da503448 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh @@ -1,6 +1,26 @@ #!/bin/bash +# run_lstm_1c.sh is like run_lstm_1b.sh but changing from the old LSTM +# implementation to our new 'fast' LSTM layer. The xconfig changes from +# 'lstm-layer' to 'fast-lstm-layer'. It's as good as or maybe slightly better +# than the old setup. 
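+# The switch is a one-line xconfig change; schematically (the cell-dim below
+# is only illustrative -- see the xconfig block in this script for the actual
+# values):
+#   lstm-layer      name=lstm1 cell-dim=1024 delay=-3    (old)
+#   fast-lstm-layer name=lstm1 cell-dim=1024 delay=-3    (new)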
+ +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1c_sp_bi +# exp/chain_cleaned/lstm1c_sp_bi: num-iters=253 nj=2..12 num-params=9.6M dim=40+100->3607 combine=-0.09->-0.09 xent:train/valid[167,252,final]=(-1.26,-1.14,-1.14/-1.34,-1.27,-1.27) logprob:train/valid[167,252,final]=(-0.092,-0.078,-0.078/-0.116,-0.111,-0.111) + + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1b_sp_bi exp/chain_cleaned/lstm1c_sp_bi +# System lstm1b_sp_bi lstm1c_sp_bi +# WER on dev(orig) 11.3 11.2 +# WER on dev(rescored) 10.7 10.5 +# WER on test(orig) 10.6 10.6 +# WER on test(rescored) 10.0 10.1 +# Final train prob -0.0787 -0.0777 +# Final valid prob -0.1104 -0.1108 +# Final train prob (xent) -1.1442 -1.1445 +# Final valid prob (xent) -1.2782 -1.2692 + ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: @@ -12,10 +32,6 @@ # note, if you have already run one of the non-chain nnet3 systems # (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. -# run_lstm_1c.sh is like run_lstm1b.sh, but using 'fast-lstm-layer' instead of -# 'lstm-layer'. - - set -e -o pipefail # First the options that are passed through to run_ivector_common.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh index b72a69fdc2f..7d94501ddc1 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh @@ -1,6 +1,37 @@ #!/bin/bash +# run_lstm_1d.sh is like run_lstm_1c.sh, but switching back to projected +# LSTM (LSTMP)... the configuration is the same 1a (but unlike 1a it uses +# the fast lstm layer). Note: 1a and 1d are a little broken +# in that their non-recurrent-projection-dim are twice the recurrent-projection-dim, +# but it's better for comparison purposes to have this the same as 1a. + +# As you can see, compared to 1a, 1d is 0.3% to 0.5% better absolute; +# this comes with the upgrade to 'fast' LSTM. There were differences to how +# the gradient truncation is done, maybe that's it; also there are +# other differences, like how the update of the diagonal matrices +# are done, and the integration of 4 matrix multiplies into one which +# will affect the natural gradient. Anyway, we're not complaining. + + +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1d_sp_bi +# exp/chain_cleaned/lstm1d_sp_bi: num-iters=253 nj=2..12 num-params=6.4M dim=40+100->3607 combine=-0.09->-0.09 xent:train/valid[167,252,final]=(-1.21,-1.13,-1.13/-1.29,-1.22,-1.23) logprob:train/valid[167,252,final]=(-0.092,-0.083,-0.081/-0.114,-0.105,-0.105) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1a_sp_bi exp/chain_cleaned/lstm1c_sp_bi exp/chain_cleaned/lstm1d_sp_bi +# System lstm1a_sp_bi lstm1c_sp_bi lstm1d_sp_bi +# WER on dev(orig) 10.8 11.2 10.3 +# WER on dev(rescored) 10.2 10.5 9.8 +# WER on test(orig) 10.0 10.6 9.7 +# WER on test(rescored) 9.6 10.1 9.2 +# Final train prob -0.0848 -0.0777 -0.0812 +# Final valid prob -0.1098 -0.1108 -0.1049 +# Final train prob (xent) -1.1692 -1.1445 -1.1334 +# Final valid prob (xent) -1.2520 -1.2692 -1.2263 + + + + ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: @@ -12,13 +43,6 @@ # note, if you have already run one of the non-chain nnet3 systems # (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. 
-# run_lstm_1d.sh is like run_lstm_1c.sh, but switching back to projected -# LSTM (LSTMP)... the configuration is like 1a, which is a little broken -# in that its non-recurrent-projection-dim is twice the recurrent-projection-dim, -# but it's better for comparison purposes to have it the same. - -# run_lstm_1c.sh is like run_lstm1b.sh, but using 'fast-lstm-layer' instead of -# 'lstm-layer'. set -e -o pipefail diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh index 3a3255514a2..1050eac709d 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh @@ -1,6 +1,27 @@ #!/bin/bash +# run_lstm_1e.sh is like run_lstm_1d.sh, but reducing non-recurrent-projection-dim +# from 256 to 128 (fixes an earlier mistake). +# However, this doesn't improve WER results-- see below. Probably the system +# has too few parameters. Anyway we probably won't tune this further +# as LSTMs by themselves aren't expected to perform that well: +# see run_tdnn_lstm_1a.sh and others in that sequence. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/lstm1e_sp_bi +# exp/chain_cleaned/lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=4.7M dim=40+100->3607 combine=-0.10->-0.10 xent:train/valid[167,252,final]=(-1.25,-1.16,-1.18/-1.29,-1.23,-1.24) logprob:train/valid[167,252,final]=(-0.097,-0.087,-0.086/-0.113,-0.105,-0.105) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi +# WER on dev(orig) 10.3 10.7 +# WER on dev(rescored) 9.8 10.1 +# WER on test(orig) 9.7 9.8 +# WER on test(rescored) 9.2 9.4 +# Final train prob -0.0812 -0.0862 +# Final valid prob -0.1049 -0.1047 +# Final train prob (xent) -1.1334 -1.1763 +# Final valid prob (xent) -1.2263 -1.2427 + ## how you run this (note: this assumes that the run_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: @@ -13,17 +34,6 @@ # (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. -# run_lstm_1e.sh is like run_lstm_1d.sh, but reducing non-recurrent-projection-dim -# from 256 to 128 (fixes an earlier mistake). - -# run_lstm_1d.sh is like run_lstm_1c.sh, but switching back to projected -# LSTM (LSTMP)... the configuration is like 1a, which is a little broken -# in that its non-recurrent-projection-dim is twice the recurrent-projection-dim, -# but it's better for comparison purposes to have it the same. - -# run_lstm_1c.sh is like run_lstm1b.sh, but using 'fast-lstm-layer' instead of -# 'lstm-layer'. - set -e -o pipefail diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh index 82647b81767..b8b7edba0b6 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh @@ -1,5 +1,8 @@ #!/bin/bash +# This is the original TDNN script before we introduced xconfigs. +# See run_tdnn_1b.sh for comparative results. 
+ # by default, with cleanup: # local/chain/run_tdnn.sh @@ -196,4 +199,4 @@ if [ $stage -le 20 ]; then exit 1 fi fi -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh index 09dec79ee60..78038e830e1 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh @@ -1,5 +1,21 @@ #!/bin/bash +# run_tdnn_1b.sh is like run_tdnn_1a.sh but upgrading to xconfig-based +# config generation. + +# Results (11/29/2016, note, this build is is before the upgrade of the LM +# done in Nov 2016): +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_sp_bi exp/chain_cleaned/tdnn1b_sp_bi +# System tdnn_sp_bi tdnn1b_sp_bi +# WER on dev(orig) 10.3 10.2 +# WER on dev(rescored) 9.8 9.6 +# WER on test(orig) 9.8 9.7 +# WER on test(rescored) 9.3 9.2 +# Final train prob -0.0918 -0.0928 +# Final valid prob -0.1190 -0.1178 +# Final train prob (xent) -1.3572 -1.4666 +# Final valid prob (xent) -1.4415 -1.5473 + ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; ## otherwise call it directly in its location). diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh index 88a124b1343..f086a506e28 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -1,6 +1,25 @@ #!/bin/bash +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + ## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; ## otherwise call it directly in its location). # by default, with cleanup: From 1cab8bdccdf72cf8fd3cb2670b3e1000ace4ce3e Mon Sep 17 00:00:00 2001 From: Yiming Wang Date: Fri, 2 Dec 2016 14:40:47 -0500 Subject: [PATCH 55/71] changed default value of zeroing-threshold in BackpropTruncationComponent to 15; updated the results on AMI (#1240) Note: this pull request is to the fast_lstm branch, which I will shortly merge to master. 
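With this change the xconfig LSTM layers default to zeroing-threshold=15.0
(zeroing-interval is unchanged at 20), so recipes that generate their configs
via the xconfig parser pick up the new value automatically.  A recipe that
wants different truncation behaviour can still set the option explicitly on
the layer line; the line below is purely illustrative (not taken from any
particular recipe) and restores the old xconfig default of 3.0:

  fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 zeroing-threshold=3.0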
--- egs/ami/s5b/RESULTS_ihm | 5 ++--- egs/ami/s5b/RESULTS_sdm | 5 ++--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 16 ++++++++-------- egs/wsj/s5/steps/nnet3/components.py | 4 ++-- egs/wsj/s5/steps/nnet3/lstm/make_configs.py | 2 +- src/nnet3/nnet-general-component.cc | 4 ++-- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 0776bc05923..484115c3243 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -51,9 +51,8 @@ # local/nnet3/run_blstm.sh --mic ihm # nnet3 xent BLSTM with data cleaning # for d in exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done -# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent -%WER 22.3 | 13098 94494 | 80.9 11.7 7.4 3.2 22.3 55.7 | -0.618 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 22.5 | 12643 89962 | 80.2 12.7 7.1 2.7 22.5 53.4 | -0.476 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys +%WER 22.4 | 13098 94483 | 80.8 11.6 7.6 3.2 22.4 55.4 | -0.620 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.4 | 12643 89977 | 80.3 12.5 7.2 2.7 22.4 53.6 | -0.503 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys ############################################ diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index f0177a45078..5ff1f934a3f 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -49,9 +49,8 @@ # xent BLSTM system; cleaned data and IHM alignments. # local/nnet3/run_blstm.sh --mic sdm1 --use-ihm-ali true # for d in exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done -# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent -%WER 37.8 | 14633 94518 | 67.1 22.3 10.7 4.9 37.8 64.2 | 0.745 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys -%WER 41.4 | 13809 89628 | 62.7 24.1 13.2 4.1 41.4 65.2 | 0.723 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys +%WER 37.9 | 15953 94512 | 66.7 22.0 11.3 4.7 37.9 58.9 | 0.734 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_dev/ascore_12/dev_hires_o4.ctm.filt.sys +%WER 41.2 | 13271 89635 | 62.9 23.8 13.2 4.2 41.2 67.8 | 0.722 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys # ========================= diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 6ee1e1256f3..1ac860ffa9c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -26,7 +26,7 @@ # clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. # This is the threshold used to decide if clipping has to be activated ] # zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] -# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] 
# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] @@ -45,7 +45,7 @@ def set_default_configs(self): 'ng-affine-options' : ' max-change=0.75 ', 'self-repair-scale-nonlinearity' : 0.00001, 'zeroing-interval' : 20, - 'zeroing-threshold' : 3.0 + 'zeroing-threshold' : 15.0 } def set_derived_configs(self): @@ -226,7 +226,7 @@ def generate_lstm_config(self): # clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. # This is the threshold used to decide if clipping has to be activated ] # zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] -# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] # self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] @@ -249,7 +249,7 @@ def set_default_configs(self): 'ng-affine-options' : ' max-change=0.75 ', 'self-repair-scale-nonlinearity' : 0.00001, 'zeroing-interval' : 20, - 'zeroing-threshold' : 3.0 + 'zeroing-threshold' : 15.0 } def set_derived_configs(self): @@ -568,7 +568,7 @@ def generate_lstm_config(self): # clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. # This is the threshold used to decide if clipping has to be activated ] # zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] -# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] # ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] @@ -582,7 +582,7 @@ def set_default_configs(self): 'cell-dim' : -1, # this is a compulsory argument 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, - 'zeroing-threshold' : 3.0, + 'zeroing-threshold' : 15.0, 'delay' : -1, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can @@ -719,7 +719,7 @@ def generate_lstm_config(self): # clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. # This is the threshold used to decide if clipping has to be activated ] # zeroing-interval=20 [interval at which we (possibly) zero out the recurrent derivatives.] -# zeroing-threshold=20 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] +# zeroing-threshold=15 [We only zero out the derivs every zeroing-interval, if derivs exceed this value.] # lstm-nonlinearity-options=' max-change=0.75 ' [Options string to pass into the LSTM nonlinearity component.] 
# ng-affine-options=' max-change=1.5 ' [Additional options used for the full matrices in the LSTM, can be used to # do things like set biases to initialize to 1] @@ -744,7 +744,7 @@ def set_default_configs(self): # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', 'zeroing-interval' : 20, - 'zeroing-threshold' : 3.0 + 'zeroing-threshold' : 15.0 } def set_derived_configs(self): diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 4bfcb219fc3..3fb92117d78 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -289,8 +289,8 @@ def AddLstmLayer(config_lines, name, input, cell_dim, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, - clipping_threshold = 1.0, - zeroing_threshold = 3.0, + clipping_threshold = 30.0, + zeroing_threshold = 15.0, zeroing_interval = 20, ng_per_element_scale_options = "", ng_affine_options = "", diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index eeab313a950..205b6034fad 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -84,7 +84,7 @@ def GetArgs(): "if clipping-threshold=0 no clipping is done", default=30) parser.add_argument("--zeroing-threshold", type=float, help="zeroing threshold used in BackpropTruncation components, " - "if zeroing-threshold=0 no periodic zeroing is done", default=3.0) + "if zeroing-threshold=0 no periodic zeroing is done", default=15.0) parser.add_argument("--zeroing-interval", type=int, help="zeroing interval used in BackpropTruncation components", default=20) parser.add_argument("--self-repair-scale-nonlinearity", type=float, diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index f5687ec1d71..b1a2d9327f8 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -991,8 +991,8 @@ void BackpropTruncationComponent::Init(int32 dim, void BackpropTruncationComponent::InitFromConfig(ConfigLine *cfl) { int32 dim = 0; bool ok = cfl->GetValue("dim", &dim); - BaseFloat clipping_threshold = 15.0; - BaseFloat zeroing_threshold = 2.0; + BaseFloat clipping_threshold = 30.0; + BaseFloat zeroing_threshold = 15.0; int32 zeroing_interval = 20, recurrence_interval = 1; cfl->GetValue("clipping-threshold", &clipping_threshold); cfl->GetValue("zeroing-threshold", &zeroing_threshold); From 53653302b49a1c0c9d4882f16b9231a986e1b97a Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sat, 3 Dec 2016 09:09:34 +0800 Subject: [PATCH 56/71] tdnn_fastlstm --- .../chain/tuning/run_tdnn_fastlstm_1b.sh | 237 ++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh new file mode 100644 index 00000000000..1d44cf92b6e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh @@ -0,0 +1,237 @@ +#!/bin/bash + +# Unlike 1a this setup interleaves the TDNN and LSTM layers. 
+ +#System tdnn_lstm_1a_ld5 tdnn_lstm_1b_ld5 tdnn_fastlstm_1b_ld5 +#WER on train_dev(tg) 13.42 13.00 12.91 +#WER on train_dev(fg) 12.42 12.03 11.98 +#WER on eval2000(tg) 15.7 15.3 15.2 +#WER on eval2000(fg) 14.2 13.9 13.8 +#Final train prob -0.0538088 -0.056294 -0.050 +#Final valid prob -0.0800484 -0.0813322 -0.092 +#Final train prob (xent) -0.7603 -0.777787 -0.756 +#Final valid prob (xent) -0.949909 -0.939146 -0.983 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_fastlstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From 0f1e21c2a4f52a3cfe48c703da2fd29bde0246d5 Mon Sep 17 00:00:00 2001 From: Ante Kegalj Date: Sun, 4 Dec 2016 12:08:43 +0100 Subject: [PATCH 57/71] Create db dirrectory if it doesn't exists --- egs/tedlium/s5/local/download_data.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/tedlium/s5/local/download_data.sh b/egs/tedlium/s5/local/download_data.sh index d9a8fbbabf3..f24aa33c3d3 100755 --- a/egs/tedlium/s5/local/download_data.sh +++ b/egs/tedlium/s5/local/download_data.sh @@ -4,6 +4,7 @@ # 2014 Brno University of Technology (Author: Karel Vesely) # Apache 2.0 +mkdir -p db pushd db # TED-LIUM database: if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then From d41af2280a4313f5e802976c404777cbec2c2e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=96=9B=E4=B8=9E=E5=AE=8F?= Date: Tue, 6 Dec 2016 22:20:07 +0800 Subject: [PATCH 58/71] score path fixed (#1250) * score path * score path fixed * score path fixed --- egs/wsj/s5/steps/scoring/score_kaldi_cer.sh | 4 ++-- egs/wsj/s5/steps/scoring/score_kaldi_compare.sh | 4 ++-- egs/wsj/s5/steps/scoring/score_kaldi_wer.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/scoring/score_kaldi_cer.sh b/egs/wsj/s5/steps/scoring/score_kaldi_cer.sh index b55747d53f5..4249cd1e146 100755 --- a/egs/wsj/s5/steps/scoring/score_kaldi_cer.sh +++ b/egs/wsj/s5/steps/scoring/score_kaldi_cer.sh @@ -33,7 +33,7 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --stage (0|1|2) # start scoring script from part-way through." @@ -50,7 +50,7 @@ dir=$3 symtab=$lang_or_graph/words.txt for f in $symtab $dir/lat.1.gz $data/text; do - [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/scoring/score_kaldi_compare.sh b/egs/wsj/s5/steps/scoring/score_kaldi_compare.sh index 91fc057b906..32afa296796 100755 --- a/egs/wsj/s5/steps/scoring/score_kaldi_compare.sh +++ b/egs/wsj/s5/steps/scoring/score_kaldi_compare.sh @@ -14,7 +14,7 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 3 ]; then - echo "Usage: local/score_compare.sh [--cmd (run.pl|queue.pl...)] " + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --replications # number of bootstrap evaluation to compute confidence." 
@@ -29,7 +29,7 @@ mkdir -p $dir_compare/log for d in $dir1 $dir2; do for f in test_filt.txt best_wer; do - [ ! -f $d/$f ] && echo "score_compare.sh: no such file $d/$f" && exit 1; + [ ! -f $d/$f ] && echo "$0: no such file $d/$f" && exit 1; done done diff --git a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh index 9fcafdc0b5c..9988c941441 100755 --- a/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh +++ b/egs/wsj/s5/steps/scoring/score_kaldi_wer.sh @@ -23,7 +23,7 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --stage (0|1|2) # start scoring script from part-way through." From eef631bc16ee4fc4ab9589ac92379e2b6379b2e8 Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 6 Dec 2016 14:35:07 -0500 Subject: [PATCH 59/71] minor bug fix in XconfigAffineLayer when initializing param-stddev --- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 6bfd905d097..24eea922968 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -8,6 +8,7 @@ from __future__ import print_function import sys +import math import libs.nnet3.xconfig.utils as xutils from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error @@ -849,7 +850,7 @@ def set_default_configs(self): def set_derived_configs(self): super(XconfigAffineLayer, self).set_derived_configs() if self.config['param-stddev'] < 0: - self.config['param-stddev'] = 1.0 / self.descriptors['input']['dim'] + self.config['param-stddev'] = 1.0 / math.sqrt(self.descriptors['input']['dim']) def check_configs(self): if self.config['dim'] <= 0: From 80b467cf3e3518da68292b7fb5736199ef85f6f0 Mon Sep 17 00:00:00 2001 From: Jose Ricardo Ziviani Date: Mon, 28 Nov 2016 10:40:19 -0200 Subject: [PATCH 60/71] Add PowerPC64le in Kaldi build system This commit introduces the PowerPC64le in the Kaldi build system, accepting Atlas, OpenBLAS and CUDA. 
A tipical example to build Kaldi in Power8 le is: 1 - clone Kaldi 2 - build tools 3 - build Kaldi in src/ run ./configure make -j 50 Signed-off-by: Jose Ricardo Ziviani Signed-off-by: Leonardo Augusto Guimaraes Garcia --- src/configure | 20 +++++++++++++ src/makefiles/cuda_ppc64le.mk | 12 ++++++++ src/makefiles/linux_atlas_ppc64le.mk | 37 +++++++++++++++++++++++++ src/makefiles/linux_openblas_ppc64le.mk | 37 +++++++++++++++++++++++++ tools/Makefile | 4 +++ 5 files changed, 110 insertions(+) create mode 100644 src/makefiles/cuda_ppc64le.mk create mode 100644 src/makefiles/linux_atlas_ppc64le.mk create mode 100644 src/makefiles/linux_openblas_ppc64le.mk diff --git a/src/configure b/src/configure index 3446a9532e0..d4122f1808e 100755 --- a/src/configure +++ b/src/configure @@ -462,6 +462,8 @@ function configure_cuda { else cat makefiles/cuda_64bit.mk >> kaldi.mk fi + elif [ "`uname -m`" == "ppc64le" ]; then + cat makefiles/cuda_ppc64le.mk >> kaldi.mk else cat makefiles/cuda_32bit.mk >> kaldi.mk fi @@ -526,6 +528,8 @@ function linux_atlas_failure { # function we use when we couldn't find echo ATLASLIBS = [somewhere]/liblapack.a [somewhere]/libcblas.a [somewhere]/libatlas.a [somewhere]/libf77blas.a $ATLASLIBDIR >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -581,6 +585,8 @@ function linux_configure_debian_ubuntu { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -604,6 +610,8 @@ function linux_configure_debian_ubuntu3 { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -630,6 +638,8 @@ function linux_configure_debian7 { echo if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -653,6 +663,8 @@ function linux_configure_redhat { echo if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -678,6 +690,8 @@ function linux_configure_redhat_fat { echo if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -735,6 +749,8 @@ function linux_configure_static { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -818,6 +834,8 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_atlas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_atlas_ppc64le.mk >> kaldi.mk 
else cat makefiles/linux_atlas.mk >> kaldi.mk fi @@ -1104,6 +1122,8 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk if [[ "`uname -m`" == arm* ]]; then cat makefiles/linux_openblas_arm.mk >> kaldi.mk + elif [[ "`uname -m`" == ppc64le ]]; then + cat makefiles/linux_openblas_ppc64le.mk >> kaldi.mk else cat makefiles/linux_openblas.mk >> kaldi.mk fi diff --git a/src/makefiles/cuda_ppc64le.mk b/src/makefiles/cuda_ppc64le.mk new file mode 100644 index 00000000000..3941de6a230 --- /dev/null +++ b/src/makefiles/cuda_ppc64le.mk @@ -0,0 +1,12 @@ + +ifndef DOUBLE_PRECISION +$(error DOUBLE_PRECISION not defined.) +endif + + +CUDA_INCLUDE= -I$(CUDATKDIR)/include +CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include +CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 +CUDA_LDLIBS += -lcublas -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule diff --git a/src/makefiles/linux_atlas_ppc64le.mk b/src/makefiles/linux_atlas_ppc64le.mk new file mode 100644 index 00000000000..234a3794721 --- /dev/null +++ b/src/makefiles/linux_atlas_ppc64le.mk @@ -0,0 +1,37 @@ +# You have to make sure ATLASLIBS is set... + +ifndef FSTROOT +$(error FSTROOT not defined.) +endif + +ifndef ATLASINC +$(error ATLASINC not defined.) +endif + +ifndef ATLASLIBS +$(error ATLASLIBS not defined.) +endif + + +DOUBLE_PRECISION = 0 +CXXFLAGS = -m64 -maltivec -mcpu=power8 -Wall -I.. \ + -mtune=power8 -mpower8-vector -mvsx -pthread \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ + -DHAVE_ATLAS -I$(ATLASINC) \ + -I$(FSTROOT)/include \ + $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl +CC = g++ +CXX = g++ +AR = ar +AS = as +RANLIB = ranlib diff --git a/src/makefiles/linux_openblas_ppc64le.mk b/src/makefiles/linux_openblas_ppc64le.mk new file mode 100644 index 00000000000..222551f3bab --- /dev/null +++ b/src/makefiles/linux_openblas_ppc64le.mk @@ -0,0 +1,37 @@ +# You have to make sure FSTROOT,OPENBLASROOT,OPENBLASLIBS are set... + +ifndef FSTROOT +$(error FSTROOT not defined.) +endif + +ifndef OPENBLASLIBS +$(error OPENBLASLIBS not defined.) +endif + +ifndef OPENBLASROOT +$(error OPENBLASROOT not defined.) +endif + + +DOUBLE_PRECISION = 0 +CXXFLAGS = -m64 -maltivec -mcpu=power8 -Wall -I.. 
\ + -mtune=power8 -mpower8-vector -mvsx -pthread \ + -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ + -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ + -DHAVE_OPENBLAS -I $(OPENBLASROOT)/include \ + -I $(FSTROOT)/include \ + $(EXTRA_CXXFLAGS) \ + -g # -O0 -DKALDI_PARANOID + +ifeq ($(KALDI_FLAVOR), dynamic) +CXXFLAGS += -fPIC +endif + +LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) +LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(OPENBLASLIBS) -lm -lpthread -ldl +CC = g++ +CXX = g++ +AR = ar +AS = as +RANLIB = ranlib diff --git a/tools/Makefile b/tools/Makefile index 714e613e4bf..8fd0c7a02a9 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -85,6 +85,10 @@ ifeq ($(OSTYPE),cygwin) else ifeq ($(OS),Windows_NT) cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" else + # ppc64le needs the newsted config.guess to be correctly indentified + [ "$(shell uname -p)" == "ppc64le" ] && wget -O openfst-$(OPENFST_VERSION)/config.guess \ + "http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD" || \ + echo "config.guess unchanged" cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" endif From f0c7a55585c38d5b430d7aab5ca2040773932bbc Mon Sep 17 00:00:00 2001 From: Jose Ricardo Ziviani Date: Tue, 6 Dec 2016 18:11:18 -0200 Subject: [PATCH 61/71] Add ppc64le in platform notes section in README.md This new section is intended to give platform specific information to users such as libraries, OS and where to get more information about any relevant topic. Signed-off-by: Jose Ricardo Ziviani --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index ab7aa846f11..32d4945a909 100644 --- a/README.md +++ b/README.md @@ -51,3 +51,14 @@ Development pattern for contributors You can use the [Google's cpplint.py] (https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py) to verify that your code is free of basic mistakes. + +Platform specific notes +----------------------- + +PowerPC 64bits little-endian (ppc64le): +- Kaldi is expected to work out of the box in RHEL >= 7 and Ubuntu >= 16.04 with + OpenBLAS, ATLAS, or CUDA. +- CUDA drivers for ppc64le can be found at [https://developer.nvidia.com/cuda-downloads] + (https://developer.nvidia.com/cuda-downloads). +- An [IBM Redbook] (https://www.redbooks.ibm.com/abstracts/redp5169.html) is + available as a guide to install and configure CUDA. From e71312d87d088e4a676a239115cc03d7ca4e877f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 7 Dec 2016 14:18:44 -0500 Subject: [PATCH 62/71] Fix small bug in steps/cleanup/create_segments_from_ctm.pl [thanks: Vincent Nguyen]. Will not affect any recipes. 
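The one-line fix below replaces Perl's numeric comparison with the string operator when checking the separator token in align-text output: under '!=' both operands are coerced to numbers, and non-numeric strings coerce to 0, so the mismatch test can never fire, while 'ne' compares the actual strings. A minimal sketch of the difference, using made-up stand-in tokens rather than anything read by the script:

    #!/usr/bin/perl
    my $separator = ";";           # stand-in separator value, for illustration only
    my $token     = "stray-word";  # hypothetical token found where the separator was expected
    # numeric compare: "stray-word" and ";" both numify to 0, so 0 != 0 is false and the check never triggers
    print(($token != $separator) ? "mismatch\n" : "looks ok\n");   # prints "looks ok"
    # string compare: detects the malformed line as intended
    print(($token ne $separator) ? "mismatch\n" : "looks ok\n");   # prints "mismatch"
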
--- egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index 2660ebce479..fef2d8ef1f5 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -426,7 +426,7 @@ sub InsertSilence { my $first_word = $col[$x * 3]; my $second_word = $col[$x * 3 + 1]; if ($x * 3 + 2 < @col) { - if ($col[$x*3 + 2] != $separator) { + if ($col[$x * 3 + 2] ne $separator) { die "Bad line in align-text output (expected separator '$separator'): $_"; } } From 740e25edfbb2e3f3fc71ecbb60f7c831b3cc0808 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 8 Dec 2016 01:26:57 -0500 Subject: [PATCH 63/71] Remove left-context and right-context from being explicitly printed nnet3-am-info to avoid duplication (now in nnet3-info which is included in that). Thanks to Xiang Li who noticed the issue. --- src/nnet3/am-nnet-simple.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/nnet3/am-nnet-simple.cc b/src/nnet3/am-nnet-simple.cc index e1211170ec2..13d06b7d09d 100644 --- a/src/nnet3/am-nnet-simple.cc +++ b/src/nnet3/am-nnet-simple.cc @@ -79,8 +79,6 @@ void AmNnetSimple::SetPriors(const VectorBase &priors) { std::string AmNnetSimple::Info() const { std::ostringstream ostr; - ostr << "left-context: " << left_context_ << "\n"; - ostr << "right-context: " << right_context_ << "\n"; ostr << "input-dim: " << nnet_.InputDim("input") << "\n"; ostr << "ivector-dim: " << nnet_.InputDim("ivector") << "\n"; ostr << "num-pdfs: " << nnet_.OutputDim("output") << "\n"; From 350bbe5a849cd34d43adbf7e7ee333f8f6329726 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 8 Dec 2016 14:43:45 -0500 Subject: [PATCH 64/71] Changes to nnet3 natural-gradient code to make some warnings less scary, and to change how initialization is done where the num-rows of input is small. --- src/nnet3/natural-gradient-online.cc | 30 +++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 9f046d4bf3a..90c57434c2c 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -126,9 +126,22 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { this_copy.InitDefault(D); CuMatrix R0_copy(R0.NumRows(), R0.NumCols(), kUndefined); - // number of iterations with the same data from a pseudorandom start. - // this is a faster way of starting than doing eigenvalue decomposition. - int32 num_init_iters = 3; + // 'num_iters' is number of iterations with the same data from a pseudorandom + // start. this is a faster way of starting than doing eigenvalue + // decomposition. + // + // Note: we only do three iterations of initialization if we have enough data + // that it's reasonably possible to estimate the subspace of dimension + // this_copy.rank_. If we don't have more than that many rows in our initial + // minibatch R0, we just do one iteration... this gives us almost exactly + // (barring small effects due to epsilon_ > 0) the row subspace of R0 after + // one iteration anyway. 
+ int32 num_init_iters; + if (R0.NumRows() <= this_copy.rank_) + num_init_iters = 1; + else + num_init_iters = 3; + for (int32 i = 0; i < num_init_iters; i++) { BaseFloat scale; R0_copy.CopyFromMat(R0); @@ -214,17 +227,24 @@ void OnlineNaturalGradient::ReorthogonalizeXt1( return; } TpMatrix C(R); + bool cholesky_ok = true; try { + // one of the following two calls may throw an exception. C.Cholesky(O); C.Invert(); // Now it's C^{-1}. - if (!(C.Max() < 100.0)) - KALDI_ERR << "Cholesky out of expected range, " + if (!(C.Max() < 100.0)) { + KALDI_WARN << "Cholesky out of expected range, " << "reorthogonalizing with Gram-Schmidt"; + cholesky_ok = false; + } } catch (...) { // We do a Gram-Schmidt orthogonalization, which is a bit less efficient but // more robust than the method using Cholesky. KALDI_WARN << "Cholesky or Invert() failed while re-orthogonalizing R_t. " << "Re-orthogonalizing on CPU."; + cholesky_ok = false; + } + if (!cholesky_ok) { Matrix cpu_W_t1(*W_t1); cpu_W_t1.OrthogonalizeRows(); W_t1->CopyFromMat(cpu_W_t1); From cb886166c21e9a74570d3645ad63fba1ebd05d26 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 8 Dec 2016 21:38:15 -0500 Subject: [PATCH 65/71] Modify install_sequitur.sh to correctly state path to env.sh --- tools/extras/install_sequitur.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index 02145c7f0c8..50ec7e98b5e 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -95,5 +95,4 @@ cd ../ ) >> env.sh echo >&2 "Installation of SEQUITUR finished successfully" -echo >&2 "Please source the tools/extras/env.sh in your path.sh to enable it" - +echo >&2 "Please source tools/env.sh in your path.sh to enable it" From 9171216818d1d1a99559aa39fd4229847a86e152 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 11 Dec 2016 16:52:53 -0500 Subject: [PATCH 66/71] Small fix to rnnlm-rescoring tools, to work without unk-penalty specified. 
--- src/lm/mikolov-rnnlm-lib.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lm/mikolov-rnnlm-lib.cc b/src/lm/mikolov-rnnlm-lib.cc index f69d47f7b80..d1afd666539 100644 --- a/src/lm/mikolov-rnnlm-lib.cc +++ b/src/lm/mikolov-rnnlm-lib.cc @@ -1165,6 +1165,8 @@ float CRnnLM::getUnkPenalty(const std::string &word) { } void CRnnLM::setUnkPenalty(const std::string &filename) { + if (filename.empty()) + return; kaldi::SequentialBaseFloatReader unk_reader(filename); for (; !unk_reader.Done(); unk_reader.Next()) { std::string key = unk_reader.Key(); From 0a5ff7c3aacfc51c4e6aefb4fa0b11d872cb5b22 Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Sun, 11 Dec 2016 22:13:36 -0500 Subject: [PATCH 67/71] change utils/mkgraph.sh so that it ignores the mono,left-biphone and quinphone options and fix the checked-in example scripts fix the utils/mkgraph.sh --- .../local/chain/multi_condition/run_tdnn.sh | 2 +- egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh | 4 ++-- egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh | 2 +- egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1a.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1b.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1c.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1d.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1e.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1f.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1g.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1h.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1i.sh | 2 +- egs/an4/s5/run.sh | 2 +- egs/aurora4/s5/run.sh | 2 +- egs/chime2/s5/run.sh | 2 +- egs/farsdat/s5/run.sh | 2 +- egs/gp/s1/run.sh | 4 ++-- egs/gp/s5/run.sh | 2 +- egs/hkust/s5/run.sh | 2 +- egs/iban/s5/run.sh | 2 +- egs/librispeech/s5/local/chain/run_tdnn.sh | 2 +- egs/librispeech/s5/run.sh | 2 +- egs/rm/s5/local/run_pitch.sh | 2 +- egs/rm/s5/run.sh | 2 +- egs/sprakbanken/s5/run.sh | 4 ++-- egs/sprakbanken_swe/s5/run.sh | 2 +- egs/swahili/s5/run.sh | 2 +- .../chain/multi_condition/run_tdnn_7f.sh | 2 +- .../s5c/local/chain/tuning/run_blstm_6i.sh | 2 +- .../s5c/local/chain/tuning/run_blstm_6j.sh | 2 +- .../s5c/local/chain/tuning/run_lstm_6i.sh | 2 +- .../s5c/local/chain/tuning/run_lstm_6j.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7d.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7e.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7f.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7g.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7h.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7i.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7j.sh | 2 +- .../s5c/local/chain/tuning/run_tdnn_7l.sh | 4 ++-- .../chain/tuning/run_tdnn_fastlstm_1b.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1a.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1b.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1a.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1b.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1c.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1d.sh | 2 +- .../s5_r2/local/chain/tuning/run_lstm_1e.sh | 2 +- .../s5_r2/local/chain/tuning/run_tdnn_1a.sh | 2 +- .../s5_r2/local/chain/tuning/run_tdnn_1b.sh | 2 +- .../local/chain/tuning/run_tdnn_lstm_1a.sh | 2 +- egs/tidigits/s5/run.sh | 2 +- egs/timit/s5/run.sh | 2 +- egs/voxforge/s5/run.sh | 2 +- egs/vystadial_cz/s5/run.sh | 2 +- egs/vystadial_en/s5/run.sh | 2 +- egs/wsj/s5/run.sh | 2 +- egs/wsj/s5/utils/mkgraph.sh | 23 ++++++++----------- egs/yesno/s5/run.sh | 2 +- 60 files changed, 72 insertions(+), 77 deletions(-) diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh 
b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh index 28c9849d885..617336236ed 100755 --- a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh @@ -260,7 +260,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh index 8df62af8bad..b3a645c0c11 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -219,7 +219,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then @@ -239,4 +239,4 @@ if [ $stage -le 18 ]; then exit 1 fi fi -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh index a262f8e1860..0644d624606 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -247,7 +247,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh index 64cde69e7dd..0a49575ebb0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh @@ -234,7 +234,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index ba136e67521..d63712f1f0f 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -259,7 +259,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh index ed615a98e30..a53785f45c2 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh index ce719d6f2cb..76a9f735c5f 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -262,7 +262,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh index 22967036cb2..8cc1a4e15fa 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh index 6e73457a772..accfd158a9d 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh index 3c4df056460..2b275e4e27d 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh index cce5f2f5f3e..1c90af38c4c 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -264,7 +264,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh index c306849632a..fb4b6a475e2 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -265,7 +265,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 3f8ff14efd9..3e3976ac7a8 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -263,7 +263,7 @@ if [ $stage -le 17 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi if [ $stage -le 18 ]; then diff --git a/egs/an4/s5/run.sh b/egs/an4/s5/run.sh index 4c04e51e311..e37293781fb 100755 --- a/egs/an4/s5/run.sh +++ b/egs/an4/s5/run.sh @@ -83,7 +83,7 @@ fi # train monophone system if [ $stage -le 4 ]; then steps/train_mono.sh --nj $nj --cmd "$train_cmd" data/train data/lang exp/mono - utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph + utils/mkgraph.sh data/lang exp/mono exp/mono/graph steps/decode.sh --config conf/decode.config --nj $nj --cmd "$decode_cmd" \ exp/mono/graph data/test exp/mono/decode fi diff --git a/egs/aurora4/s5/run.sh b/egs/aurora4/s5/run.sh index 2c923394082..bb61bec4cb5 100755 --- a/egs/aurora4/s5/run.sh +++ b/egs/aurora4/s5/run.sh @@ -51,7 +51,7 @@ done steps/train_mono.sh --boost-silence 1.25 --nj 10 \ data/train_si84_multi data/lang exp/mono0a_multi || exit 1; #( -# utils/mkgraph.sh --mono data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr && \ +# utils/mkgraph.sh data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr && \ # steps/decode.sh --nj 8 \ # exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92 #) & diff --git a/egs/chime2/s5/run.sh b/egs/chime2/s5/run.sh index ab125543d27..57809678f62 100755 --- a/egs/chime2/s5/run.sh +++ b/egs/chime2/s5/run.sh @@ -114,7 +114,7 @@ steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ -utils/mkgraph.sh --mono data/lang_test_tgpr_5k exp/mono0a exp/mono0a/graph_tgpr_5k +utils/mkgraph.sh data/lang_test_tgpr_5k exp/mono0a exp/mono0a/graph_tgpr_5k #steps/decode.sh --nj 8 \ # exp/mono0a/graph_tgpr_5k data/test_eval92_5k_clean exp/mono0a/decode_tgpr_eval92_5k_clean steps/decode.sh --nj 8 --cmd "$train_cmd" \ diff --git a/egs/farsdat/s5/run.sh b/egs/farsdat/s5/run.sh index fd893a1fb3c..81f353c301c 100755 --- a/egs/farsdat/s5/run.sh +++ b/egs/farsdat/s5/run.sh @@ -65,7 +65,7 @@ echo =========================================================================== steps/train_mono.sh --nj "$train_nj" --cmd "$train_cmd" data/train data/lang exp/mono -utils/mkgraph.sh --mono data/lang_test_bg exp/mono exp/mono/graph +utils/mkgraph.sh data/lang_test_bg exp/mono exp/mono/graph steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \ exp/mono/graph data/dev exp/mono/decode_dev diff --git a/egs/gp/s1/run.sh b/egs/gp/s1/run.sh index f66882915d8..84aca43e01a 100755 --- a/egs/gp/s1/run.sh +++ b/egs/gp/s1/run.sh @@ -66,7 +66,7 @@ for LCODE in GE PO SP SW; do # The following 3 commands will not run as written, since the LM directories # will be different across sites. 
Edit the 'lang_test' to match what is # available - utils/mkgraph.sh --mono data/$LCODE/lang_test exp/$LCODE/mono \ + utils/mkgraph.sh data/$LCODE/lang_test exp/$LCODE/mono \ exp/$LCODE/mono/graph utils/decode.sh --qcmd "$decode_cmd" steps/decode_deltas.sh \ exp/$LCODE/mono/graph data/$LCODE/dev exp/$LCODE/mono/decode_dev @@ -96,4 +96,4 @@ for LCODE in GE PO SP SW; do utils/decode.sh --qcmd "$decode_cmd" steps/decode_deltas.sh \ exp/$LCODE/tri1/graph data/$LCODE/eval exp/$LCODE/tri1/decode -done \ No newline at end of file +done diff --git a/egs/gp/s5/run.sh b/egs/gp/s5/run.sh index 933d3a4f566..e563bdff0d1 100755 --- a/egs/gp/s5/run.sh +++ b/egs/gp/s5/run.sh @@ -93,7 +93,7 @@ for L in $GP_LANGUAGES; do ( graph_dir=exp/$L/mono/graph_${lm_suffix} mkdir -p $graph_dir - utils/mkgraph.sh --mono data/$L/lang_test_${lm_suffix} exp/$L/mono \ + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/mono \ $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ diff --git a/egs/hkust/s5/run.sh b/egs/hkust/s5/run.sh index 3533e0cebff..bdd3e7797e8 100755 --- a/egs/hkust/s5/run.sh +++ b/egs/hkust/s5/run.sh @@ -46,7 +46,7 @@ steps/train_mono.sh --cmd "$train_cmd" --nj 10 \ # Monophone decoding -utils/mkgraph.sh --mono data/lang_test exp/mono0a exp/mono0a/graph || exit 1 +utils/mkgraph.sh data/lang_test exp/mono0a exp/mono0a/graph || exit 1 # note: local/decode.sh calls the command line once for each # test, and afterwards averages the WERs into (in this case # exp/mono/decode/ diff --git a/egs/iban/s5/run.sh b/egs/iban/s5/run.sh index b184a79e45e..991d32505bf 100755 --- a/egs/iban/s5/run.sh +++ b/egs/iban/s5/run.sh @@ -54,7 +54,7 @@ if [ $stage -le 3 ]; then ( echo "Decoding the dev set using monophone models." - utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph steps/decode.sh --config conf/decode.config --nj $dev_nj --cmd "$decode_cmd" \ exp/mono/graph data/dev exp/mono/decode_dev diff --git a/egs/librispeech/s5/local/chain/run_tdnn.sh b/egs/librispeech/s5/local/chain/run_tdnn.sh index ae1b403c64a..39e0890e208 100755 --- a/egs/librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn.sh @@ -152,7 +152,7 @@ if [ $stage -le 16 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir # romove from the graph fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst $graph_dir/HCLG.fst fi diff --git a/egs/librispeech/s5/run.sh b/egs/librispeech/s5/run.sh index 9bb96144763..1b12f5126fd 100755 --- a/egs/librispeech/s5/run.sh +++ b/egs/librispeech/s5/run.sh @@ -91,7 +91,7 @@ steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ # decode using the monophone model ( - utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \ + utils/mkgraph.sh data/lang_nosp_test_tgsmall \ exp/mono exp/mono/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ diff --git a/egs/rm/s5/local/run_pitch.sh b/egs/rm/s5/local/run_pitch.sh index 3b8557a7315..ed17b628f47 100755 --- a/egs/rm/s5/local/run_pitch.sh +++ b/egs/rm/s5/local/run_pitch.sh @@ -49,7 +49,7 @@ steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono #show-transitions data/lang/phones.txt exp/tri2a/final.mdl exp/tri2a/final.occs | perl -e 'while(<>) { if (m/ sil /) { $l = <>; $l =~ m/pdf = (\d+)/|| die "bad line $l"; $tot += $1; }} print "Total silence count $tot\n";' -utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph +utils/mkgraph.sh data/lang exp/mono exp/mono/graph diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh index 43ec446e6fe..00bac326a80 100755 --- a/egs/rm/s5/run.sh +++ b/egs/rm/s5/run.sh @@ -47,7 +47,7 @@ steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono -utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph +utils/mkgraph.sh data/lang exp/mono exp/mono/graph steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ diff --git a/egs/sprakbanken/s5/run.sh b/egs/sprakbanken/s5/run.sh index 9d61a181ad3..34c1f18d964 100755 --- a/egs/sprakbanken/s5/run.sh +++ b/egs/sprakbanken/s5/run.sh @@ -73,8 +73,8 @@ steps/train_mono.sh --nj 30 --cmd "$train_cmd" \ # Ensure that LMs are created wait -utils/mkgraph.sh --mono data/lang_test_3g exp/mono0a exp/mono0a/graph_3g & -utils/mkgraph.sh --mono data/lang_test_4g exp/mono0a exp/mono0a/graph_4g & +utils/mkgraph.sh data/lang_test_3g exp/mono0a exp/mono0a/graph_3g & +utils/mkgraph.sh data/lang_test_4g exp/mono0a exp/mono0a/graph_4g & # Ensure that all graphs are constructed wait diff --git a/egs/sprakbanken_swe/s5/run.sh b/egs/sprakbanken_swe/s5/run.sh index 59d9e2edbd9..691433d3709 100644 --- a/egs/sprakbanken_swe/s5/run.sh +++ b/egs/sprakbanken_swe/s5/run.sh @@ -60,7 +60,7 @@ steps/train_mono.sh --nj 10 --cmd "$train_cmd" data/train data/lang exp/mono || # Ensure that LMs are created -utils/mkgraph.sh --mono data/lang_test_4g exp/mono exp/mono/graph_4g || exit 1; +utils/mkgraph.sh data/lang_test_4g exp/mono exp/mono/graph_4g || exit 1; # Ensure that all graphs are constructed diff --git a/egs/swahili/s5/run.sh b/egs/swahili/s5/run.sh index 00523c475aa..2c7c9574984 100755 --- a/egs/swahili/s5/run.sh +++ b/egs/swahili/s5/run.sh @@ -34,7 +34,7 @@ done # Training steps/train_mono.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/system1/mono # Graph compilation -utils/mkgraph.sh --mono data/lang exp/system1/mono exp/system1/mono/graph +utils/mkgraph.sh data/lang exp/system1/mono exp/system1/mono/graph # Decoding steps/decode.sh --nj 4 --cmd "$train_cmd" exp/system1/mono/graph 
data/test exp/system1/mono/decode_test diff --git a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh index 75b541b49e1..d317b1dc55a 100755 --- a/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh +++ b/egs/swbd/s5c/local/chain/multi_condition/run_tdnn_7f.sh @@ -221,7 +221,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index a4333e40b30..1eac1c60c27 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -178,7 +178,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh index 34dd378a7fe..1718b5a4f7e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -203,7 +203,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index db0a0fe7b1a..aa48db04841 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -177,7 +177,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh index 90afd1fb4cd..e262430ab06 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -199,7 +199,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh index 5bcfea82ec3..dba1b99582a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7d.sh @@ -181,7 +181,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh index c426ede515c..704411b6a76 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7e.sh @@ -182,7 +182,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh index 256373fc698..a7a5a11dc7a 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7f.sh @@ -183,7 +183,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh index 7a4512097d3..2a0019e59d7 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -200,7 +200,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh index 00743ca9ebf..946ae796e2f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -190,7 +190,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh index 1b3e86715ed..c19ca88a843 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh @@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh index b19ea6eafab..b3bed2f2538 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -193,7 +193,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh index 06ae6f49728..b346862049b 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -188,7 +188,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg @@ -213,4 +213,4 @@ if [ $stage -le 15 ]; then done fi wait; -exit 0; \ No newline at end of file +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh index 1d44cf92b6e..88a191a1348 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_fastlstm_1b.sh @@ -202,7 +202,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh index e32fdffb69d..47d4fcdd52c 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -206,7 +206,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh index 555afa467fa..07e38cb29c5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -202,7 +202,7 @@ if [ $stage -le 14 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh index 68587ffe0aa..3ea61800869 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1a.sh @@ -232,7 +232,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh index 490c86d087f..a22d4eb53d7 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1b.sh @@ -233,7 +233,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh index e93da503448..718992fc909 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1c.sh @@ -231,7 +231,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh index 7d94501ddc1..8cf543f5096 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1d.sh @@ -244,7 +244,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh index 1050eac709d..e71a40f0e9b 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_lstm_1e.sh @@ -234,7 +234,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh index b8b7edba0b6..21e3edac5f3 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1a.sh @@ -177,7 +177,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh index 78038e830e1..24ba6705585 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1b.sh @@ -223,7 +223,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh index f086a506e28..f128ae49928 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -242,7 +242,7 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi if [ $stage -le 20 ]; then diff --git a/egs/tidigits/s5/run.sh b/egs/tidigits/s5/run.sh index fb36f89d696..873e2db69fa 100755 --- a/egs/tidigits/s5/run.sh +++ b/egs/tidigits/s5/run.sh @@ -42,7 +42,7 @@ utils/subset_data_dir.sh data/train 1000 data/train_1k steps/train_mono.sh --nj 4 --cmd "$train_cmd" \ data/train_1k data/lang exp/mono0a - utils/mkgraph.sh --mono data/lang exp/mono0a exp/mono0a/graph && \ + utils/mkgraph.sh data/lang exp/mono0a exp/mono0a/graph && \ steps/decode.sh --nj 10 --cmd "$decode_cmd" \ exp/mono0a/graph data/test exp/mono0a/decode diff --git a/egs/timit/s5/run.sh b/egs/timit/s5/run.sh index ce96f64fc2d..ae23d2a7e90 100755 --- a/egs/timit/s5/run.sh +++ b/egs/timit/s5/run.sh @@ -68,7 +68,7 @@ echo =========================================================================== steps/train_mono.sh --nj "$train_nj" --cmd "$train_cmd" data/train data/lang exp/mono -utils/mkgraph.sh --mono data/lang_test_bg exp/mono exp/mono/graph +utils/mkgraph.sh data/lang_test_bg exp/mono exp/mono/graph steps/decode.sh --nj "$decode_nj" --cmd "$decode_cmd" \ exp/mono/graph data/dev exp/mono/decode_dev diff --git a/egs/voxforge/s5/run.sh b/egs/voxforge/s5/run.sh index 5ea8d84fb80..280e47328b5 100755 --- a/egs/voxforge/s5/run.sh +++ b/egs/voxforge/s5/run.sh @@ -85,7 +85,7 @@ utils/subset_data_dir.sh data/train 1000 data/train.1k || exit 1; steps/train_mono.sh --nj $njobs --cmd "$train_cmd" data/train.1k data/lang exp/mono || exit 1; # Monophone decoding -utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1 +utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1 # note: local/decode.sh calls the command line once for each # test, and afterwards averages the WERs into (in this case # exp/mono/decode/ diff --git a/egs/vystadial_cz/s5/run.sh b/egs/vystadial_cz/s5/run.sh index d03a1c45b56..472a55022cb 100755 --- a/egs/vystadial_cz/s5/run.sh +++ b/egs/vystadial_cz/s5/run.sh @@ -109,7 +109,7 @@ steps/train_mpe.sh $WORK/train $WORK/lang $EXP/tri2b_ali $EXP/tri2b_denlats $EXP ####################################################################### for lm in $LMs ; do lm=`basename "$lm"` - utils/mkgraph.sh --mono $WORK/lang_${lm} $EXP/mono $EXP/mono/graph_${lm} || exit 1 + utils/mkgraph.sh $WORK/lang_${lm} $EXP/mono $EXP/mono/graph_${lm} || exit 1 utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri1 $EXP/tri1/graph_${lm} || exit 1 utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2a $EXP/tri2a/graph_${lm} || exit 1 utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2b $EXP/tri2b/graph_${lm} || exit 1 diff --git a/egs/vystadial_en/s5/run.sh b/egs/vystadial_en/s5/run.sh index 988b2a2f10d..ef746a723d9 100755 --- a/egs/vystadial_en/s5/run.sh +++ b/egs/vystadial_en/s5/run.sh @@ -109,7 +109,7 @@ steps/train_mpe.sh $WORK/train $WORK/lang $EXP/tri2b_ali $EXP/tri2b_denlats $EXP ####################################################################### for lm in $LMs ; do lm=`basename "$lm"` - utils/mkgraph.sh --mono $WORK/lang_${lm} $EXP/mono $EXP/mono/graph_${lm} || exit 1 + utils/mkgraph.sh $WORK/lang_${lm} $EXP/mono $EXP/mono/graph_${lm} || exit 1 utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri1 $EXP/tri1/graph_${lm} || exit 1 utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2a $EXP/tri2a/graph_${lm} || exit 1 utils/mkgraph.sh $WORK/lang_${lm} $EXP/tri2b $EXP/tri2b/graph_${lm} || exit 1 diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index 5ff885ae032..ca13c1704f2 
100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -81,7 +81,7 @@ steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ data/train_si84_2kshort data/lang_nosp exp/mono0a || exit 1; ( - utils/mkgraph.sh --mono data/lang_nosp_test_tgpr \ + utils/mkgraph.sh data/lang_nosp_test_tgpr \ exp/mono0a exp/mono0a/graph_nosp_tgpr && \ steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index 2fa9e0ff89d..c62f0ccb14f 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -20,10 +20,9 @@ loopscale=0.1 remove_oov=false -for x in `seq 6`; do - [ "$1" == "--mono" ] && context=mono && shift; - [ "$1" == "--left-biphone" ] && context=lbiphone && shift; - [ "$1" == "--quinphone" ] && context=quinphone && shift; +for x in `seq 4`; do + [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \ + echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored." [ "$1" == "--remove-oov" ] && remove_oov=true && shift; [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; @@ -33,10 +32,12 @@ if [ $# != 3 ]; then echo "Usage: utils/mkgraph.sh [options] <lang-dir> <model-dir> <graph-dir>" echo "e.g.: utils/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph" echo " Options:" - echo " --mono # For monophone models." - echo " --quinphone # For models with 5-phone context (3 is default)" - echo " --left-biphone # For left biphone models" - echo "For other accepted options, see top of script." + echo " --remove-oov # If true, any paths containing the OOV symbol (obtained from oov.int" + echo " # in the lang directory) are removed from the G.fst during compilation." + echo " --transition-scale # Scaling factor on transition probabilities." + echo " --self-loop-scale # Please see: http://kaldi-asr.org/doc/hmm.html#hmm_scale." + echo "Note: the --mono, --left-biphone and --quinphone options are now deprecated" + echo "and will be ignored." 
exit 1; fi @@ -73,12 +74,6 @@ fi N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } -if [[ $context == mono && ($N != 1 || $P != 0) || \ - $context == lbiphone && ($N != 2 || $P != 1) || \ - $context == quinphone && ($N != 5 || $P != 2) ]]; then - echo "mkgraph.sh: mismatch between the specified context (--$context) and the one in the tree: N=$N, P=$P" - exit 1 -fi [[ -f $2/frame_subsampling_factor && $loopscale != 1.0 ]] && \ echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; diff --git a/egs/yesno/s5/run.sh b/egs/yesno/s5/run.sh index 12b00273f8b..f881da7a0e6 100755 --- a/egs/yesno/s5/run.sh +++ b/egs/yesno/s5/run.sh @@ -35,7 +35,7 @@ steps/train_mono.sh --nj 1 --cmd "$train_cmd" \ data/train_yesno data/lang exp/mono0a # Graph compilation -utils/mkgraph.sh --mono data/lang_test_tg exp/mono0a exp/mono0a/graph_tgpr +utils/mkgraph.sh data/lang_test_tg exp/mono0a exp/mono0a/graph_tgpr # Decoding steps/decode.sh --nj 1 --cmd "$decode_cmd" \ From 63c5a853eda5e5402dba8348909d6da7cb3c41f2 Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Mon, 12 Dec 2016 10:25:22 -0500 Subject: [PATCH 68/71] gale_mandarin RESULTS updated --- egs/gale_mandarin/s5/RESULTS | 315 ++++++++++++++---- egs/gale_mandarin/s5/local/gale_prep_dict.sh | 4 + egs/gale_mandarin/s5/local/split_wer.sh | 70 ---- .../s5/local/split_wer_per_corpus.sh | 52 ++- egs/gale_mandarin/s5/run.sh | 60 ++-- 5 files changed, 302 insertions(+), 199 deletions(-) delete mode 100755 egs/gale_mandarin/s5/local/split_wer.sh diff --git a/egs/gale_mandarin/s5/RESULTS b/egs/gale_mandarin/s5/RESULTS index 47974d88975..b1edc568a0c 100644 --- a/egs/gale_mandarin/s5/RESULTS +++ b/egs/gale_mandarin/s5/RESULTS @@ -1,97 +1,266 @@ -#RESULTS splits generated by jtrmal1@jhu.edu at 2016-11-21-12-05-54 +# Get WER and CER +%WER 48.37 [ 263343 / 544398, 59464 ins, 61751 del, 142128 sub ] exp/tri1/decode/wer_14 +%WER 46.92 [ 255435 / 544398, 61084 ins, 56012 del, 138339 sub ] exp/tri2a/decode/wer_14 +%WER 46.45 [ 252879 / 544398, 64019 ins, 49291 del, 139569 sub ] exp/tri3b/decode.si/wer_14 +%WER 45.20 [ 246062 / 544398, 64493 ins, 46744 del, 134825 sub ] exp/tri2b/decode/wer_14 +%WER 41.30 [ 224836 / 544398, 62047 ins, 46613 del, 116176 sub ] exp/tri2b_mpe/decode_it3/wer_12 +%WER 41.06 [ 223547 / 544398, 67146 ins, 37475 del, 118926 sub ] exp/tri3b/decode/wer_14 +%WER 40.66 [ 221333 / 544398, 57785 ins, 45636 del, 117912 sub ] exp/tri2b_mmi/decode_it3/wer_11 +%WER 40.58 [ 220918 / 544398, 58174 ins, 52314 del, 110430 sub ] exp/tri2b_mpe/decode_it4/wer_13 +%WER 40.42 [ 220024 / 544398, 49748 ins, 58009 del, 112267 sub ] exp/tri2b_mmi/decode_it4/wer_11 +%WER 40.22 [ 218975 / 544398, 55657 ins, 50365 del, 112953 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_12 +%WER 39.77 [ 216506 / 544398, 65278 ins, 36165 del, 115063 sub ] exp/sgmm_5a/decode/wer_11 +%WER 39.69 [ 216051 / 544398, 53034 ins, 53491 del, 109526 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_11 +%WER 38.67 [ 210531 / 544398, 66344 ins, 31914 del, 112273 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_10 +%WER 38.18 [ 207867 / 544398, 65994 ins, 31883 del, 109990 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_10 +%WER 37.78 [ 205693 / 544398, 65686 ins, 31705 del, 108302 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_10 +%WER 37.51 [ 204229 / 544398, 65502 ins, 31771 del, 106956 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_10 +%WER 36.94 [ 201074 / 544398, 
66470 ins, 30258 del, 104346 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_9 + +%WER 40.70 [ 360556 / 885790, 89973 ins, 82245 del, 188338 sub ] exp/tri1/decode/cer_13 +%WER 39.29 [ 348071 / 885790, 92415 ins, 74065 del, 181591 sub ] exp/tri2a/decode/cer_13 +%WER 38.68 [ 342642 / 885790, 98429 ins, 62154 del, 182059 sub ] exp/tri3b/decode.si/cer_12 +%WER 37.49 [ 332045 / 885790, 96932 ins, 61925 del, 173188 sub ] exp/tri2b/decode/cer_13 +%WER 33.85 [ 299862 / 885790, 93460 ins, 60231 del, 146171 sub ] exp/tri2b_mpe/decode_it3/cer_11 +%WER 33.49 [ 296629 / 885790, 86746 ins, 61534 del, 148349 sub ] exp/tri2b_mmi/decode_it3/cer_11 +%WER 33.37 [ 295570 / 885790, 80320 ins, 70288 del, 144962 sub ] exp/tri2b_mmi/decode_it4/cer_10 +%WER 33.30 [ 295009 / 885790, 99171 ins, 50231 del, 145607 sub ] exp/tri3b/decode/cer_13 +%WER 33.23 [ 294379 / 885790, 88389 ins, 68681 del, 137309 sub ] exp/tri2b_mpe/decode_it4/cer_12 +%WER 33.03 [ 292595 / 885790, 87700 ins, 61287 del, 143608 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_11 +%WER 32.60 [ 288751 / 885790, 83604 ins, 65659 del, 139488 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_10 +%WER 32.14 [ 284728 / 885790, 99089 ins, 45433 del, 140206 sub ] exp/sgmm_5a/decode/cer_10 +%WER 31.24 [ 276708 / 885790, 101134 ins, 39271 del, 136303 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_9 +%WER 30.82 [ 273013 / 885790, 100939 ins, 38720 del, 133354 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_9 +%WER 30.49 [ 270059 / 885790, 100834 ins, 38371 del, 130854 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_9 +%WER 30.25 [ 267980 / 885790, 100694 ins, 38242 del, 129044 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_9 +%WER 29.76 [ 263594 / 885790, 99415 ins, 39444 del, 124735 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_9 + +# Detailed WER on all corpus dev sets # WER test.LDC2013S04 -%WER 42.23 [ 40179 / 95137, 5329 ins, 8769 del, 26081 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S04_10 -%WER 43.81 [ 41682 / 95137, 5469 ins, 9213 del, 27000 sub ] exp/tri3b/decode/wer_test.LDC2013S04_13 -%WER 49.06 [ 46677 / 95137, 5459 ins, 10672 del, 30546 sub ] exp/tri2b/decode/wer_test.LDC2013S04_13 -%WER 50.53 [ 48073 / 95137, 5505 ins, 11022 del, 31546 sub ] exp/tri3b/decode.si/wer_test.LDC2013S04_12 -%WER 51.47 [ 48971 / 95137, 5103 ins, 12391 del, 31477 sub ] exp/tri2a/decode/wer_test.LDC2013S04_13 -%WER 53.30 [ 50708 / 95137, 4829 ins, 13624 del, 32255 sub ] exp/tri1/decode/wer_test.LDC2013S04_13 +%WER 37.60 [ 35770 / 95137, 5670 ins, 7459 del, 22641 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2013S04_9 +%WER 38.20 [ 36338 / 95137, 5759 ins, 7315 del, 23264 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2013S04_9 +%WER 38.51 [ 36639 / 95137, 5768 ins, 7390 del, 23481 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2013S04_9 +%WER 39.07 [ 37173 / 95137, 5805 ins, 7425 del, 23943 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2013S04_9 +%WER 39.64 [ 37713 / 95137, 5843 ins, 7490 del, 24380 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2013S04_9 +%WER 40.88 [ 38894 / 95137, 5514 ins, 8378 del, 25002 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S04_10 +%WER 41.71 [ 39680 / 95137, 5237 ins, 9772 del, 24671 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2013S04_10 +%WER 41.96 [ 39915 / 95137, 5626 ins, 8584 del, 25705 sub ] exp/tri3b/decode/wer_test.LDC2013S04_13 +%WER 42.02 [ 39973 / 95137, 5539 ins, 8861 del, 25573 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2013S04_10 +%WER 42.13 [ 40081 / 95137, 5170 ins, 9891 del, 25020 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2013S04_11 +%WER 42.71 [ 40635 / 95137, 5332 ins, 9748 del, 25555 
sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2013S04_11 +%WER 42.72 [ 40643 / 95137, 5624 ins, 8835 del, 26184 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2013S04_10 +%WER 42.97 [ 40880 / 95137, 5278 ins, 10109 del, 25493 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2013S04_10 +%WER 47.10 [ 44807 / 95137, 5574 ins, 10120 del, 29113 sub ] exp/tri2b/decode/wer_test.LDC2013S04_13 +%WER 48.93 [ 46555 / 95137, 5680 ins, 10447 del, 30428 sub ] exp/tri3b/decode.si/wer_test.LDC2013S04_12 +%WER 49.38 [ 46982 / 95137, 4996 ins, 11786 del, 30200 sub ] exp/tri2a/decode/wer_test.LDC2013S04_14 +%WER 50.97 [ 48494 / 95137, 5175 ins, 11987 del, 31332 sub ] exp/tri1/decode/wer_test.LDC2013S04_13 # WER test.LDC2013S08 -%WER 26.01 [ 20781 / 79911, 3764 ins, 3034 del, 13983 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S08_8 -%WER 27.43 [ 21917 / 79911, 3644 ins, 3544 del, 14729 sub ] exp/tri3b/decode/wer_test.LDC2013S08_13 -%WER 31.24 [ 24968 / 79911, 3820 ins, 3943 del, 17205 sub ] exp/tri2b/decode/wer_test.LDC2013S08_12 -%WER 32.45 [ 25932 / 79911, 3816 ins, 4112 del, 18004 sub ] exp/tri3b/decode.si/wer_test.LDC2013S08_11 -%WER 34.22 [ 27349 / 79911, 3677 ins, 5034 del, 18638 sub ] exp/tri2a/decode/wer_test.LDC2013S08_13 -%WER 35.88 [ 28676 / 79911, 3715 ins, 5127 del, 19834 sub ] exp/tri1/decode/wer_test.LDC2013S08_12 +%WER 22.16 [ 17707 / 79911, 3606 ins, 2589 del, 11512 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2013S08_8 +%WER 22.54 [ 18009 / 79911, 3486 ins, 2764 del, 11759 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2013S08_9 +%WER 22.84 [ 18253 / 79911, 3626 ins, 2612 del, 12015 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2013S08_8 +%WER 23.16 [ 18507 / 79911, 3506 ins, 2819 del, 12182 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2013S08_9 +%WER 23.62 [ 18877 / 79911, 3554 ins, 2849 del, 12474 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2013S08_9 +%WER 24.52 [ 19594 / 79911, 3688 ins, 3017 del, 12889 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S08_9 +%WER 25.25 [ 20177 / 79911, 3357 ins, 3442 del, 13378 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2013S08_10 +%WER 25.53 [ 20400 / 79911, 3346 ins, 3483 del, 13571 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2013S08_11 +%WER 25.59 [ 20447 / 79911, 3330 ins, 3814 del, 13303 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2013S08_12 +%WER 25.67 [ 20510 / 79911, 3660 ins, 3361 del, 13489 sub ] exp/tri3b/decode/wer_test.LDC2013S08_13 +%WER 25.91 [ 20702 / 79911, 3295 ins, 3670 del, 13737 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2013S08_10 +%WER 25.93 [ 20721 / 79911, 3319 ins, 3532 del, 13870 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2013S08_11 +%WER 26.08 [ 20841 / 79911, 3418 ins, 3757 del, 13666 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2013S08_12 +%WER 29.35 [ 23450 / 79911, 3706 ins, 3910 del, 15834 sub ] exp/tri2b/decode/wer_test.LDC2013S08_13 +%WER 30.48 [ 24359 / 79911, 3831 ins, 3859 del, 16669 sub ] exp/tri3b/decode.si/wer_test.LDC2013S08_11 +%WER 31.68 [ 25314 / 79911, 3637 ins, 4636 del, 17041 sub ] exp/tri2a/decode/wer_test.LDC2013S08_14 +%WER 33.01 [ 26375 / 79911, 3675 ins, 4743 del, 17957 sub ] exp/tri1/decode/wer_test.LDC2013S08_13 # WER test.LDC2014S09 -%WER 50.54 [ 39383 / 77932, 10535 ins, 7593 del, 21255 sub ] exp/sgmm_5a/decode/wer_test.LDC2014S09_12 -%WER 52.14 [ 40634 / 77932, 10271 ins, 8530 del, 21833 sub ] exp/tri3b/decode/wer_test.LDC2014S09_17 -%WER 56.57 [ 44085 / 77932, 9394 ins, 10954 del, 23737 sub ] exp/tri2b/decode/wer_test.LDC2014S09_16 -%WER 57.95 [ 45158 / 77932, 8777 ins, 12547 del, 23834 sub ] 
exp/tri2a/decode/wer_test.LDC2014S09_15 -%WER 58.19 [ 45347 / 77932, 9712 ins, 10831 del, 24804 sub ] exp/tri3b/decode.si/wer_test.LDC2014S09_15 -%WER 59.38 [ 46277 / 77932, 7944 ins, 14560 del, 23773 sub ] exp/tri1/decode/wer_test.LDC2014S09_16 +%WER 45.41 [ 35390 / 77932, 11018 ins, 5860 del, 18512 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2014S09_10 +%WER 46.00 [ 35848 / 77932, 10930 ins, 6053 del, 18865 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2014S09_11 +%WER 46.27 [ 36059 / 77932, 10673 ins, 6370 del, 19016 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2014S09_12 +%WER 46.57 [ 36293 / 77932, 11029 ins, 5994 del, 19270 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2014S09_11 +%WER 47.07 [ 36684 / 77932, 10819 ins, 6276 del, 19589 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2014S09_12 +%WER 47.80 [ 37249 / 77932, 7270 ins, 12090 del, 17889 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2014S09_11 +%WER 48.15 [ 37528 / 77932, 10807 ins, 6823 del, 19898 sub ] exp/sgmm_5a/decode/wer_test.LDC2014S09_12 +%WER 48.40 [ 37722 / 77932, 6651 ins, 13935 del, 17136 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2014S09_14 +%WER 48.52 [ 37812 / 77932, 6507 ins, 13163 del, 18142 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2014S09_11 +%WER 48.69 [ 37947 / 77932, 6897 ins, 12985 del, 18065 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2014S09_13 +%WER 48.75 [ 37995 / 77932, 8250 ins, 12319 del, 17426 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2014S09_14 +%WER 49.49 [ 38569 / 77932, 10792 ins, 7406 del, 20371 sub ] exp/tri3b/decode/wer_test.LDC2014S09_16 +%WER 49.55 [ 38615 / 77932, 8623 ins, 11835 del, 18157 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2014S09_14 +%WER 53.82 [ 41942 / 77932, 9645 ins, 10274 del, 22023 sub ] exp/tri2b/decode/wer_test.LDC2014S09_16 +%WER 54.92 [ 42801 / 77932, 8585 ins, 13190 del, 21026 sub ] exp/tri2a/decode/wer_test.LDC2014S09_16 +%WER 55.33 [ 43118 / 77932, 10169 ins, 9959 del, 22990 sub ] exp/tri3b/decode.si/wer_test.LDC2014S09_15 +%WER 56.01 [ 43648 / 77932, 7925 ins, 14882 del, 20841 sub ] exp/tri1/decode/wer_test.LDC2014S09_16 # WER test.LDC2015S06 -%WER 46.22 [ 28480 / 61612, 8454 ins, 5015 del, 15011 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S06_9 -%WER 48.08 [ 29624 / 61612, 8471 ins, 5669 del, 15484 sub ] exp/tri3b/decode/wer_test.LDC2015S06_13 -%WER 52.67 [ 32450 / 61612, 8425 ins, 6441 del, 17584 sub ] exp/tri2b/decode/wer_test.LDC2015S06_12 -%WER 53.51 [ 32968 / 61612, 8444 ins, 6576 del, 17948 sub ] exp/tri3b/decode.si/wer_test.LDC2015S06_11 -%WER 55.08 [ 33936 / 61612, 8031 ins, 7811 del, 18094 sub ] exp/tri2a/decode/wer_test.LDC2015S06_13 -%WER 56.70 [ 34937 / 61612, 7890 ins, 8531 del, 18516 sub ] exp/tri1/decode/wer_test.LDC2015S06_13 +%WER 41.65 [ 25659 / 61612, 8345 ins, 4519 del, 12795 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2015S06_9 +%WER 42.31 [ 26067 / 61612, 8572 ins, 4202 del, 13293 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2015S06_8 +%WER 42.64 [ 26271 / 61612, 8588 ins, 4225 del, 13458 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2015S06_8 +%WER 43.00 [ 26491 / 61612, 8453 ins, 4486 del, 13552 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2015S06_9 +%WER 43.57 [ 26846 / 61612, 8485 ins, 4545 del, 13816 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2015S06_9 +%WER 44.64 [ 27503 / 61612, 8428 ins, 4884 del, 14191 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S06_10 +%WER 45.50 [ 28034 / 61612, 7886 ins, 6546 del, 13602 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2015S06_9 +%WER 45.71 [ 28165 / 61612, 7943 ins, 6204 
del, 14018 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2015S06_10 +%WER 46.14 [ 28428 / 61612, 8541 ins, 5351 del, 14536 sub ] exp/tri3b/decode/wer_test.LDC2015S06_13 +%WER 46.15 [ 28434 / 61612, 8006 ins, 6925 del, 13503 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2015S06_11 +%WER 46.19 [ 28459 / 61612, 8143 ins, 5704 del, 14612 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2015S06_9 +%WER 46.35 [ 28555 / 61612, 7379 ins, 7453 del, 13723 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2015S06_10 +%WER 46.66 [ 28751 / 61612, 8068 ins, 6749 del, 13934 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2015S06_11 +%WER 50.45 [ 31086 / 61612, 8308 ins, 6588 del, 16190 sub ] exp/tri2b/decode/wer_test.LDC2015S06_13 +%WER 51.27 [ 31586 / 61612, 8305 ins, 6656 del, 16625 sub ] exp/tri3b/decode.si/wer_test.LDC2015S06_12 +%WER 52.65 [ 32436 / 61612, 8220 ins, 7524 del, 16692 sub ] exp/tri2a/decode/wer_test.LDC2015S06_12 +%WER 54.21 [ 33398 / 61612, 8128 ins, 8138 del, 17132 sub ] exp/tri1/decode/wer_test.LDC2015S06_12 # WER test.LDC2015S13 -%WER 23.35 [ 19752 / 84594, 2196 ins, 3274 del, 14282 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S13_9 -%WER 24.81 [ 20984 / 84594, 2214 ins, 3600 del, 15170 sub ] exp/tri3b/decode/wer_test.LDC2015S13_12 -%WER 28.62 [ 24211 / 84594, 2306 ins, 4186 del, 17719 sub ] exp/tri2b/decode/wer_test.LDC2015S13_12 -%WER 30.03 [ 25405 / 84594, 2106 ins, 4617 del, 18682 sub ] exp/tri3b/decode.si/wer_test.LDC2015S13_12 -%WER 30.58 [ 25869 / 84594, 2142 ins, 4798 del, 18929 sub ] exp/tri2a/decode/wer_test.LDC2015S13_12 -%WER 32.16 [ 27206 / 84594, 1958 ins, 5681 del, 19567 sub ] exp/tri1/decode/wer_test.LDC2015S13_13 +%WER 19.24 [ 16273 / 84594, 2118 ins, 2624 del, 11531 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2015S13_8 +%WER 19.68 [ 16647 / 84594, 2117 ins, 2638 del, 11892 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2015S13_8 +%WER 20.02 [ 16936 / 84594, 2156 ins, 2665 del, 12115 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2015S13_8 +%WER 20.31 [ 17178 / 84594, 2179 ins, 2724 del, 12275 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2015S13_8 +%WER 20.68 [ 17494 / 84594, 2111 ins, 2905 del, 12478 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2015S13_9 +%WER 21.58 [ 18255 / 84594, 2222 ins, 3059 del, 12974 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S13_9 +%WER 22.08 [ 18678 / 84594, 1796 ins, 3753 del, 13129 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2015S13_10 +%WER 22.35 [ 18907 / 84594, 1921 ins, 3449 del, 13537 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2015S13_10 +%WER 22.54 [ 19066 / 84594, 1700 ins, 4044 del, 13322 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2015S13_12 +%WER 22.59 [ 19108 / 84594, 1821 ins, 3889 del, 13398 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2015S13_10 +%WER 22.64 [ 19152 / 84594, 2243 ins, 3274 del, 13635 sub ] exp/tri3b/decode/wer_test.LDC2015S13_12 +%WER 22.68 [ 19187 / 84594, 1950 ins, 3444 del, 13793 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2015S13_10 +%WER 22.97 [ 19429 / 84594, 1748 ins, 4021 del, 13660 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2015S13_12 +%WER 26.22 [ 22178 / 84594, 2285 ins, 3818 del, 16075 sub ] exp/tri2b/decode/wer_test.LDC2015S13_12 +%WER 27.69 [ 23425 / 84594, 2163 ins, 4348 del, 16914 sub ] exp/tri3b/decode.si/wer_test.LDC2015S13_12 +%WER 27.74 [ 23470 / 84594, 2137 ins, 4463 del, 16870 sub ] exp/tri2a/decode/wer_test.LDC2015S13_12 +%WER 29.10 [ 24619 / 84594, 1912 ins, 5352 del, 17355 sub ] exp/tri1/decode/wer_test.LDC2015S13_13 # WER test.LDC2016S03 -%WER 53.04 [ 77015 / 145212, 34385 ins, 9733 del, 32897 
sub ] exp/sgmm_5a/decode/wer_test.LDC2016S03_12 -%WER 54.68 [ 79399 / 145212, 34634 ins, 10414 del, 34351 sub ] exp/tri3b/decode/wer_test.LDC2016S03_17 -%WER 58.99 [ 85661 / 145212, 33946 ins, 12904 del, 38811 sub ] exp/tri2b/decode/wer_test.LDC2016S03_16 -%WER 59.80 [ 86841 / 145212, 34387 ins, 12610 del, 39844 sub ] exp/tri3b/decode.si/wer_test.LDC2016S03_15 -%WER 60.29 [ 87547 / 145212, 31358 ins, 15266 del, 40923 sub ] exp/tri2a/decode/wer_test.LDC2016S03_16 -%WER 61.75 [ 89662 / 145212, 30628 ins, 16992 del, 42042 sub ] exp/tri1/decode/wer_test.LDC2016S03_16 +%WER 48.17 [ 69952 / 145212, 34989 ins, 7540 del, 27423 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2016S03_10 +%WER 48.71 [ 70739 / 145212, 34599 ins, 7965 del, 28175 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2016S03_11 +%WER 48.97 [ 71110 / 145212, 25206 ins, 19110 del, 26794 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2016S03_13 +%WER 48.99 [ 71138 / 145212, 34706 ins, 7910 del, 28522 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2016S03_11 +%WER 49.39 [ 71725 / 145212, 34857 ins, 7904 del, 28964 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2016S03_11 +%WER 49.68 [ 72141 / 145212, 24534 ins, 19597 del, 28010 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2016S03_12 +%WER 49.84 [ 72372 / 145212, 34400 ins, 8414 del, 29558 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2016S03_12 +%WER 49.90 [ 72459 / 145212, 27743 ins, 16413 del, 28303 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2016S03_14 +%WER 50.19 [ 72877 / 145212, 26408 ins, 17847 del, 28622 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2016S03_14 +%WER 50.91 [ 73932 / 145212, 33830 ins, 9535 del, 30567 sub ] exp/sgmm_5a/decode/wer_test.LDC2016S03_13 +%WER 51.11 [ 74219 / 145212, 30731 ins, 14606 del, 28882 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2016S03_15 +%WER 52.08 [ 75631 / 145212, 31779 ins, 13885 del, 29967 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2016S03_15 +%WER 52.52 [ 76271 / 145212, 35202 ins, 9563 del, 31506 sub ] exp/tri3b/decode/wer_test.LDC2016S03_17 +%WER 56.58 [ 82157 / 145212, 34695 ins, 11508 del, 35954 sub ] exp/tri2b/decode/wer_test.LDC2016S03_15 +%WER 57.28 [ 83179 / 145212, 33956 ins, 12500 del, 36723 sub ] exp/tri3b/decode.si/wer_test.LDC2016S03_16 +%WER 57.77 [ 83895 / 145212, 31963 ins, 14939 del, 36993 sub ] exp/tri2a/decode/wer_test.LDC2016S03_16 +%WER 59.27 [ 86074 / 145212, 30962 ins, 17056 del, 38056 sub ] exp/tri1/decode/wer_test.LDC2016S03_17 # CER test.LDC2013S04 -%WER 33.93 [ 51673 / 152279, 7241 ins, 12180 del, 32252 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S04_10 -%WER 35.31 [ 53769 / 152279, 7813 ins, 11593 del, 34363 sub ] exp/tri3b/decode/cer_test.LDC2013S04_11 -%WER 40.56 [ 61767 / 152279, 8062 ins, 13321 del, 40384 sub ] exp/tri2b/decode/cer_test.LDC2013S04_11 -%WER 42.08 [ 64081 / 152279, 8052 ins, 13940 del, 42089 sub ] exp/tri3b/decode.si/cer_test.LDC2013S04_10 -%WER 43.22 [ 65818 / 152279, 7602 ins, 15416 del, 42800 sub ] exp/tri2a/decode/cer_test.LDC2013S04_11 -%WER 44.93 [ 68413 / 152279, 7255 ins, 16855 del, 44303 sub ] exp/tri1/decode/cer_test.LDC2013S04_11 +%WER 29.58 [ 45038 / 152279, 8264 ins, 9223 del, 27551 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2013S04_8 +%WER 30.12 [ 45873 / 152279, 8362 ins, 9107 del, 28404 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2013S04_8 +%WER 30.45 [ 46374 / 152279, 8377 ins, 9153 del, 28844 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2013S04_8 +%WER 30.95 [ 47132 / 152279, 8143 ins, 9769 del, 29220 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2013S04_9 +%WER 
31.42 [ 47853 / 152279, 8134 ins, 9921 del, 29798 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2013S04_9 +%WER 32.54 [ 49545 / 152279, 7985 ins, 10732 del, 30828 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S04_9 +%WER 33.42 [ 50894 / 152279, 7672 ins, 11845 del, 31377 sub ] exp/tri3b/decode/cer_test.LDC2013S04_12 +%WER 33.72 [ 51348 / 152279, 7214 ins, 13316 del, 30818 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2013S04_10 +%WER 33.94 [ 51680 / 152279, 6748 ins, 14248 del, 30684 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2013S04_11 +%WER 33.98 [ 51737 / 152279, 7729 ins, 11833 del, 32175 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2013S04_10 +%WER 34.64 [ 52757 / 152279, 8561 ins, 10696 del, 33500 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2013S04_9 +%WER 34.67 [ 52802 / 152279, 7277 ins, 13490 del, 32035 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2013S04_11 +%WER 34.88 [ 53115 / 152279, 8059 ins, 12270 del, 32786 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2013S04_9 +%WER 38.75 [ 59002 / 152279, 7909 ins, 13549 del, 37544 sub ] exp/tri2b/decode/cer_test.LDC2013S04_12 +%WER 40.49 [ 61655 / 152279, 8366 ins, 13257 del, 40032 sub ] exp/tri3b/decode.si/cer_test.LDC2013S04_10 +%WER 41.22 [ 62774 / 152279, 7165 ins, 15963 del, 39646 sub ] exp/tri2a/decode/cer_test.LDC2013S04_13 +%WER 42.69 [ 65004 / 152279, 7307 ins, 16188 del, 41509 sub ] exp/tri1/decode/cer_test.LDC2013S04_12 # CER test.LDC2013S08 -%WER 19.18 [ 25398 / 132434, 4773 ins, 3650 del, 16975 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S08_8 -%WER 20.54 [ 27201 / 132434, 4792 ins, 4037 del, 18372 sub ] exp/tri3b/decode/cer_test.LDC2013S08_11 -%WER 24.12 [ 31943 / 132434, 4817 ins, 4968 del, 22158 sub ] exp/tri2b/decode/cer_test.LDC2013S08_12 -%WER 25.15 [ 33309 / 132434, 4839 ins, 5019 del, 23451 sub ] exp/tri3b/decode.si/cer_test.LDC2013S08_11 -%WER 26.90 [ 35623 / 132434, 4725 ins, 6057 del, 24841 sub ] exp/tri2a/decode/cer_test.LDC2013S08_12 -%WER 28.45 [ 37674 / 132434, 4506 ins, 6690 del, 26478 sub ] exp/tri1/decode/cer_test.LDC2013S08_12 +%WER 15.96 [ 21136 / 132434, 4775 ins, 3000 del, 13361 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2013S08_8 +%WER 16.30 [ 21593 / 132434, 4859 ins, 2856 del, 13878 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2013S08_7 +%WER 16.55 [ 21914 / 132434, 4786 ins, 3035 del, 14093 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2013S08_8 +%WER 16.82 [ 22272 / 132434, 4795 ins, 3084 del, 14393 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2013S08_8 +%WER 17.19 [ 22766 / 132434, 4804 ins, 3151 del, 14811 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2013S08_8 +%WER 17.90 [ 23712 / 132434, 4898 ins, 3447 del, 15367 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S08_8 +%WER 18.75 [ 24836 / 132434, 4339 ins, 4148 del, 16349 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2013S08_10 +%WER 18.96 [ 25105 / 132434, 4832 ins, 3953 del, 16320 sub ] exp/tri3b/decode/cer_test.LDC2013S08_11 +%WER 19.00 [ 25164 / 132434, 4160 ins, 4851 del, 16153 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2013S08_12 +%WER 19.01 [ 25182 / 132434, 4569 ins, 3748 del, 16865 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2013S08_10 +%WER 19.38 [ 25671 / 132434, 4798 ins, 3359 del, 17514 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2013S08_9 +%WER 19.42 [ 25716 / 132434, 4571 ins, 3923 del, 17222 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2013S08_9 +%WER 19.43 [ 25738 / 132434, 4303 ins, 4685 del, 16750 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2013S08_12 +%WER 22.36 [ 29618 / 132434, 5010 ins, 4337 del, 20271 sub ] 
exp/tri2b/decode/cer_test.LDC2013S08_11 +%WER 23.38 [ 30959 / 132434, 4820 ins, 4772 del, 21367 sub ] exp/tri3b/decode.si/cer_test.LDC2013S08_11 +%WER 24.48 [ 32421 / 132434, 4829 ins, 5141 del, 22451 sub ] exp/tri2a/decode/cer_test.LDC2013S08_12 +%WER 25.74 [ 34093 / 132434, 4727 ins, 5715 del, 23651 sub ] exp/tri1/decode/cer_test.LDC2013S08_12 # CER test.LDC2014S09 -%WER 42.24 [ 53240 / 126027, 16007 ins, 10270 del, 26963 sub ] exp/sgmm_5a/decode/cer_test.LDC2014S09_11 -%WER 43.81 [ 55212 / 126027, 15435 ins, 11971 del, 27806 sub ] exp/tri3b/decode/cer_test.LDC2014S09_15 -%WER 48.72 [ 61395 / 126027, 14667 ins, 15066 del, 31662 sub ] exp/tri2b/decode/cer_test.LDC2014S09_14 -%WER 50.20 [ 63270 / 126027, 15105 ins, 14701 del, 33464 sub ] exp/tri3b/decode.si/cer_test.LDC2014S09_13 -%WER 50.37 [ 63481 / 126027, 13343 ins, 18289 del, 31849 sub ] exp/tri2a/decode/cer_test.LDC2014S09_14 -%WER 51.95 [ 65470 / 126027, 12613 ins, 20231 del, 32626 sub ] exp/tri1/decode/cer_test.LDC2014S09_14 +%WER 37.36 [ 47080 / 126027, 16306 ins, 8137 del, 22637 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2014S09_10 +%WER 37.96 [ 47842 / 126027, 16721 ins, 7781 del, 23340 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2014S09_10 +%WER 38.20 [ 48139 / 126027, 17115 ins, 7320 del, 23704 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2014S09_9 +%WER 38.48 [ 48500 / 126027, 16796 ins, 7735 del, 23969 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2014S09_10 +%WER 38.88 [ 48998 / 126027, 16815 ins, 7810 del, 24373 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2014S09_10 +%WER 39.70 [ 50034 / 126027, 16290 ins, 9049 del, 24695 sub ] exp/sgmm_5a/decode/cer_test.LDC2014S09_11 +%WER 40.55 [ 51106 / 126027, 12063 ins, 15356 del, 23687 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2014S09_10 +%WER 41.02 [ 51702 / 126027, 13460 ins, 15578 del, 22664 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2014S09_12 +%WER 41.03 [ 51703 / 126027, 13360 ins, 13875 del, 24468 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2014S09_11 +%WER 41.08 [ 51774 / 126027, 16137 ins, 10241 del, 25396 sub ] exp/tri3b/decode/cer_test.LDC2014S09_14 +%WER 41.43 [ 52216 / 126027, 12943 ins, 14013 del, 25260 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2014S09_11 +%WER 41.44 [ 52223 / 126027, 11129 ins, 16614 del, 24480 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2014S09_10 +%WER 41.71 [ 52571 / 126027, 13916 ins, 15018 del, 23637 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2014S09_12 +%WER 45.78 [ 57690 / 126027, 14874 ins, 14113 del, 28703 sub ] exp/tri2b/decode/cer_test.LDC2014S09_14 +%WER 47.18 [ 59462 / 126027, 14177 ins, 16374 del, 28911 sub ] exp/tri2a/decode/cer_test.LDC2014S09_13 +%WER 47.21 [ 59502 / 126027, 15174 ins, 14317 del, 30011 sub ] exp/tri3b/decode.si/cer_test.LDC2014S09_14 +%WER 48.49 [ 61109 / 126027, 13331 ins, 18880 del, 28898 sub ] exp/tri1/decode/cer_test.LDC2014S09_13 # CER test.LDC2015S06 -%WER 38.57 [ 38234 / 99132, 12510 ins, 7120 del, 18604 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S06_9 -%WER 40.30 [ 39954 / 99132, 12593 ins, 7986 del, 19375 sub ] exp/tri3b/decode/cer_test.LDC2015S06_12 -%WER 44.83 [ 44438 / 99132, 12639 ins, 8903 del, 22896 sub ] exp/tri2b/decode/cer_test.LDC2015S06_11 -%WER 45.71 [ 45318 / 99132, 12631 ins, 9164 del, 23523 sub ] exp/tri3b/decode.si/cer_test.LDC2015S06_10 -%WER 47.39 [ 46983 / 99132, 12432 ins, 9935 del, 24616 sub ] exp/tri2a/decode/cer_test.LDC2015S06_11 -%WER 49.03 [ 48600 / 99132, 12250 ins, 10831 del, 25519 sub ] exp/tri1/decode/cer_test.LDC2015S06_11 +%WER 34.45 [ 34148 / 99132, 12809 
ins, 5824 del, 15515 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2015S06_8 +%WER 34.95 [ 34650 / 99132, 12868 ins, 5686 del, 16096 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2015S06_8 +%WER 35.23 [ 34921 / 99132, 12869 ins, 5752 del, 16300 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2015S06_8 +%WER 35.53 [ 35225 / 99132, 12701 ins, 6047 del, 16477 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2015S06_9 +%WER 36.01 [ 35700 / 99132, 12722 ins, 6147 del, 16831 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2015S06_9 +%WER 36.86 [ 36538 / 99132, 12698 ins, 6493 del, 17347 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S06_9 +%WER 38.28 [ 37946 / 99132, 11826 ins, 9192 del, 16928 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2015S06_9 +%WER 38.34 [ 38009 / 99132, 12635 ins, 7528 del, 17846 sub ] exp/tri3b/decode/cer_test.LDC2015S06_12 +%WER 38.35 [ 38016 / 99132, 12245 ins, 7821 del, 17950 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2015S06_9 +%WER 38.55 [ 38211 / 99132, 12035 ins, 9225 del, 16951 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2015S06_10 +%WER 38.88 [ 38546 / 99132, 12302 ins, 7668 del, 18576 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2015S06_9 +%WER 39.01 [ 38672 / 99132, 12118 ins, 8917 del, 17637 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2015S06_10 +%WER 39.27 [ 38931 / 99132, 11682 ins, 9477 del, 17772 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2015S06_9 +%WER 42.63 [ 42261 / 99132, 12629 ins, 8447 del, 21185 sub ] exp/tri2b/decode/cer_test.LDC2015S06_11 +%WER 43.52 [ 43141 / 99132, 12665 ins, 8709 del, 21767 sub ] exp/tri3b/decode.si/cer_test.LDC2015S06_10 +%WER 44.95 [ 44562 / 99132, 12330 ins, 10278 del, 21954 sub ] exp/tri2a/decode/cer_test.LDC2015S06_11 +%WER 46.55 [ 46143 / 99132, 12202 ins, 11242 del, 22699 sub ] exp/tri1/decode/cer_test.LDC2015S06_11 # CER test.LDC2015S13 -%WER 17.05 [ 23993 / 140702, 2450 ins, 3594 del, 17949 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S13_8 -%WER 18.39 [ 25872 / 140702, 2257 ins, 4274 del, 19341 sub ] exp/tri3b/decode/cer_test.LDC2015S13_11 -%WER 21.98 [ 30933 / 140702, 2347 ins, 4784 del, 23802 sub ] exp/tri2b/decode/cer_test.LDC2015S13_11 -%WER 23.23 [ 32679 / 140702, 2197 ins, 5383 del, 25099 sub ] exp/tri3b/decode.si/cer_test.LDC2015S13_11 -%WER 23.88 [ 33596 / 140702, 2030 ins, 6225 del, 25341 sub ] exp/tri2a/decode/cer_test.LDC2015S13_12 -%WER 25.47 [ 35842 / 140702, 1944 ins, 6979 del, 26919 sub ] exp/tri1/decode/cer_test.LDC2015S13_12 +%WER 13.50 [ 19001 / 140702, 2366 ins, 2994 del, 13641 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2015S13_8 +%WER 13.88 [ 19524 / 140702, 2365 ins, 2990 del, 14169 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2015S13_8 +%WER 14.11 [ 19858 / 140702, 2383 ins, 3013 del, 14462 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2015S13_8 +%WER 14.33 [ 20158 / 140702, 2389 ins, 3059 del, 14710 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2015S13_8 +%WER 14.67 [ 20640 / 140702, 2482 ins, 2990 del, 15168 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2015S13_7 +%WER 15.42 [ 21702 / 140702, 2337 ins, 3609 del, 15756 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S13_9 +%WER 15.97 [ 22475 / 140702, 1954 ins, 4050 del, 16471 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2015S13_9 +%WER 16.23 [ 22838 / 140702, 1944 ins, 3994 del, 16900 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2015S13_10 +%WER 16.43 [ 23116 / 140702, 1765 ins, 4775 del, 16576 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2015S13_11 +%WER 16.44 [ 23135 / 140702, 2306 ins, 3973 del, 16856 sub ] 
exp/tri3b/decode/cer_test.LDC2015S13_11 +%WER 16.50 [ 23214 / 140702, 2058 ins, 4180 del, 16976 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2015S13_9 +%WER 16.56 [ 23296 / 140702, 2211 ins, 3512 del, 17573 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2015S13_9 +%WER 16.81 [ 23654 / 140702, 1810 ins, 4651 del, 17193 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2015S13_11 +%WER 19.63 [ 27616 / 140702, 2395 ins, 4467 del, 20754 sub ] exp/tri2b/decode/cer_test.LDC2015S13_11 +%WER 21.01 [ 29562 / 140702, 2365 ins, 4751 del, 22446 sub ] exp/tri3b/decode.si/cer_test.LDC2015S13_10 +%WER 21.15 [ 29758 / 140702, 2017 ins, 5741 del, 22000 sub ] exp/tri2a/decode/cer_test.LDC2015S13_12 +%WER 22.48 [ 31633 / 140702, 1928 ins, 6389 del, 23316 sub ] exp/tri1/decode/cer_test.LDC2015S13_12 # CER test.LDC2016S03 -%WER 45.40 [ 106787 / 235216, 53964 ins, 12519 del, 40304 sub ] exp/sgmm_5a/decode/cer_test.LDC2016S03_11 -%WER 46.75 [ 109953 / 235216, 54007 ins, 13639 del, 42307 sub ] exp/tri3b/decode/cer_test.LDC2016S03_15 -%WER 51.08 [ 120139 / 235216, 53593 ins, 16514 del, 50032 sub ] exp/tri2b/decode/cer_test.LDC2016S03_14 -%WER 51.97 [ 122235 / 235216, 52763 ins, 17940 del, 51532 sub ] exp/tri3b/decode.si/cer_test.LDC2016S03_15 -%WER 52.61 [ 123739 / 235216, 47836 ins, 22637 del, 53266 sub ] exp/tri2a/decode/cer_test.LDC2016S03_16 -%WER 54.06 [ 127163 / 235216, 47776 ins, 23865 del, 55522 sub ] exp/tri1/decode/cer_test.LDC2016S03_15 +%WER 40.99 [ 96423 / 235216, 54232 ins, 10004 del, 32187 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2016S03_10 +%WER 41.52 [ 97659 / 235216, 54881 ins, 9585 del, 33193 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2016S03_10 +%WER 41.78 [ 98270 / 235216, 55048 ins, 9598 del, 33624 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2016S03_10 +%WER 42.11 [ 99051 / 235216, 54335 ins, 10340 del, 34376 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2016S03_11 +%WER 42.28 [ 99452 / 235216, 41779 ins, 24370 del, 33303 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2016S03_12 +%WER 42.50 [ 99971 / 235216, 53658 ins, 11074 del, 35239 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2016S03_12 +%WER 43.03 [ 101223 / 235216, 38061 ins, 29066 del, 34096 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2016S03_12 +%WER 43.11 [ 101399 / 235216, 43075 ins, 23843 del, 34481 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2016S03_14 +%WER 43.29 [ 101832 / 235216, 53048 ins, 12380 del, 36404 sub ] exp/sgmm_5a/decode/cer_test.LDC2016S03_12 +%WER 43.33 [ 101926 / 235216, 43250 ins, 22694 del, 35982 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2016S03_13 +%WER 43.86 [ 103167 / 235216, 48178 ins, 19999 del, 34990 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2016S03_14 +%WER 44.76 [ 105279 / 235216, 54744 ins, 12548 del, 37987 sub ] exp/tri3b/decode/cer_test.LDC2016S03_15 +%WER 44.77 [ 105298 / 235216, 49827 ins, 18866 del, 36605 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2016S03_14 +%WER 48.68 [ 114501 / 235216, 52710 ins, 16670 del, 45121 sub ] exp/tri2b/decode/cer_test.LDC2016S03_15 +%WER 49.57 [ 116592 / 235216, 53572 ins, 16111 del, 46909 sub ] exp/tri3b/decode.si/cer_test.LDC2016S03_14 +%WER 50.03 [ 117681 / 235216, 49653 ins, 20856 del, 47172 sub ] exp/tri2a/decode/cer_test.LDC2016S03_15 +%WER 51.54 [ 121232 / 235216, 49022 ins, 23008 del, 49202 sub ] exp/tri1/decode/cer_test.LDC2016S03_15 diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index cb3f1b56cba..f1e39fb452e 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ 
b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -77,6 +77,10 @@ if [ ! -f conf/g2p_model ]; then fi echo "--- Preparing pronunciations for OOV words ..." +if [ ! -x g2p.py ]; then + echo "g2p.py is not found. Check out tools/extra/install_sequitur.sh." + exit 1 +fi g2p.py --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ diff --git a/egs/gale_mandarin/s5/local/split_wer.sh b/egs/gale_mandarin/s5/local/split_wer.sh deleted file mode 100755 index 38cdb3af991..00000000000 --- a/egs/gale_mandarin/s5/local/split_wer.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash - -# Report WER for reports and conversational -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the gale folder, see ../run.sh for example." - exit 1; -fi - -[ -f ./path.sh ] && . ./path.sh - - -galeFolder=$(readlink -f $1) -symtab=./data/lang/words.txt - -#split the test set per type: -awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$ - -# generate the report test set -awk '{print $2}' $galeFolder/report | sort -u > $galeFolder/report_id$$ -comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test - -# generate the conversational test set -awk '{print $2}' $galeFolder/conversational | sort -u > $galeFolder/conversational_id$$ - -comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test - -rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$ - -min_lmwt=9 -max_lmwt=20 -for dir in exp/*/*decode*; do - for type in report conversational; do - #echo "Processing: $dir $type" - rm -fr $dir/scoring_$type - cp -pr $dir/scoring $dir/scoring_$type - ( cd $dir/scoring_$type; - for x in *.tra test_filt.txt; do - sort -u $x > tmp$$ - join tmp$$ $galeFolder/${type}.test > $x - rm -fr tmp$$ - done - ) - -utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ - cat $dir/scoring_${type}/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT -done -done - - -time=$(date +"%Y-%m-%d-%H-%M-%S") -echo "RESULTS generated by $USER at $time" - -echo "Report Results WER:" -for x in exp/*/*decode*; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2 - -echo "Conversational Results WER:" -for x in exp/*/*decode*; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2 - -echo "Combined Results for Reports and Conversational WER:" -for x in exp/*/*decode*; do [ -d $x ] && grep WER $x/wer_?? 
$x/wer_?| utils/best_wer.sh; done | sort -n -k2 - - - - diff --git a/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh index b4a4de94a6d..7fc51e74846 100755 --- a/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh +++ b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh @@ -20,41 +20,37 @@ min_lmwt=7 max_lmwt=20 for dir in exp/*/*decode*; do - for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do - #echo "Processing: $dir $type" - rm -fr $dir/scoring_$type - mkdir -p $dir/scoring_$type/log - for x in $dir/scoring/*.char $dir/scoring/*.tra $dir/scoring/char.filt $dir/scoring/text.filt; do - cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) - done - - utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ - cat $dir/scoring_${type}/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ - compute-wer --text --mode=present \ + for type in $(ls -1 local/test.* | xargs -n1 basename); do + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.char $dir/scoring/*.tra $dir/scoring/char.filt $dir/scoring/text.filt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ + compute-wer --text --mode=present \ ark:$dir/scoring_${type}/text.filt ark,p:- ">&" $dir/wer_${type}_LMWT - utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.cer.LMWT.log \ - cat $dir/scoring_${type}/LMWT.char \| \ - compute-wer --text --mode=present \ + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.cer.LMWT.log \ + cat $dir/scoring_${type}/LMWT.char \| \ + compute-wer --text --mode=present \ ark:$dir/scoring_${type}/char.filt ark,p:- ">&" $dir/cer_${type}_LMWT + done done -done - -time=$(date +"%Y-%m-%d-%H-%M-%S") -echo "#RESULTS splits generated by $USER at $time" -for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do - echo -e "\n# WER $type" - for x in exp/*/*decode*; do - grep WER $x/wer_${type}_* | utils/best_wer.sh; - done | sort -n -k2 +for type in $(ls -1 local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 done for type in $(ls -1 local/test.* | xargs -n1 basename); do - echo -e "\n# CER $type" - for x in exp/*/*decode*; do - grep WER $x/cer_${type}_* | utils/best_wer.sh; - done | sort -n -k2 + echo -e "\n# CER $type" + for x in exp/*/*decode*; do + grep WER $x/cer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 done diff --git a/egs/gale_mandarin/s5/run.sh b/egs/gale_mandarin/s5/run.sh index f6c9f0828b7..2e6e2df492b 100755 --- a/egs/gale_mandarin/s5/run.sh +++ b/egs/gale_mandarin/s5/run.sh @@ -10,20 +10,20 @@ nJobs=64 nDecodeJobs=128 AUDIO=( - /scratch/groups/skhudan1/corpora/LDC2013S08/ - /scratch/groups/skhudan1/corpora/LDC2013S04/ - /scratch/groups/skhudan1/corpora/LDC2014S09/ - /scratch/groups/skhudan1/corpora/LDC2015S06/ - /scratch/groups/skhudan1/corpora/LDC2015S13/ - /scratch/groups/skhudan1/corpora/LDC2016S03/ + /export/corpora/LDC/LDC2013S08/ + /export/corpora/LDC/LDC2013S04/ + /export/corpora/LDC/LDC2014S09/ + /export/corpora/LDC/LDC2015S06/ + /export/corpora/LDC/LDC2015S13/ + /export/corpora/LDC/LDC2016S03/ ) TEXT=( - /scratch/groups/skhudan1/corpora/LDC2013T20/ - 
/scratch/groups/skhudan1/corpora/LDC2013T08/ - /scratch/groups/skhudan1/corpora/LDC2014T28/ - /scratch/groups/skhudan1/corpora/LDC2015T09/ - /scratch/groups/skhudan1/corpora/LDC2015T25/ - /scratch/groups/skhudan1/corpora/LDC2016T12/ + /export/corpora/LDC/LDC2013T20/ + /export/corpora/LDC/LDC2013T08/ + /export/corpora/LDC/LDC2014T28/ + /export/corpora/LDC/LDC2015T09/ + /export/corpora/LDC/LDC2015T25/ + /export/corpora/LDC/LDC2016T12/ ) galeData=GALE/ @@ -52,6 +52,13 @@ local/gale_format_data.sh # want to store MFCC features. mfccdir=mfcc +# spread the mfccs over various machines, as this data-set is quite large. +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then + mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. + utils/create_split_dir.pl /export/b{05,06,07,08}/$USER/kaldi-data/egs/gale_mandarin/s5/$mfcc/storage \ + $mfccdir/storage +fi + for x in train dev ; do utils/fix_data_dir.sh data/$x steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nJobs \ @@ -95,9 +102,12 @@ utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri2a/graph data/dev exp/tri2a/decode & +steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; + # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; + data/train data/lang exp/tri2a_ali exp/tri2b || exit 1; utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & @@ -178,26 +188,20 @@ for n in 1 2 3 4; do data/dev exp/sgmm_5a_mmi_b0.1/decode exp/sgmm_5a_mmi_b0.1/decode$n done -for n in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_test \ - data/dev exp/sgmm_5a/decode exp/sgmm_5a_mmi_onlyRescoreb0.1/decode$n -done - wait -local/nnet/run_dnn.sh +#local/nnet/run_dnn.sh -time=$(date +"%Y-%m-%d-%H-%M-%S") -#get WER -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; \ -done | sort -n -r -k2 > RESULTS.$USER.$time # to make sure you keep the results timed and owned +echo "# Get WER and CER" > RESULTS +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_[0-9]* | utils/best_wer.sh; \ +done | sort -n -r -k2 >> RESULTS +echo "" >> RESULTS +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/cer_[0-9]* | utils/best_wer.sh; \ +done | sort -n -r -k2 >> RESULTS -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time +echo -e "\n# Detailed WER on all corpus dev sets" >> RESULTS +local/split_wer_per_corpus.sh $galeData >> RESULTS echo training succedded exit 0 - - - From 057c6629b913e758f41f91b15c7bb816ec746d31 Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Mon, 12 Dec 2016 20:21:59 -0500 Subject: [PATCH 69/71] rename gale_mandarin jobs-variables --- egs/gale_mandarin/s5/run.sh | 46 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/egs/gale_mandarin/s5/run.sh b/egs/gale_mandarin/s5/run.sh index 2e6e2df492b..74e69e9d12a 100755 --- a/egs/gale_mandarin/s5/run.sh +++ b/egs/gale_mandarin/s5/run.sh @@ -6,8 +6,8 @@ . ./path.sh . 
./cmd.sh -nJobs=64 -nDecodeJobs=128 +num_jobs=64 +num_jobs_decode=128 AUDIO=( /export/corpora/LDC/LDC2013S08/ @@ -61,7 +61,7 @@ fi for x in train dev ; do utils/fix_data_dir.sh data/$x - steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nJobs \ + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs \ data/$x exp/make_mfcc/$x $mfccdir utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir @@ -78,7 +78,7 @@ steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ data/train.10k data/lang exp/mono || exit 1; # Get alignments from monophone system. -steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train.50k data/lang exp/mono exp/mono_ali.50k || exit 1; # train tri1 [first triphone pass] @@ -87,10 +87,10 @@ steps/train_deltas.sh --cmd "$train_cmd" \ # First triphone decoding utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; -steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ +steps/decode.sh --nj $num_jobs_decode --cmd "$decode_cmd" \ exp/tri1/graph data/dev exp/tri1/decode & -steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri1 exp/tri1_ali || exit 1; # Train tri2a, which is deltas+delta+deltas @@ -99,60 +99,60 @@ steps/train_deltas.sh --cmd "$train_cmd" \ # tri2a decoding utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph || exit 1; -steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ +steps/decode.sh --nj $num_jobs_decode --cmd "$decode_cmd" \ exp/tri2a/graph data/dev exp/tri2a/decode & -steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ data/train data/lang exp/tri2a_ali exp/tri2b || exit 1; utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1; -steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & +steps/decode.sh --nj $num_jobs_decode --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & # Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; # Do MMI on top of LDA+MLLT. -steps/make_denlats.sh --nj $nJobs --cmd "$train_cmd" \ +steps/make_denlats.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1; steps/train_mmi.sh data/train data/lang exp/tri2b_ali \ exp/tri2b_denlats exp/tri2b_mmi -steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 4 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it4 & -steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 3 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it3 & # Do the same with boosting. 
 steps/train_mmi.sh --boost 0.1 data/train data/lang exp/tri2b_ali \
  exp/tri2b_denlats exp/tri2b_mmi_b0.1
-steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \
+steps/decode.sh --iter 4 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \
  data/dev exp/tri2b_mmi_b0.1/decode_it4 &
-steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \
+steps/decode.sh --iter 3 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \
  data/dev exp/tri2b_mmi_b0.1/decode_it3 &
 
 # Do MPE.
 steps/train_mpe.sh data/train data/lang exp/tri2b_ali exp/tri2b_denlats exp/tri2b_mpe || exit 1;
-steps/decode.sh --iter 4 --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph \
+steps/decode.sh --iter 4 --nj $num_jobs_decode --cmd "$decode_cmd" exp/tri2b/graph \
  data/dev exp/tri2b_mpe/decode_it4 &
-steps/decode.sh --iter 3 --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph \
+steps/decode.sh --iter 3 --nj $num_jobs_decode --cmd "$decode_cmd" exp/tri2b/graph \
  data/dev exp/tri2b_mpe/decode_it3 &
 
 # From 2b system, train 3b which is LDA + MLLT + SAT.
 steps/train_sat.sh --cmd "$train_cmd" \
  5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
 utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph|| exit 1;
-steps/decode_fmllr.sh --nj $nDecodeJobs --cmd "$decode_cmd" \
+steps/decode_fmllr.sh --nj $num_jobs_decode --cmd "$decode_cmd" \
  exp/tri3b/graph data/dev exp/tri3b/decode &
 
 # From 3b system, align all data.
-steps/align_fmllr.sh --nj $nJobs --cmd "$train_cmd" \
+steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \
  data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
 
 ## SGMM (subspace gaussian mixture model), excluding the "speaker-dependent weights"
@@ -163,14 +163,14 @@ steps/train_sgmm2.sh --cmd "$train_cmd" 5000 20000 data/train data/lang exp/tri3
  exp/ubm5a/final.ubm exp/sgmm_5a || exit 1;
 
 utils/mkgraph.sh data/lang_test exp/sgmm_5a exp/sgmm_5a/graph || exit 1;
-steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \
+steps/decode_sgmm2.sh --nj $num_jobs_decode --cmd "$decode_cmd" --config conf/decode.config \
  --transform-dir exp/tri3b/decode exp/sgmm_5a/graph data/dev exp/sgmm_5a/decode &
 
-steps/align_sgmm2.sh --nj $nJobs --cmd "$train_cmd" --transform-dir exp/tri3b_ali \
+steps/align_sgmm2.sh --nj $num_jobs --cmd "$train_cmd" --transform-dir exp/tri3b_ali \
  --use-graphs true --use-gselect true data/train data/lang exp/sgmm_5a exp/sgmm_5a_ali || exit 1;
 
 ## boosted MMI on SGMM
-steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split $nJobs --beam 9.0 --lattice-beam 6 \
+steps/make_denlats_sgmm2.sh --nj $num_jobs --sub-split $num_jobs --beam 9.0 --lattice-beam 6 \
  --cmd "$decode_cmd" --num-threads 4 --transform-dir exp/tri3b_ali \
  data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1;
 
@@ -180,7 +180,7 @@ steps/train_mmi_sgmm2.sh --cmd "$train_cmd" --num-iters 8 --transform-dir exp/tr
 
 #decode GMM MMI
 utils/mkgraph.sh data/lang_test exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph || exit 1;
-steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \
+steps/decode_sgmm2.sh --nj $num_jobs_decode --cmd "$decode_cmd" --config conf/decode.config \
  --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode
 
 for n in 1 2 3 4; do

From 65c2cb85f509f7debbe89e4f8d03f31c09d535ea Mon Sep 17 00:00:00 2001
From: Tom Ko
Date: Tue, 13 Dec 2016 02:07:12 -0500
Subject: [PATCH 70/71] fix typo in mkgraph.sh

---
 egs/wsj/s5/utils/mkgraph.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh
index c62f0ccb14f..3305d628f83 100755
--- a/egs/wsj/s5/utils/mkgraph.sh
+++ b/egs/wsj/s5/utils/mkgraph.sh
@@ -21,7 +21,7 @@ loopscale=0.1
 remove_oov=false
 
 for x in `seq 4`; do
-  [ "$1" == "--mono" -o "$1" == "left-biphone" -o "$1" == "--quinphone" ] && shift && \
+  [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \
     echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored."
   [ "$1" == "--remove-oov" ] && remove_oov=true && shift;
   [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2;

From e49eaac1fbe5ef9116c54672a114005386a3b070 Mon Sep 17 00:00:00 2001
From: Xingyu Na
Date: Tue, 13 Dec 2016 03:03:27 -0500
Subject: [PATCH 71/71] Fix a script bug in gale_mandarin

---
 egs/gale_mandarin/s5/local/gale_prep_dict.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh
index f1e39fb452e..bf2391d3bd7 100755
--- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh
+++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh
@@ -77,7 +77,8 @@ if [ ! -f conf/g2p_model ]; then
 fi
 
 echo "--- Preparing pronunciations for OOV words ..."
-if [ ! -x g2p.py ]; then
+g2p=`which g2p.py`
+if [ ! -x $g2p ]; then
   echo "g2p.py is not found. Checkout tools/extra/install_sequitur.sh."
   exit 1
 fi