diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh index eaa5a77949f..218890cc418 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh @@ -1,69 +1,10 @@ #!/bin/bash - -# _2i is as _2d but with a new set of code for estimating the LM, in which we compute -# the log-like change when deciding which states to back off. The code is not the same -# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By -# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration -# is quite similar to 2d, except new/more-exact code is used. - -# see table in run_tdnn_2a.sh for results - -# _2d is as _2c but with different LM options: -# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" -# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. -# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions -# provided from the tree-building, and effectively puts the leftmost context position as a single -# set. -# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg -# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. - -# _2c is as _2a but after a code change in which we start using transition-scale -# and self-loop-scale of 1 instead of zero in training; we change the options to -# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect -# results at all; it's is mainly for convenience in pushing weights in graphs, -# and checking that graphs are stochastic. - -# _2a is as _z but setting --lm-opts "--num-extra-states=8000". - -# _z is as _x but setting --lm-opts "--num-extra-states=2000". -# (see also y, which has --num-extra-states=500). - -# _x is as _s but setting --lm-opts "--num-extra-states=0". -# this is a kind of repeat of the u->v experiment, where it seemed to make things -# worse, but there were other factors involved in that so I want to be sure. - -# _s is as _q but setting pdf-boundary-penalty to 0.0 -# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, -# and 18.07 -> 16.96 on train_dev, after fg rescoring. - -# _q is as _p except making the same change as from n->o, which -# reduces the parameters to try to reduce over-training. We reduce -# relu-dim from 1024 to 850, and target num-states from 12k to 9k, -# and modify the splicing setup. -# note: I don't rerun the tree-building, I just use the '5o' treedir. - -# _p is as _m except with a code change in which we switch to a different, more -# exact mechanism to deal with the edges of the egs, and correspondingly -# different script options... we now dump weights with the egs, and apply the -# weights to the derivative w.r.t. the output instead of using the -# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap -# to 30 also. This wil. give 10 frames on each side with zero derivs, then -# ramping up to a weight of 1.0 over 10 frames. - -# _m is as _k but after a code change that makes the denominator FST more -# compact. I am rerunning in order to verify that the WER is not changed (since -# it's possible in principle that due to edge effects related to weight-pushing, -# the results could be a bit different). -# The results are inconsistently different but broadly the same. On all of eval2000, -# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
-# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
-
-
-# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
-# option and setting max-param-change to 1..  Using the same egs.
-
+# _2i is as _i but it uses speaker perturbation combined with speed perturbation.
 # _i is as _h but longer egs: 150 frames instead of 75, and
 # 128 elements per minibatch instead of 256.
+# be cautious comparing the valid probs with h though, because
+# we fixed the utt2uniq bug at this point, so from h on, the valid probs
+# are properly held out.
 
 # _h is as _g but different application of max-param-change (use --scale-max-param-change true)
 
@@ -93,21 +34,23 @@ set -e
 # configs for 'chain'
-stage=12
+stage=1
 train_stage=-10
 get_egs_stage=-10
 speed_perturb=true
+speaker_perturb=true
 dir=exp/chain/tdnn_2i  # Note: _sp will get added to this if $speed_perturb == true.
 
 # TDNN options
-splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3"
+splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0"
 
 # training options
 num_epochs=4
 initial_effective_lrate=0.001
 final_effective_lrate=0.0001
 leftmost_questions_truncate=30
-max_param_change=1.0
+max_param_change=0.3333
+scale_max_param_change=true
 final_layer_normalize_target=0.5
 num_jobs_initial=3
 num_jobs_final=16
@@ -138,16 +81,19 @@ suffix=
 if [ "$speed_perturb" == "true" ]; then
   suffix=_sp
 fi
+if [ "$speaker_perturb" == "true" ]; then
+  suffix=$suffix"_fp"
+fi
 dir=${dir}$suffix
 train_set=train_nodup$suffix
 ali_dir=exp/tri4_ali_nodup$suffix
-treedir=exp/chain/tri5o_tree$suffix
+treedir=exp/chain/tri5f_tree$suffix
 
 # if we are using the speed-perturbed data we need to generate
 # alignments for it.
-local/nnet3/run_ivector_common.sh --stage $stage \
-  --speed-perturb $speed_perturb \
+local/nnet3/run_ivector_common_v2.sh --stage $stage \
+  --speed-perturb $speed_perturb --speaker-perturb $speaker_perturb \
   --generate-alignments $speed_perturb || exit 1;
 
 
@@ -161,6 +107,7 @@ if [ $stage -le 9 ]; then
 fi
 
 
+if false; then #100
 if [ $stage -le 10 ]; then
   # Create a version of the lang/ directory that has one state per phone in the
   # topo file. [note, it really has two states.. the first one is only repeated
@@ -179,23 +126,23 @@ if [ $stage -le 11 ]; then
   # Build a tree using our new topology.
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
       --leftmost-questions-truncate $leftmost_questions_truncate \
-      --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir
+      --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir
 fi
+fi #100
 
 if [ $stage -le 12 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
   utils/create_split_dir.pl \
-    /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+    /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
  fi
 
 touch $dir/egs/.nodelete # keep egs around when that run dies.
 steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
-    --pdf-boundary-penalty 0.0 \
-    --lm-opts "--num-extra-lm-states=2000" \
     --get-egs-stage $get_egs_stage \
+    --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \
     --minibatch-size $minibatch_size \
-    --egs-opts "--frames-overlap-per-eg 30" \
+    --egs-opts "--frames-overlap-per-eg 10 --nj 40" \
     --frames-per-eg $frames_per_eg \
     --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
     --splice-indexes "$splice_indexes" \
@@ -205,7 +152,7 @@ if [ $stage -le 12 ]; then
     --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
     --max-param-change $max_param_change \
     --final-layer-normalize-target $final_layer_normalize_target \
-    --relu-dim 850 \
+    --relu-dim 1024 \
     --cmd "$decode_cmd" \
     --remove-egs $remove_egs \
     data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
@@ -215,7 +162,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh --transition-scale 0.0 \
+    --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh
new file mode 100755
index 00000000000..d46d5cc7238
--- /dev/null
+++ b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+
+. ./cmd.sh
+set -e
+stage=1
+train_stage=-10
+generate_alignments=true # false if doing ctc training
+speed_perturb=true
+speaker_perturb=true
+lpc_order=100
+filter_nj=30
+spkf_per_spk=3
+perturb_suffix=""
+
+. ./path.sh
+. ./utils/parse_options.sh
+
+mkdir -p nnet3
+# perturbed data preparation
+train_set=train_nodup
+
+if $speed_perturb; then
+  perturb_suffix="_sp"
+fi
+
+if $speaker_perturb; then
+  perturb_suffix=$perturb_suffix"_fp"
+fi
+
+if [ "$speed_perturb" == "true" ]; then
+  if [ $stage -le 1 ]; then
+    echo "speed-perturb the data"
+    # Although the nnet will be trained on high-resolution data, we still have
+    # to perturb the normal-resolution data to get the alignments.
+    # _sp stands for speed-perturbed.
+
+    for datadir in train_nodup; do
+      utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp_sp1
+      utils/perturb_data_dir_speed.sh 0.95 data/${datadir} data/temp_sp2
+      utils/perturb_data_dir_speed.sh 1.05 data/${datadir} data/temp_sp3
+      utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp_sp4
+
+      utils/combine_data.sh data/${datadir}_temp_sp data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4
+      utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp
+      rm -r data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4
+
+      if [ "$speaker_perturb" == "true" ]; then
+        echo "speaker perturbation of data"
+        utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp_sp0
+        utils/combine_data.sh data/${datadir}_sp data/${datadir}_temp_sp data/temp_sp0
+        utils/fix_data_dir.sh data/${datadir}_sp
+
+        # compute a filter corresponding to each speed-perturbed speaker.
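+        # A rough sketch of the mechanism (see src/featbin/compute-filter.cc
+        # and src/featbin/apply-filter.cc below): compute-filter estimates a
+        # per-speaker LPC whitening filter [1, -a_1, ..., -a_P] from the
+        # accumulated autocorrelation of that speaker's waveforms, and
+        # apply-filter later convolves a signal with such a filter (FIR,
+        # removing the speaker's spectral envelope) or with its approximate
+        # inverse (IIR, imposing another speaker's envelope).  For example
+        # (hypothetical paths, a single-speaker wav.scp assumed):
+        #   compute-filter --lpc-order=$lpc_order scp:data/spk1/wav.scp \
+        #     ark,scp:spk1_filter.ark,spk1_filter.scp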
+        spk_filters=spkfilters
+        mkdir -p $spk_filters
+        utils/split_data.sh data/${datadir}_sp $filter_nj
+        echo $filter_nj > data/${datadir}_sp/num_filter_jobs
+
+        $decode_cmd JOB=1:$filter_nj data/${datadir}_sp/split$filter_nj/compute_filter.JOB.log \
+          compute-filter --lpc-order=$lpc_order scp:data/${datadir}_sp/split$filter_nj/JOB/wav.scp \
+          ark,scp:$spk_filters/spk_filter.JOB.ark,$spk_filters/spk_filter.JOB.scp || exit 1;
+
+        # combine the spk_filter.scp files together
+        for n in $(seq $filter_nj); do
+          cat $spk_filters/spk_filter.$n.scp || exit 1;
+        done > data/${datadir}_sp/spk_filter.scp
+        echo "Finished generating filters per speaker."
+
+        echo "Perturb data using speaker perturbation."
+        utils/perturb_data_signal_v2.sh $spkf_per_spk 'fp' data/${datadir}_sp data/${datadir}_temp_sp_fp
+        utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp_fp
+      fi
+
+      echo "perturb_suffix=$perturb_suffix"
+      mfccdir=mfcc_perturbed
+      echo "Generating features using perturbed data"
+      steps/make_mfcc.sh --cmd "$decode_cmd" --nj 50 \
+        data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1;
+      steps/compute_cmvn_stats.sh data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1;
+      utils/fix_data_dir.sh data/${datadir}_temp${perturb_suffix}
+
+      utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
+      utils/combine_data.sh data/${datadir}${perturb_suffix} data/${datadir}_temp${perturb_suffix} data/temp0
+      utils/fix_data_dir.sh data/${datadir}${perturb_suffix}
+      rm -r data/temp0 data/${datadir}_temp${perturb_suffix}
+    done
+  fi
+
+  if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
+    # obtain the alignment of the perturbed data
+    steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
+      data/train_nodup${perturb_suffix} data/lang_nosp exp/tri4 exp/tri4_ali_nodup${perturb_suffix} || exit 1
+  fi
+fi
+
+train_set=train_nodup${perturb_suffix}
+if [ $stage -le 3 ]; then
+  mfccdir=mfcc_hires
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    date=$(date +'%m_%d_%H_%M')
+    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage
+  fi
+
+  # the 100k_nodup directory is copied separately, as
+  # we want to use exp/tri2_ali_100k_nodup for lda_mllt training
+  # the main train directory might be speed-perturbed
+  for dataset in $train_set train_100k_nodup; do
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
+
+    # scale the waveforms; this is useful as we don't use CMVN
+    data_dir=data/${dataset}_hires
+    cat $data_dir/wav.scp | python -c "
+import sys, os, subprocess, re, random
+scale_low = 1.0/8
+scale_high = 2.0
+for line in sys.stdin.readlines():
+  if len(line.strip()) == 0:
+    continue
+  print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high))
+"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1;
+    mv $data_dir/wav.scp_scaled $data_dir/wav.scp
+
+    steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir;
+
+    # Remove the small number of utterances that couldn't be extracted for some
+    # reason (e.g. too short; no such file).
+    utils/fix_data_dir.sh data/${dataset}_hires;
+  done
+  if false; then #300
+  for dataset in eval2000 train_dev; do
+    # Create MFCCs for the eval set
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
+        data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    utils/fix_data_dir.sh data/${dataset}_hires  # remove segments with problems
+  done
+  fi #300
+  # Take the first 30k utterances (about 1/8th of the data); this will be used
+  # for the diag-UBM training
+  utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires
+  local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires  # 33hr
+fi
+if false; then #400
+# ivector extractor training
+if [ $stage -le 5 ]; then
+  # We need to build a small system just because we need the LDA+MLLT transform
+  # to train the diag-UBM on top of.  We use --num-iters 13 because after we get
+  # the transform (12th iter is the last), any further training is pointless.
+  # this decision is based on fisher_english
+  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+    --splice-opts "--left-context=3 --right-context=3" \
+    5500 90000 data/train_100k_nodup_hires \
+    data/lang_nosp exp/tri2_ali_100k_nodup exp/nnet3/tri3b
+fi
+
+if [ $stage -le 6 ]; then
+  # To train a diagonal UBM we don't need very much data, so use the smallest subset.
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
+    data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm
+fi
+
+if [ $stage -le 7 ]; then
+  # iVector extractors can be sensitive to the amount of data, but this one has a
+  # fairly small dim (defaults to 100) so we don't use all of it, we use just the
+  # 100k subset (just under half the data).
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+    data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+fi
+fi #400
+
+if [ $stage -le 8 ]; then
+  # We extract iVectors on all the train_nodup data, which will be what we
+  # train the system on.
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+    data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
+
+  for data_set in eval2000 train_dev; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+      data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
+  done
+fi
+
+exit 0;
diff --git a/egs/wsj/s5/utils/perturb_data_signal.sh b/egs/wsj/s5/utils/perturb_data_signal.sh
new file mode 100755
index 00000000000..dbc123769d1
--- /dev/null
+++ b/egs/wsj/s5/utils/perturb_data_signal.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+
+# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+#           2014  Tom Ko
+# Apache 2.0
+
+# This script operates on a directory, such as data/train/,
+# that contains some subset of the following files:
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+#  spk_filter.scp
+# It generates the files which are used for perturbing the data at signal level.
+
+. utils/parse_options.sh
+
+if [ $# != 3 ]; then
+  echo "Usage: perturb_data_signal.sh <prefix> <srcdir> <destdir>"
+  echo "e.g.:"
+  echo " $0 'fp01' data/train_si284 data/train_si284p"
+  exit 1
+fi
+
+export LC_ALL=C
+
+prefix=$1
+srcdir=$2
+destdir=$3
+spk_prefix=$prefix"-"
+utt_prefix=$prefix"-"
+
+for f in spk2utt text utt2spk wav.scp spk_filter.scp; do
+  [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1;
+done
+
+set -e;
+set -o pipefail
+
+mkdir -p $destdir
+
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map
+cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq
+
+cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
+  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
+
+
+# The following perl script is the core part.
+
+echo $spk_prefix | perl -e '
+  $prefix = <STDIN>;
+  chomp($prefix);
+  ($u2s_in, $seg_in, $wav_in, $filt_in, $wav_out) = @ARGV;
+  if (open(SEG, "<$seg_in")) {
+    $have_segments="true";
+  } else {
+    $have_segments="false";
+  }
+  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
+  open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n";
+  open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n";
+  open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n";
+  while (<UI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    ($utt_id, $spk) = @col;
+    $utt2spk{$utt_id} = $spk;
+  }
+  if ($have_segments eq "true") {
+    while (<SEG>) {
+      chomp;
+      @col = split;
+      $reco2utt{$col[1]} = $col[0];
+    }
+  }
+  while (<WI>) {
+    chomp;
+    @col = split;
+    $pipe = join(" ", @col[1..@col-1]);
+    $reco2pipe{$col[0]} = $pipe;
+    $recolist{$col[0]} = $col[0];
+    if ($have_segments eq "false") {
+      $reco2utt{$col[0]} = $col[0];
+    }
+  }
+  while (<FI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    $spk2filt{$col[0]} = $col[1];
+  }
+
+  foreach $reco (sort keys %recolist) {
+    #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}};
+    #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}};
+    $reco2spk{$reco} = $reco;
+    $reco2filt{$reco} = $spk2filt{$reco};
+    if ($reco2filt{$reco} eq "") {
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2spk{$reco} = $spk;
+      $reco2filt{$reco} = $spk2filt{$spk};
+    }
+    while (1) {
+      # randomly pick a filter from another speaker
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2perturbspk{$reco} = $spk;
+      $reco2perturbfilt{$reco} = $spk2filt{$spk};
+      if ($reco2perturbfilt{$reco} ne $reco2filt{$reco}) {
+        last;
+      }
+    }
+  }
+
+  foreach $reco (sort keys %recolist) {
+    print WO "$prefix$reco $reco2pipe{$reco} apply-filter --inverse=false \"ark:echo $reco2spk{$reco} $reco2filt{$reco} | \" - - | apply-filter --inverse=true \"ark:echo $reco2perturbspk{$reco} $reco2perturbfilt{$reco} | \" - - |\n";
  }

' $srcdir/utt2spk $srcdir/segments $srcdir/wav.scp \
$srcdir/spk_filter.scp $destdir/wav.scp

if [ -f $srcdir/segments ]; then
  # also apply the spk_prefix to the recording-ids.
+  cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map
+
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments
+
+  if [ -f $srcdir/reco2file_and_channel ]; then
+    utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+  fi
+
+  rm $destdir/reco_map 2>/dev/null
+fi
+
+if [ -f $srcdir/text ]; then
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
+fi
+if [ -f $srcdir/spk2gender ]; then
+  utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
+fi
+
+
+rm $destdir/spk_map $destdir/utt_map 2>/dev/null
+echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir"
+utils/validate_data_dir.sh --no-feats $destdir
diff --git a/egs/wsj/s5/utils/perturb_data_signal_v2.sh b/egs/wsj/s5/utils/perturb_data_signal_v2.sh
new file mode 100755
index 00000000000..c205b67e5e0
--- /dev/null
+++ b/egs/wsj/s5/utils/perturb_data_signal_v2.sh
@@ -0,0 +1,187 @@
+#!/bin/bash
+
+# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+#           2014  Tom Ko
+# Apache 2.0
+
+# This script operates on a directory, such as data/train/,
+# that contains some subset of the following files:
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+#  spk_filter.scp
+# It generates the files which are used for perturbing the data at signal level.
+
+. utils/parse_options.sh
+
+if [ $# != 4 ]; then
+  echo "Usage: perturb_data_signal_v2.sh <num-parts> <prefix> <srcdir> <destdir>"
+  echo "e.g.:"
+  echo " $0 3 'fp01' data/train_si284 data/train_si284p"
+  exit 1
+fi
+
+export LC_ALL=C
+
+num_parts=$1
+prefix=$2
+srcdir=$3
+destdir=$4
+spk_prefix=$prefix"-"
+utt_prefix=$prefix"-"
+
+for f in spk2utt text utt2spk wav.scp spk_filter.scp; do
+  [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1;
+done
+
+set -e;
+set -o pipefail
+
+mkdir -p $destdir
+
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map
+cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq
+
+cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
+  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
+
+
+# The following perl script is the core part.
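+# In outline (a sketch of what the script below does): it reads utt2spk,
+# spk2utt, segments (if present), wav.scp and spk_filter.scp; splits each
+# speaker's utterances into num_parts parts; for each part picks a filter
+# from a randomly chosen *different* speaker; and writes wav.scp entries
+# whose pipes first whiten the signal with the original speaker's filter
+# (apply-filter, FIR) and then re-color it with the other speaker's inverse
+# filter (apply-filter --inverse=true, IIR).  A generated entry looks
+# roughly like this (hypothetical ids):
+#   fp01-sw02001-00 <original pipe> apply-filter "scp:echo sw02001 <filt> |" - - | \
+#     apply-filter --inverse=true "scp:echo sw02054 <filt> |" - - |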
+
+echo $spk_prefix | perl -e '
+  $prefix = <STDIN>;
+  chomp($prefix);
+  ($num_parts, $u2s_in, $s2u_in, $seg_in, $wav_in, $filt_in, $wav_out, $seg_out) = @ARGV;
+  if (open(SEG, "<$seg_in")) {
+    $have_segments="true";
+  } else {
+    $have_segments="false";
+  }
+  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
+  open(SI, "<$s2u_in") || die "Error: fail to open $s2u_in\n";
+  open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n";
+  open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n";
+  open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n";
+  open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n";
+  while (<UI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    ($utt_id, $spk) = @col;
+    $utt2spk{$utt_id} = $spk;
+  }
+  while (<SI>) {
+    chomp;
+    @col = split;
+    $spks = join(" ", @col[1..@col-1]);
+    $spk2utt{$col[0]} = $spks;
+  }
+  if ($have_segments eq "true") {
+    while (<SEG>) {
+      chomp;
+      @col = split;
+      $seg = join(" ", @col[2..@col-1]);
+      $reco2utt{$col[1]} = $col[0];
+      $utt2reco{$col[0]} = $col[1];
+      $utt2seg{$col[0]} = $seg;
+    }
+  }
+  while (<WI>) {
+    chomp;
+    @col = split;
+    $pipe = join(" ", @col[1..@col-1]);
+    $reco2pipe{$col[0]} = $pipe;
+    $recolist{$col[0]} = $col[0];
+    if ($have_segments eq "false") {
+      $reco2utt{$col[0]} = $col[0];
+    }
+  }
+  while (<FI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    $spk2filt{$col[0]} = $col[1];
+  }
+
+  foreach $reco (sort keys %recolist) {
+    #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}};
+    #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}};
+    $reco2spk{$reco} = $reco;
+    $reco2filt{$reco} = $spk2filt{$reco};
+    for (my $i=0; $i < $num_parts; $i++) {
+      $newreco2spk{$reco.$i} = $reco;
+    }
+    @spk2filt_rand{keys %spk2filt} = @spk2filt{keys %spk2filt};
+    delete $spk2filt_rand{$reco};
+    if ($reco2filt{$reco} eq "") {
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2spk{$reco} = $spk;
+      $reco2filt{$reco} = $spk2filt{$spk};
+      delete $spk2filt_rand{$spk};
+    }
+    for (my $i=0; $i < $num_parts; $i++) {
+      # randomly pick a filter from another speaker
+      $spk = (keys %spk2filt_rand)[rand keys %spk2filt_rand];
+      $newreco2perturbspk{$reco.$i} = $spk;
+      $newreco2perturbfilt{$reco.$i} = $spk2filt{$spk};
+      delete $spk2filt_rand{$spk};
+    }
+  }
+
+  foreach $spk (sort keys %spk2utt) {
+    @utts = split(" ", $spk2utt{$spk});
+    $numutts = @utts;
+    if ($numutts < $num_parts) {
+      $partsize = $numutts;
+    } else {
+      $partsize = $numutts / $num_parts;
+    }
+    for (my $i=0; $i < $numutts; $i++) {
+      $partid = int($i / $partsize);
+      $utt = $utts[$i];
+      $filled = sprintf "%02d", $partid;
+      print SO "$prefix$utt $prefix$utt2reco{$utt}-$filled $utt2seg{$utt}\n";
+      $newrecolist{"$prefix$utt2reco{$utt}-$filled"} = "$prefix$utt2reco{$utt}-$filled";
+    }
+  }
+
+  foreach $reco (sort keys %recolist) {
+    for (my $i=0; $i < $num_parts; $i++) {
+      $filled = sprintf "%02d", $i;
+      if ($newrecolist{"$prefix$reco-$filled"} ne "") {
+        print WO "$prefix$reco-$filled $reco2pipe{$reco} apply-filter \"scp:echo $reco2spk{$reco} $reco2filt{$reco} |\" - - | apply-filter --inverse=true \"scp:echo $newreco2perturbspk{$reco.$i} $newreco2perturbfilt{$reco.$i} |\" - - |\n";
+      }
+    }
+  }
+
+' $num_parts $srcdir/utt2spk $srcdir/spk2utt $srcdir/segments $srcdir/wav.scp \
+$srcdir/spk_filter.scp $destdir/wav.scp $destdir/segments
+
+if [ -f $srcdir/segments ]; then
+  # also apply the spk_prefix to the recording-ids.
+  cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map
+
+#  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments
+
+#  if [ -f $srcdir/reco2file_and_channel ]; then
+#    utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+#  fi
+
+  rm $destdir/reco_map 2>/dev/null
+fi
+
+if [ -f $srcdir/text ]; then
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
+fi
+if [ -f $srcdir/spk2gender ]; then
+  utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
+fi
+
+
+rm $destdir/spk_map $destdir/utt_map 2>/dev/null
+echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir"
+utils/validate_data_dir.sh --no-feats $destdir
diff --git a/src/feat/signal.cc b/src/feat/signal.cc
index e8fbb0b84cf..a374c531e3d 100644
--- a/src/feat/signal.cc
+++ b/src/feat/signal.cc
@@ -38,7 +38,7 @@ void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal)
   signal_padded.SetZero();
   for (int32 i = 0; i < signal_length; i++) {
     for (int32 j = 0; j < filter_length; j++) {
-      signal_padded(i + j) += (*signal)(i) * filter(j);
+      signal_padded(i+j) += (*signal)(i) * filter(j);
     }
   }
   signal->CopyFromVec(signal_padded.Range(0, signal_length));
@@ -70,7 +70,8 @@ void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat>
   signal->CopyFromVec(signal_padded.Range(0, signal_length));
 }
 
-void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
+void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal,
+                                  bool apply_inverse) {
   int32 signal_length = signal->Dim();
   int32 filter_length = filter.Dim();
@@ -86,6 +87,30 @@ void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseF
   Vector<BaseFloat> filter_padded(fft_length);
   filter_padded.Range(0, filter_length).CopyFromVec(filter);
   srfft.Compute(filter_padded.Data(), true);
+
+  // If apply_inverse is true, the inverse of the filter is computed and
+  // the input signal is convolved with that inverse.
+  // The inverse filter H_inv(w) is estimated as
+  // conj(H(w)) / (abs(H(w))^2 + const)
+  if (apply_inverse) {
+    BaseFloat abs_Hw, const_val = 0.0;
+    int32 half_N = filter_padded.Dim() / 2;
+    Vector<BaseFloat> inv_filter_padded(filter_padded);
+    inv_filter_padded(0) =
+      filter_padded(0) / (filter_padded(0) * filter_padded(0) + const_val);
+    inv_filter_padded(1) =
+      filter_padded(1) / (filter_padded(1) * filter_padded(1) + const_val);
+    for (int32 bin = 1; bin < half_N; bin++) {
+      int32 w_real_ind = 2 * bin,
+          w_im_ind = 2 * bin + 1;
+      abs_Hw = filter_padded(w_real_ind) * filter_padded(w_real_ind) +
+          filter_padded(w_im_ind) * filter_padded(w_im_ind);
+
+      inv_filter_padded(w_real_ind) /= (abs_Hw + const_val);
+      inv_filter_padded(w_im_ind) *= -1.0 / (abs_Hw + const_val);
+    }
+    filter_padded.CopyFromVec(inv_filter_padded);
+  }
 
   Vector<BaseFloat> temp_pad(filter_length - 1);
   temp_pad.SetZero();
diff --git a/src/feat/signal.h b/src/feat/signal.h
index 7ff0ce33b52..b9a49473b96 100644
--- a/src/feat/signal.h
+++ b/src/feat/signal.h
@@ -44,7 +44,8 @@ void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat>
    overlap-add method.  This is an efficient way to evaluate the discrete
    convolution of a long signal with a finite impulse response filter.
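+   If apply_inverse is true, the filter's FFT H(w) is first replaced by the
+   approximate inverse conj(H(w)) / (|H(w)|^2 + c) (see signal.cc above), so
+   the call deconvolves rather than convolves; c is a small regularizer
+   (currently 0.0) that could guard against division by zero at spectral
+   nulls.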
 */
-void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
+void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal,
+                                  bool apply_inverse = false);
 
 } // namespace kaldi
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 9843e7bbd4b..bff2b212a5b 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -15,7 +15,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
            process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \
            compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \
            wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \
-           concat-feats
+           concat-feats compute-filter apply-filter
 
 OBJFILES =
diff --git a/src/featbin/apply-filter.cc b/src/featbin/apply-filter.cc
new file mode 100644
index 00000000000..18069ab33a6
--- /dev/null
+++ b/src/featbin/apply-filter.cc
@@ -0,0 +1,90 @@
+// featbin/apply-filter.cc
+
+// Copyright 2016  Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/wave-reader.h"
+#include "feat/signal.h"
+
+namespace kaldi {
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+
+    const char *usage =
+      "Apply a filter to a wave file supplied via an input pipe, as an FIR or IIR filter.\n"
+      "If --inverse=false, the filter is applied as an FIR filter,\n"
+      "and if --inverse=true, the inverse of the filter is applied as an IIR filter.\n"
+      "Usage: apply-filter [options...] <filter-rspecifier> "
+      "<input-wave-file> <output-wave-file>\n"
+      "e.g.\n"
+      "apply-filter --inverse=false --utt2spkfilter=ark:data/train/utt2spkfilter \n"
+      "  ark:filter.ark input.wav output_1.wav\n";
+    ParseOptions po(usage);
+
+    bool inverse = false;
+    std::string utt2spkfilter_rspecifier = "";
+    po.Register("inverse", &inverse,
+                "If false, the filter is applied as an FIR filter; "
+                "otherwise its inverse is applied as an IIR filter.");
+    po.Register("utt2spkfilter", &utt2spkfilter_rspecifier,
+                "rspecifier for utterance to speaker-filter list map"
+                " used to filter each utterance");
+    po.Read(argc, argv);
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+    std::string input_wave_file = po.GetArg(2),
+      filter_file = po.GetArg(1),
+      output_wave_file = po.GetArg(3);
+
+    WaveData input_wave;
+    {
+      Input ki(input_wave_file);
+      input_wave.Read(ki.Stream());
+    }
+
+    SequentialBaseFloatVectorReader filter_reader(filter_file);
+    const Vector<BaseFloat> &lpc_filter = filter_reader.Value();
+
+    Vector<BaseFloat> filtered_wav(input_wave.Data().Row(0));
+    BaseFloat samp_freq_input = input_wave.SampFreq();
+    // If inverse = false, it does FFT-based block convolution of the filter
+    // with the long input signal.
+    // Otherwise the inverse of the filter is convolved with the input signal.
+    // If we use the LP coefficients [1, -a1, -a2, ..., -ap] as the filter,
+    // convolving the input with this filter is like a whitening transform.
+    // y'[n] = y[n] - sum_{i=1}^p {input_wav[n-i] * lpc_coeffs[i]}
+    //       = conv(y, [1, -lpc_coeffs])
+    FFTbasedBlockConvolveSignals(lpc_filter, &filtered_wav, inverse);
+    Matrix<BaseFloat> filtered_wav_mat(1, filtered_wav.Dim());
+    filtered_wav_mat.CopyRowsFromVec(filtered_wav);
+    WaveData out_wave(samp_freq_input, filtered_wav_mat);
+    Output ko(output_wave_file, false);
+    out_wave.Write(ko.Stream());
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/featbin/apply-filters.cc b/src/featbin/apply-filters.cc
new file mode 100644
index 00000000000..11c6867b553
--- /dev/null
+++ b/src/featbin/apply-filters.cc
@@ -0,0 +1,96 @@
+// featbin/apply-filters.cc
+
+// Copyright 2016  Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/wave-reader.h"
+#include "feat/signal.h"
+
+namespace kaldi {
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+
+    const char *usage =
+      "Apply a filter to each input wave, as an FIR or IIR filter.\n"
+      "If --inverse=false, the filter is applied as an FIR filter,\n"
+      "and if --inverse=true, the inverse of the filter is applied as an IIR filter.\n"
+
+      "Usage: apply-filters [options...] <wav-rspecifier> "
+      "<filter-rspecifier> <wav-wspecifier>\n"
+      "e.g.\n"
+      "apply-filters --inverse=false scp:wav.scp ark:filters.ark ark:wav_out.ark\n";
+    ParseOptions po(usage);
+
+    bool inverse = false;
+
+    po.Register("inverse", &inverse,
+                "If false, the filter is applied as an FIR filter; "
+                "otherwise its inverse is applied as an IIR filter.");
+
+    po.Read(argc, argv);
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string wav_rspecifier = po.GetArg(1),
+      filter_rspecifier = po.GetArg(2),
+      out_wspecifier = po.GetArg(3);
+
+    SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
+    TableWriter<WaveHolder> wav_writer(out_wspecifier);
+    RandomAccessBaseFloatVectorReader filters(filter_rspecifier);
+    int32 num_done = 0, num_err = 0;
+    for (; !wav_reader.Done(); wav_reader.Next()) {
+      std::string utt = wav_reader.Key();
+      const WaveData &wav = wav_reader.Value();
+      if (!filters.HasKey(utt)) {
+        KALDI_WARN << "Did not find filter for utterance " << utt;
+        num_err++;
+        continue;
+      }
+      const Vector<BaseFloat> &lpc_filter = filters.Value(utt);
+      Vector<BaseFloat> filtered_wav(wav.Data().Row(0));
+      BaseFloat samp_freq_input = wav.SampFreq();
+      // If inverse = false, it does FFT-based block convolution of the filter
+      // with the long input signal.
+      // Otherwise the inverse of the filter is convolved with the input signal.
+      // If we use the LP coefficients [1, -a1, -a2, ..., -ap] as the filter,
+      // convolving the input with this filter is like a whitening transform.
+      // y'[n] = y[n] - sum_{i=1}^p {input_wav[n-i] * lpc_coeffs[i]}
+      //       = conv(y, [1, -lpc_coeffs])
+      FFTbasedBlockConvolveSignals(lpc_filter, &filtered_wav, inverse);
+      Matrix<BaseFloat> filtered_wav_mat(1, filtered_wav.Dim());
+      filtered_wav_mat.CopyRowsFromVec(filtered_wav);
+      WaveData out_wav(samp_freq_input, filtered_wav_mat);
+      wav_writer.Write(utt, out_wav);
+      num_done++;
+    }
+    KALDI_LOG << "Applied filter on "
+              << num_done << " utterances; "
+              << num_err << " had errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/featbin/compute-filter.cc b/src/featbin/compute-filter.cc
new file mode 100644
index 00000000000..4f750e418cf
--- /dev/null
+++ b/src/featbin/compute-filter.cc
@@ -0,0 +1,300 @@
+// featbin/compute-filter.cc
+
+// Copyright 2016  Pegah Ghahremani
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/feature-functions.h"
+#include "matrix/srfft.h"
+#include "matrix/kaldi-matrix-inl.h"
+#include "feat/wave-reader.h"
+
+namespace kaldi {
+
+// struct used to store the statistics required for
+// computing correlation coefficients
+struct CorrelationStats {
+  int32 corr_order;   // number of correlation coefficients, R[0],..,R[corr_order - 1]
+  int32 num_samp;     // number of samples
+  BaseFloat samp_sum; // sum of samples.
+  Vector<BaseFloat> l2_norms;   // l2_norms[j] - inner product of the shifted input with itself:
+                                // sum_{i=0}^corr_window_size x[i+j]^2
+  Vector<BaseFloat> inner_prod; // inner product of the input with its version shifted by j:
+                                // sum_{i=0}^corr_window_size x[i] * x[i+j]
+  CorrelationStats(): corr_order(100), num_samp(0), samp_sum(0) {
+    l2_norms.Resize(corr_order);
+    inner_prod.Resize(corr_order);
+  }
+
+  CorrelationStats(int32 corr_order): corr_order(corr_order),
+                                      num_samp(0), samp_sum(0) {
+    l2_norms.Resize(corr_order);
+    inner_prod.Resize(corr_order);
+  }
+};
+
+/*
+  This function computes and accumulates the statistics
+  required for computing the autocorrelation coefficients of waveform "wave",
+  e.g. the dot product of the input with its shifted versions.
+  inner_prod[j] - inner product of the input with its version shifted by j:
+    sum_{i=0}^corr_window_size x[i] * x[i+j]
+  l2_norms[j] - inner product of the shifted input with itself:
+    sum_{i=0}^corr_window_size x[i+j]^2
+  lpc_order is the size of autocorr_coeffs.
+*/
+void AccStatsForCorrelation(const VectorBase<BaseFloat> &wave,
+                            int32 lpc_order,
+                            CorrelationStats *acc_corr_stats) {
+  KALDI_ASSERT(acc_corr_stats->inner_prod.Dim() == lpc_order);
+  acc_corr_stats->samp_sum += wave.Sum();
+  acc_corr_stats->num_samp += wave.Dim();
+  int32 corr_window_size = wave.Dim() - lpc_order;
+  Vector<BaseFloat> norm_wave(wave);
+  SubVector<BaseFloat> sub_vec1(norm_wave, 0, corr_window_size);
+  BaseFloat local_l2_norm = VecVec(sub_vec1, sub_vec1), sum;
+
+  acc_corr_stats->inner_prod(0) += local_l2_norm;
+
+  for (int32 lag = 1; lag < lpc_order; lag++) {
+    SubVector<BaseFloat> sub_vec2(norm_wave, lag, corr_window_size);
+    int32 last_ind = corr_window_size + lag - 1;
+    local_l2_norm += (wave(last_ind) * wave(last_ind) -
+                      wave(lag - 1) * wave(lag - 1));
+    sum = VecVec(sub_vec1, sub_vec2);
+    acc_corr_stats->inner_prod(lag) += sum;
+    acc_corr_stats->l2_norms(lag) += local_l2_norm;
+  }
+}
+/*
+  Compute autocorrelation coefficients using the accumulated unnormalized
+  statistics, i.e. the inner products and l2 norms.
+  The inner products and l2_norms are normalized using the mean E[x].
+  autocorr[j] = sum_{i=0}^n (x[i] - E[x]) * (x[i+j] - E[x]) /
+    [(sum_{i=0}^n (x[i] - E[x])^2) * (sum_{i=0}^n (x[i+j] - E[x])^2)]^0.5
+  autocorr[j] = inner_prod[j] / (norms[0] * norms[j])^0.5
+  inner_prod[j] - inner product of the input with its version shifted by j:
+    sum_{i=0}^n x[i] * x[i+j]
+  l2_norms[j] - inner product of the shifted input with itself: sum_{i=0}^n x[i+j]^2
+*/
+void ComputeCorrelation(const CorrelationStats &acc_corr_stats,
+                        Vector<BaseFloat> *autocorr) {
+
+  KALDI_ASSERT(acc_corr_stats.inner_prod.Dim() == acc_corr_stats.l2_norms.Dim());
+
+  int32 lpc_order = acc_corr_stats.inner_prod.Dim();
+  autocorr->Resize(lpc_order);
+  for (int32 lag = 0; lag < lpc_order; lag++)
+    (*autocorr)(lag) = acc_corr_stats.inner_prod(lag);
+
+  // scale the autocorrelation between 0 and 1 using autocorr(0)
+  autocorr->Scale(1.0 / (*autocorr)(0));
+
+}
+/*
+  Durbin's recursion - converts autocorrelation coefficients to LPC coefficients.
+  pTmp - temporary storage [n]
+  pAC - autocorrelation coefficients [n + 1]
+  pLP - linear prediction coefficients [n] (predicted s[n] = sum_{i=1}^P a[i] * s[n-i])
+*/
+double DurbinInternal(int32 n, double *pAC, double *pLP, double *pTmp) {
+  double ki; // reflection coefficient
+
+  // we add this bias term to pAC[0].
+  // Adding the bias term is equivalent to T = toeplitz(pAC) + diag(bias),
+  // which shifts the eigenvalues of toeplitz(pAC) by the bias term,
+  // and so we can make sure T is invertible.
+  double durbin_bias = 1e-2;
+  int32 max_repeats = 20;
+
+  double E = pAC[0];
+  pLP[0] = 1.0;
+  for (int32 i = 1; i <= n; i++) {
+    // next reflection coefficient
+    ki = pAC[i];
+    for (int32 j = 1; j < i; j++)
+      ki += pLP[j] * pAC[i - j];
+    ki = ki / E;
+
+    if (std::abs(ki) > 1) {
+      int32 num_repeats = int((pAC[0] - 1.0) / durbin_bias);
+      KALDI_WARN << "In Durbin algorithm, abs(ki) > 1 "
+                 << " for iteration = " << i
+                 << " ki = " << ki
+                 << " autocorr[0] = " << pAC[0]
+                 << " num_repeats = " << num_repeats
+                 << "; the bias is added";
+      pAC[0] += durbin_bias;
+      if (num_repeats < max_repeats)
+        return -1;
+    }
+    // new error
+    double c = 1 - ki * ki;
+    if (c < 1.0e-5) // remove NaNs for a constant signal
+      c = 1.0e-5;
+
+    E *= c;
+    // new LP coefficients
+    pTmp[i] = -ki;
+    for (int32 j = 1; j < i; j++)
+      pTmp[j] = pLP[j] - ki * pLP[i - j];
+
+    for (int32 j = 1; j <= i; j++)
+      pLP[j] = pTmp[j];
+  }
+  return E;
+}
+/*
+  This function computes the coefficients of the forward linear predictor
+  from the autocorrelation coefficients by minimizing the prediction
+  error in the MSE sense.
+  Durbin's recursion is used to compute the LP coefficients from the
+  autocorrelation coefficients via the normal equations
+    R(j) = sum_{i=1}^P a[i] * R(j - i),   j = 1, ..., P
+  where P is the order of the linear prediction and
+    lp_filter = [1, -a[1], -a[2], ..., -a[P]],
+  with a[i] the linear prediction coefficients
+  (predicted_x[n] = sum_{i=1}^P a[i] * x[n-i]), so
+    x[n] - predicted_x[n] = sum_{i=0}^P lp_filter[i] * x[n-i] = conv(x, lp_filter)[n].
+  R(j) is the j-th autocorrelation coefficient.
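+
+  For reference, one step of Durbin's recursion (order i, given the previous
+  error E_{i-1} and coefficients a^{(i-1)}) is:
+    k_i = (R(i) + sum_{j=1}^{i-1} a_j^{(i-1)} R(i-j)) / E_{i-1}
+    a_i^{(i)} = -k_i;  a_j^{(i)} = a_j^{(i-1)} - k_i * a_{i-j}^{(i-1)} for j < i
+    E_i = (1 - k_i^2) E_{i-1}
+  which matches DurbinInternal above, with pLP holding the sign-flipped
+  coefficients and pLP[0] = 1.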
+*/
+void ComputeFilters(const VectorBase<BaseFloat> &autocorr,
+                    Vector<BaseFloat> *lp_filter) {
+  int32 n = autocorr.Dim();
+  lp_filter->Resize(n);
+  // compute the LPC coefficients from the autocorrelation coefficients
+  // with Durbin's algorithm
+  Vector<double> d_autocorr(autocorr),
+    d_lpc_coeffs(n), d_tmp(n);
+
+  KALDI_LOG << "compute lpc using correlations";
+  while (DurbinInternal(n, d_autocorr.Data(),
+                        d_lpc_coeffs.Data(),
+                        d_tmp.Data()) < 0);
+  lp_filter->CopyFromVec(d_lpc_coeffs);
+  if (KALDI_ISNAN(lp_filter->Sum())) {
+    KALDI_WARN << "NaN encountered in lpc coefficients derived from Durbin algorithm.";
+    lp_filter->Set(0.0);
+    (*lp_filter)(0) = 1.0;
+  }
+
+}
+
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using kaldi::int32;
+
+    const char *usage =
+      "Computes LP coefficients per speaker, by minimizing the "
+      "prediction error in the MSE sense.\n"
+      "These coefficients contain speaker-dependent information for each speaker.\n"
+
+      "Usage: compute-filter [options] <wav-rspecifier> <filter-wspecifier>\n"
+      "e.g.: compute-filter "
+      " scp:data/train/wav.scp ark,scp:filter.ark,filter.scp\n";
+
+    ParseOptions po(usage);
+    std::string spk2utt_rspecifier;
+    bool binary = true;
+    int32 channel = -1,
+      lpc_order = 100;
+    po.Register("binary", &binary, "write in binary mode (applies only to global filters)");
+    po.Register("lpc-order", &lpc_order, "number of LP coefficients used to extract filters.");
+    // spk2utt and channel are used below, so they need to be registered
+    // for the per-speaker branch and channel selection to be reachable.
+    po.Register("spk2utt", &spk2utt_rspecifier,
+                "rspecifier for the speaker to utterance-list map; if supplied, "
+                "one filter is accumulated per speaker over all its utterances.");
+    po.Register("channel", &channel,
+                "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+    int32 num_done = 0, num_err = 0;
+    std::string wav_rspecifier = po.GetArg(1),
+      wspecifier = po.GetArg(2);
+
+    BaseFloatVectorWriter writer(wspecifier);
+    if (spk2utt_rspecifier != "") {
+      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
+      RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
+      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
+        std::string spk = spk2utt_reader.Key();
+        const std::vector<std::string> &uttlist = spk2utt_reader.Value();
+        CorrelationStats acc_corr_stats(lpc_order);
+        for (size_t i = 0; i < uttlist.size(); i++) {
+          std::string utt = uttlist[i];
+          if (!wav_reader.HasKey(utt)) {
+            KALDI_WARN << "Did not find wave for utterance " << utt;
+            num_err++;
+            continue;
+          }
+          const WaveData &wav_data = wav_reader.Value(utt);
+          int32 num_chan = wav_data.Data().NumRows(), this_chan = channel;
+          KALDI_ASSERT(num_chan > 0);
+          if (channel == -1) {
+            this_chan = 0;
+            if (num_chan != 1)
+              KALDI_WARN << "Channel not specified but you have data with "
+                         << num_chan << " channels; defaulting to zero";
+          } else {
+            if (this_chan >= num_chan) {
+              KALDI_WARN << "File with id " << spk << " has "
+                         << num_chan << " channels but you specified channel "
+                         << channel << ", producing no output.";
+              num_err++;
+              continue;
+            }
+          }
+          Vector<BaseFloat> waveform(wav_data.Data().Row(this_chan));
+          waveform.Scale(1.0 / (1 << 15));
+          AccStatsForCorrelation(waveform, lpc_order,
+                                 &acc_corr_stats);
+        }
+        Vector<BaseFloat> filter, autocorr(lpc_order);
+        ComputeCorrelation(acc_corr_stats,
+                           &autocorr);
+        ComputeFilters(autocorr, &filter);
+        writer.Write(spk, filter);
+        num_done++;
+      }
+    } else { // assume the input waveform is per-speaker.
+      SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
+      for (; !wav_reader.Done(); wav_reader.Next()) {
+        std::string spk = wav_reader.Key();
+        const WaveData &wave_data = wav_reader.Value();
+        int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
+        KALDI_ASSERT(num_chan > 0);
+        if (channel == -1) {
+          this_chan = 0;
+          if (num_chan != 1)
+            KALDI_WARN << "Channel not specified but you have data with "
+                       << num_chan << " channels; defaulting to zero";
+        } else {
+          if (this_chan >= num_chan) {
+            KALDI_WARN << "File with id " << spk << " has "
+                       << num_chan << " channels but you specified channel "
+                       << channel << ", producing no output.";
+            num_err++;
+            continue;
+          }
+        }
+        Vector<BaseFloat> waveform(wave_data.Data().Row(this_chan));
+        Vector<BaseFloat> autocorr, filter;
+        waveform.Scale(1.0 / (1 << 15));
+        KALDI_ASSERT(waveform.Max() <= 1 && waveform.Min() >= -1);
+        CorrelationStats acc_corr_stats(lpc_order);
+
+        AccStatsForCorrelation(waveform, lpc_order,
+                               &acc_corr_stats);
+        ComputeCorrelation(acc_corr_stats,
+                           &autocorr);
+        //KALDI_LOG << "autocorr = " << autocorr;
+        ComputeFilters(autocorr, &filter);
+        writer.Write(spk, filter);
+        num_done++;
+      }
+    }
+    KALDI_LOG << "Done " << num_done << " speakers, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
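+// End-to-end usage sketch (hypothetical paths; see
+// egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh for the real invocation;
+// note apply-filter reads the first filter in the supplied archive):
+//   compute-filter --lpc-order=100 scp:data/train/wav.scp \
+//     ark,scp:filters.ark,filters.scp
+//   apply-filter --inverse=false "scp:filters.scp" - - <in.wav >whitened.wav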