diff --git a/egs/gop/README.md b/egs/gop/README.md new file mode 100644 index 00000000000..d95f4e966fd --- /dev/null +++ b/egs/gop/README.md @@ -0,0 +1,98 @@ +There is a copy of this document on Google Docs, which renders the equations better: +[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing) + +* * * + +# GOP on Kaldi + +The Goodness of Pronunciation (GOP) is a variation of the posterior probability, for phone level pronunciation scoring. +GOP is widely used in pronunciation evaluation and mispronunciation detection tasks. + +This implementation is mainly based on the following paper: + +Hu, W., Qian, Y., Soong, F. K., & Wang, Y. (2015). Improved mispronunciation detection with deep neural network trained acoustic models and transfer learning based logistic regression classifiers. Speech Communication, 67(January), 154-166. + +## GOP-GMM + +In the conventional GMM-HMM based system, GOP was first proposed in (Witt et al., 2000). It was defined as the duration normalised log of the posterior: + +$$ +GOP(p)=\frac{1}{t_e-t_s+1} \log p(p|\mathbf o) +$$ + +where $\mathbf o$ is the input observations, $p$ is the canonical phone, $t_s, t_e$ are the start and end frame indexes. + +Assuming $p(q_i)\approx p(q_j)$ for any $q_i, q_j$, we have: + +$$ +\log p(p|\mathbf o)=\frac{p(\mathbf o|p)p(p)}{\sum_{q\in Q} p(\mathbf o|q)p(q)} + \approx\frac{p(\mathbf o|p)}{\sum_{q\in Q} p(\mathbf o|q)} +$$ + +where $Q$ is the whole phone set. + +The numerator of the equation is calculated from forced alignment result and the denominator is calculated from an Viterbi decoding with a unconstrained phone loop. + +We do not implement GOP-GMM for Kaldi, as GOP-NN performs much better than GOP-GMM. + +## GOP-NN + +The definition of GOP-NN is a bit different from the GOP-GMM. GOP-NN was defined as the log phone posterior ratio between the canonical phone and the one with the highest score (Hu et al., 2015). 
+ +Firstly we define Log Phone Posterior (LPP): + +$$ +LPP(p)=\log p(p|\mathbf o; t_s,t_e) +$$ + +Then we define the GOP-NN using LPP: + +$$ +GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} +$$ + +LPP could be calculated as: + +$$ +LPP(p) \approx \frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) +$$ + +$$ +p(p|o_t) = \sum_{s \in p} p(s|o_t) +$$ + +where $s$ is the senone label, $\{s|s \in p\}$ is the states belonging to those triphones whose current phone is $p$. + +## Phone-level Feature + +Normally the classifier-based approach achieves better performance than the GOP-based approach. + +Different from the GOP-based method, an extra supervised training process is needed. The input features for supervised training are phone-level, segmental features. The phone-level feature is defined as: + +$$ +{[LPP(p_1),\cdots,LPP(p_M), LPR(p_1|p_i), \cdots, LPR(p_j|p_i),\cdots]}^T +$$ + +where the Log Posterior Ratio (LPR) between phone $p_j$ and $p_i$ is defined as: + +$$ +LPR(p_j|p_i) = \log p(p_j|\mathbf o; t_s, t_e) - \log p(p_i|\mathbf o; t_s, t_e) +$$ + +## Implementation + +This implementation consists of an executable binary `bin/compute-gop` and some scripts. + +`compute-gop` computes GOP and extracts phone-level features using nnet output probabilities. +The output probabilities are assumed to be from a log-softmax layer. + +The script `run.sh` shows a typical pipeline based on librispeech's model and data. + +In Hu's paper, GOP was computed using a feed-forward DNN. +We have tried to use the output-xent of a chain model to compute GOP, but the result was not good. +We suspect that the HMM topology of the chain model may not be well suited to GOP. + +The nnet3's TDNN (no chain) model performs well in GOP computing, so this recipe uses it. + +## Acknowledgement +The author of this recipe would like to thank Xingyu Na for his work on model tuning and his helpful suggestions. 
diff --git a/egs/gop/s5/cmd.sh b/egs/gop/s5/cmd.sh new file mode 100644 index 00000000000..9139633e57a --- /dev/null +++ b/egs/gop/s5/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="run.pl" diff --git a/egs/gop/s5/local/make_testcase.sh b/egs/gop/s5/local/make_testcase.sh new file mode 100755 index 00000000000..884563066b1 --- /dev/null +++ b/egs/gop/s5/local/make_testcase.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +src=$1 +dst=$2 + +# Select a very small set for testing +utils/subset_data_dir.sh --shortest $src 10 $dst + +# make fake transcripts as negative examples +cp $dst/text $dst/text.ori +sed -i "s/ THERE / THOSE /" $dst/text +sed -i "s/ IN / ON /" $dst/text diff --git a/egs/gop/s5/local/remove_phone_markers.pl b/egs/gop/s5/local/remove_phone_markers.pl new file mode 100755 index 00000000000..16236a749cf --- /dev/null +++ b/egs/gop/s5/local/remove_phone_markers.pl @@ -0,0 +1,72 @@ +#!/usr/bin/env perl +# Copyright 2019 Junbo Zhang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; + +my $Usage = <new phone mapping file, in which each line is: "old-integer-id new-integer-id. + +Usage: utils/remove_phone_markers.pl + e.g.: utils/remove_phone_markers.pl phones.txt phones-pure.txt phone-to-pure-phone.int +EOU + +if (@ARGV < 3) { + die $Usage; +} + +my $old_phone_symbols_filename = shift @ARGV; +my $new_phone_symbols_filename = shift @ARGV; +my $mapping_filename = shift @ARGV; + +my %id_of_old_phone; +open(IN, $old_phone_symbols_filename) or die "Can't open $old_phone_symbols_filename"; +while () { + chomp; + my ($phone, $id) = split; + next if $phone =~ /\#/; + $id_of_old_phone{$phone} = $id; +} +close IN; + +my $new_id = 0; +my %id_of_new_phone; +my %id_old_to_new; +foreach (sort { $id_of_old_phone{$a} <=> $id_of_old_phone{$b} } keys %id_of_old_phone) { + my $old_phone = $_; + s/_[BIES]//; + s/\d//; + my $new_phone = $_; + $id_of_new_phone{$new_phone} = $new_id++ if not exists $id_of_new_phone{$new_phone}; + $id_old_to_new{$id_of_old_phone{$old_phone}} = $id_of_new_phone{$new_phone}; +} + +# Write to file +open(OUT, ">$new_phone_symbols_filename") or die "Can\'t write to $new_phone_symbols_filename"; +foreach (sort { $id_of_new_phone{$a} <=> $id_of_new_phone{$b} } keys %id_of_new_phone) { + print OUT "$_\t$id_of_new_phone{$_}\n"; +} +close OUT; + +open(OUT, ">$mapping_filename") or die "Can\'t write to $mapping_filename"; +foreach (sort { $a <=> $b } keys %id_old_to_new) { + next if $_ == 0; + print OUT "$_ $id_old_to_new{$_}\n"; +} +close OUT; 
diff --git a/egs/gop/s5/path.sh b/egs/gop/s5/path.sh new file mode 100755 index 00000000000..03df6dd9f2b --- /dev/null +++ b/egs/gop/s5/path.sh @@ -0,0 +1,27 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + +# we use this both in the (optional) LM training and the G2P-related scripts +PYTHON='python2.7' + +### Below are the paths used by the optional parts of the recipe + +# We only need the Festival stuff below for the optional text normalization(for LM-training) step +FEST_ROOT=tools/festival +NSW_PATH=${FEST_ROOT}/festival/bin:${FEST_ROOT}/nsw/bin +export PATH=$PATH:$NSW_PATH + +# SRILM is needed for LM model building +SRILM_ROOT=$KALDI_ROOT/tools/srilm +SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64 +export PATH=$PATH:$SRILM_PATH + +# Sequitur G2P executable +sequitur=$KALDI_ROOT/tools/sequitur/g2p.py +sequitur_path="$(dirname $sequitur)/lib/$PYTHON/site-packages" + +# Directory under which the LM training corpus should be extracted +LM_CORPUS_ROOT=./lm-corpus diff --git a/egs/gop/s5/run.sh b/egs/gop/s5/run.sh new file mode 100755 index 00000000000..a731b913552 --- /dev/null +++ b/egs/gop/s5/run.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2019 Junbo Zhang +# Apache 2.0 + +# This script shows how to calculate Goodness of Pronunciation (GOP) and +# extract phone-level pronunciation feature for mispronunciations detection +# tasks. Read ../README.md or the following paper for details: +# +# "Hu et al., Improved mispronunciation detection with deep neural network +# trained acoustic models and transfer learning based logistic regression +# classifiers, 2015." + +# You might not want to do this for interactive shells. 
+set -e + +# Before running this recipe, you have to run the librispeech recipe firstly. +# This script assumes the following paths exist. +librispeech_eg=../../librispeech/s5 +model=$librispeech_eg/exp/nnet3_cleaned/tdnn_sp +ivector=$librispeech_eg/exp/nnet3_cleaned/ivectors_test_clean_hires +lang=$librispeech_eg/data/lang +test_data=$librispeech_eg/data/test_clean_hires + +for d in $model $ivector $lang $test_data; do + [ ! -d $d ] && echo "$0: no such path $d" && exit 1; +done + +# Global configurations +stage=0 +nj=4 + +data=test_10short +dir=exp/gop_$data + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +if [ $stage -le 0 ]; then + # Prepare test data + [ -d data ] || mkdir -p data/$data + local/make_testcase.sh $test_data data/$data +fi + +if [ $stage -le 1 ]; then + # Compute Log-likelihoods + steps/nnet3/compute_output.sh --cmd "$cmd" --nj $nj \ + --online-ivector-dir $ivector data/$data $model exp/probs_$data +fi + +if [ $stage -le 2 ]; then + steps/nnet3/align.sh --cmd "$cmd" --nj $nj --use_gpu false \ + --online_ivector_dir $ivector data/$data $lang $model $dir +fi + +if [ $stage -le 3 ]; then + # make a map which converts phones to "pure-phones" + # "pure-phone" means the phone whose stress and pos-in-word markers are ignored + # eg. AE1_B --> AE, EH2_S --> EH, SIL --> SIL + local/remove_phone_markers.pl $lang/phones.txt $dir/phones-pure.txt \ + $dir/phone-to-pure-phone.int + + # Convert transition-id to pure-phone id + $cmd JOB=1:$nj $dir/log/ali_to_phones.JOB.log \ + ali-to-phones --per-frame=true $model/final.mdl "ark,t:gunzip -c $dir/ali.JOB.gz|" \ + "ark,t:-" \| utils/apply_map.pl -f 2- $dir/phone-to-pure-phone.int \| \ + gzip -c \>$dir/ali-pure-phone.JOB.gz || exit 1; +fi + +if [ $stage -le 4 ]; then + # The outputs of the binary compute-gop are the GOPs and the phone-level features. 
+ # +# An example of the GOP result (extracted from "ark,t:$dir/gop.3.txt"): +# 4446-2273-0031 [ 1 0 ] [ 12 0 ] [ 27 -5.382001 ] [ 40 -13.91807 ] [ 1 -0.2555897 ] \ +# [ 21 -0.2897284 ] [ 5 0 ] [ 31 0 ] [ 33 0 ] [ 3 -11.43557 ] [ 25 0 ] \ +# [ 16 0 ] [ 30 -0.03224623 ] [ 5 0 ] [ 25 0 ] [ 33 0 ] [ 1 0 ] +# It is in the posterior format, where each pair stands for [pure-phone-index gop-value]. +# For example, [ 27 -5.382001 ] means the GOP of the pure-phone 27 (it corresponds to the +# phone "OW", according to "$dir/phones-pure.txt") is -5.382001, indicating the audio +# segment of this phone should be a mispronunciation. +# +# The phone-level features are in matrix format: +# 4446-2273-0031 [ -0.2462088 -10.20292 -11.35369 ... +# -8.584108 -7.629755 -13.04877 ... +# ... +# ... ] +# The row number is the phone number of the utterance. In this case, it is 17. +# The column number is 2 * (pure-phone set size), as the feature consists of LPP + LPR. +# The phone-level features can be used to train a classifier with human labels. See Hu's +# paper for details. + $cmd JOB=1:$nj $dir/log/compute_gop.JOB.log \ + compute-gop --phone-map=$dir/phone-to-pure-phone.int $model/final.mdl \ + "ark,t:gunzip -c $dir/ali-pure-phone.JOB.gz|" \ + "ark:exp/probs_$data/output.JOB.ark" \ + "ark,t:$dir/gop.JOB.txt" "ark,t:$dir/phonefeat.JOB.txt" || exit 1; + echo "Done compute-gop, the results: \"$dir/gop..txt\" in posterior format." + + # We set -5 as a universal empirical threshold here. You can also determine multiple phone + # dependent thresholds based on the human-labeled mispronunciation data. + echo "The phones whose gop values less than -5 could be treated as mispronunciations." 
+fi diff --git a/egs/gop/s5/steps b/egs/gop/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/gop/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/gop/s5/utils b/egs/gop/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/gop/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/RESULTS b/egs/librispeech/s5/RESULTS index b45271765bc..dbf54b9384d 100644 --- a/egs/librispeech/s5/RESULTS +++ b/egs/librispeech/s5/RESULTS @@ -1,6 +1,6 @@ # In the results below, "tgsmall" is the pruned 3-gram LM, which is used for lattice generation. # The following language models are then used for rescoring: -# a) tgmed- slightly less pruned 3-gram LM +# a) tgmed- slightly less pruned 3-gram LM # b) tglarge- the full, non-pruned 3-gram LM # c) fglarge- non-pruned 4-gram LM # @@ -337,7 +337,7 @@ %WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14 %WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17 %WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15 -%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 +%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 %WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17 %WER 6.05 [ 3291 / 54402, 384 ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14 %WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] 
exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15 @@ -423,7 +423,7 @@ %WER 17.64 [ 9231 / 52343, 764 ins, 1662 del, 6805 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgsmall_utt_offline/wer_14 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed data) # num_params=19.3M %WER 4.43 [ 2410 / 54402, 306 ins, 278 del, 1826 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 @@ -444,7 +444,7 @@ %WER 16.29 [ 8528 / 52343, 828 ins, 1320 del, 6380 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed and volumn-perturbed "cleaned" data) # num_params=19.3M, average training time=68.8s per job(on Tesla K80), real-time factor=1.23161 # for x in exp/nnet3_cleaned/tdnn_sp/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -465,6 +465,24 @@ %WER 14.78 [ 7737 / 52343, 807 ins, 1115 del, 5815 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_15_0.0 %WER 16.28 [ 8521 / 52343, 843 ins, 1258 del, 6420 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 +# Results with nnet3 tdnn with new configs, a.k.a. 
xconfig +# local/nnet3/run_tdnn.sh (linked to local/nnet3/tuning/run_tdnn_1b.sh) +%WER 4.60 [ 2502 / 54402, 324 ins, 286 del, 1892 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.80 [ 2612 / 54402, 350 ins, 285 del, 1977 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tglarge/wer_11_1.0 +%WER 5.97 [ 3248 / 54402, 460 ins, 310 del, 2478 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgmed/wer_11_0.0 +%WER 6.66 [ 3625 / 54402, 479 ins, 392 del, 2754 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgsmall/wer_11_0.0 +%WER 12.29 [ 6262 / 50948, 863 ins, 665 del, 4734 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_fglarge/wer_15_0.0 +%WER 12.89 [ 6565 / 50948, 773 ins, 853 del, 4939 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tglarge/wer_14_0.5 +%WER 15.41 [ 7849 / 50948, 894 ins, 1083 del, 5872 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgmed/wer_15_0.0 +%WER 16.81 [ 8562 / 50948, 896 ins, 1215 del, 6451 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgsmall/wer_14_0.0 +%WER 4.99 [ 2624 / 52576, 393 ins, 253 del, 1978 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_fglarge/wer_13_0.5 +%WER 5.16 [ 2715 / 52576, 359 ins, 319 del, 2037 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tglarge/wer_12_1.0 +%WER 6.29 [ 3307 / 52576, 471 ins, 341 del, 2495 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgmed/wer_12_0.0 +%WER 7.13 [ 3750 / 52576, 473 ins, 452 del, 2825 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.73 [ 6665 / 52343, 894 ins, 711 del, 5060 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_fglarge/wer_14_0.0 +%WER 13.33 [ 6979 / 52343, 920 ins, 796 del, 5263 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tglarge/wer_14_0.0 +%WER 15.90 [ 8323 / 52343, 921 ins, 1126 del, 6276 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 17.28 [ 9044 / 52343, 894 ins, 1372 del, 6778 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with 
nnet3 tdnn+sMBR # local/nnet3/run_tdnn_discriminative.sh diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 28ee2b92004..00000000000 --- a/egs/librispeech/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# without cleanup: -# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & - - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -decode_nj=30 -train_set=train_960_cleaned -gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. -nnet3_affix=_cleaned - -# Options which are not passed through to run_ivector_common.sh -affix= -train_stage=-10 -common_egs_dir= -reporting_email= -remove_egs=true - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! 
cuda-compiled; then - cat </dev/null || true - for test in test_clean test_other dev_clean dev_other; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ - ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - -exit 0; diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..28ee2b92004 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. 
+ +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat </dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..a96a1b33e6c --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# 1b is as 1a but uses xconfigs. + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. 
+ +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn0 dim=1280 + relu-batchnorm-layer name=tdnn1 dim=1280 input=Append(-1,2) + relu-batchnorm-layer name=tdnn2 dim=1280 input=Append(-3,3) + relu-batchnorm-layer name=tdnn3 dim=1280 input=Append(-7,2) + relu-batchnorm-layer name=tdnn4 dim=1280 + output-layer name=output input=tdnn4 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs || exit 1; +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --feat-dir=$train_data_dir \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # this does offline decoding that should give about the same results as the + # real online decoding (the one with --per-utt true) + rm $dir/.error 2>/dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/src/bin/Makefile b/src/bin/Makefile index 7cb01b50120..bfb037fc792 
100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -22,7 +22,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info \ vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \ transform-vec align-text matrix-dim post-to-smat compile-graph \ - compare-int-vector + compare-int-vector compute-gop OBJFILES = diff --git a/src/bin/compute-gop.cc b/src/bin/compute-gop.cc new file mode 100644 index 00000000000..63b42212ee7 --- /dev/null +++ b/src/bin/compute-gop.cc @@ -0,0 +1,227 @@ +// bin/compute-gop.cc + +// Copyright 2019 Junbo Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + This code computes Goodness of Pronunciation (GOP) and extracts phone-level + pronunciation feature for mispronunciations detection tasks, the reference: + + "Improved mispronunciation detection with deep neural network trained acoustic + models and transfer learning based logistic regression classifiers" + by Hu et al., Speech Comunication, 2015. + + GOP is widely used to detect mispronunciations. The DNN-based GOP was defined + as the log phone posterior ratio between the canonical phone and the one with + the highest score. 
+ + To compute GOP, we need to compute Log Phone Posterior (LPP): + LPP(p) = \log p(p|\mathbf o; t_s,t_e) + where {\mathbf o} is the input observations, p is the canonical phone, + {t_s, t_e} are the start and end frame indexes. + + LPP could be calculated as the average of the frame-level LPP, i.e. p(p|o_t): + LPP(p) = \frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) + p(p|o_t) = \sum_{s \in p} p(s|o_t) + where s is the senone label, {s|s \in p} is the states belonging to those + triphones whose current phone is p. + + GOP is extracted from LPP: + GOP(p) = \log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} + + An array of a phone-level feature for each phone is extracted as well, which + could be used to train a classifier to detect mispronunciations. Normally the + classifier-based approach archives better performance than the GOP-based approach. + + The phone-level feature is defined as: + {[LPP(p_1),\cdots,LPP(p_M), LPR(p_1|p_i), \cdots, LPR(p_j|p_i),\cdots]}^T + + where the Log Posterior Ratio (LPR) between phone p_j and p_i is defined as: + LPR(p_j|p_i) = \log p(p_j|\mathbf o; t_s, t_e) - \log p(p_i|\mathbf o; t_s, t_e) + */ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/hmm-utils.h" +#include "hmm/tree-accu.h" +#include "hmm/posterior.h" + +namespace kaldi { + +/** FrameLevelLpp compute a log posterior for pure-phones by sum the posterior + of the states belonging to those triphones whose current phone is the canonical + phone: + + p(p|o_t) = \sum_{s \in p} p(s|o_t), + + where s is the senone label, {s|s \in p} is the states belonging to those + riphones whose current phone is the canonical phone p. + + */ +void FrameLevelLpp(const SubVector &prob_row, + const std::vector > &pdf2phones, + const std::vector *phone_map, + Vector *out_frame_level_lpp) { + for (int32 i = 0; i < prob_row.Dim(); i++) { + std::set dest_idxs; + for (int32 ph : pdf2phones.at(i)) { + dest_idxs.insert((phone_map != NULL) ? 
(*phone_map)[ph] - 1 : ph - 1); + } + + for (int32 idx : dest_idxs) { + KALDI_ASSERT(idx < out_frame_level_lpp->Dim()); + (*out_frame_level_lpp)(idx) += prob_row(i); + } + } + out_frame_level_lpp->ApplyLog(); +} + +} // namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + try { + const char *usage = + "Compute Goodness Of Pronunciation (GOP) from a matrix of " + "probabilities (e.g. from nnet3-compute).\n" + "Usage: compute-gop [options] " + " " + "[]\n" + "e.g.:\n" + " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-" + " ark:gop.1 ark:phone-feat.1\n"; + + ParseOptions po(usage); + + bool log_applied = true; + std::string phone_map_rxfilename; + + po.Register("log-applied", &log_applied, + "If true, assume the input probabilities have been applied log."); + po.Register("phone-map", &phone_map_rxfilename, + "File name containing old->new phone mapping (each line is: " + "old-integer-id new-integer-id)"); + + po.Read(argc, argv); + + if (po.NumArgs() != 4 && po.NumArgs() != 5) { + po.PrintUsage(); + exit(1); + } + + std::string model_filename = po.GetArg(1), + alignments_rspecifier = po.GetArg(2), + prob_rspecifier = po.GetArg(3), + gop_wspecifier = po.GetArg(4), + feat_wspecifier = po.GetArg(5); + + TransitionModel trans_model; + { + bool binary; + Input ki(model_filename, &binary); + trans_model.Read(ki.Stream(), binary); + } + std::vector > pdf2phones; + GetPdfToPhonesMap(trans_model, &pdf2phones); + int32 phone_num = trans_model.NumPhones(); + + std::vector phone_map; + if (phone_map_rxfilename != "") { + ReadPhoneMap(phone_map_rxfilename, &phone_map); + phone_num = phone_map[phone_map.size() - 1]; + } + + RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier); + SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier); + PosteriorWriter gop_writer(gop_wspecifier); + BaseFloatMatrixWriter feat_writer(feat_wspecifier); + + int32 num_done = 0; + for (; !prob_reader.Done(); 
prob_reader.Next()) { + std::string key = prob_reader.Key(); + auto alignment = alignment_reader.Value(key); + Matrix &probs = prob_reader.Value(); + if (log_applied) probs.ApplyExp(); + + int32 frame_num = alignment.size(); + if (alignment.size() != probs.NumRows()) { + KALDI_WARN << "The frame numbers of alignment and prob are not equal."; + if (frame_num > probs.NumRows()) frame_num = probs.NumRows(); + } + + KALDI_ASSERT(frame_num > 0); + int32 cur_phone_id = alignment[0] - 1; // start by 0, skipping + int32 duration = 0; + Vector phone_level_feat(phone_num * 2); // LPPs and LPRs + SubVector lpp_part(phone_level_feat, 0, phone_num); + std::vector > phone_level_feat_stdvector; + Posterior posterior_gop; + for (int32 i = 0; i < frame_num; i++) { + // Calculate LPP and LPR for each pure-phone + Vector frame_level_lpp(phone_num); + FrameLevelLpp(probs.Row(i), pdf2phones, + (phone_map_rxfilename != "") ? &phone_map : NULL, + &frame_level_lpp); + + // LPP(p)=\frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) + lpp_part.AddVec(1, frame_level_lpp); + duration++; + + int32 next_phone_id = (i < frame_num - 1) ? 
alignment[i + 1] - 1: -1; + if (next_phone_id != cur_phone_id) { + // The current phone's feature have been ready + lpp_part.Scale(1.0 / duration); + + // LPR(p_j|p_i)=\log p(p_j|\mathbf o; t_s, t_e)-\log p(p_i|\mathbf o; t_s, t_e) + for (int k = 0; k < phone_num; k++) + phone_level_feat(phone_num + k) = lpp_part(cur_phone_id) - lpp_part(k); + phone_level_feat_stdvector.push_back(phone_level_feat); + + // Compute GOP from LPP + // GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} + BaseFloat gop = lpp_part(cur_phone_id) - lpp_part.Max(); + std::vector > posterior_item; + posterior_item.push_back(std::make_pair(cur_phone_id + 1, gop)); + posterior_gop.push_back(posterior_item); + + // Reset + phone_level_feat.Set(0); + duration = 0; + } + cur_phone_id = next_phone_id; + } + + // Write GOPs and the phone-level features + Matrix feats(phone_level_feat_stdvector.size(), phone_num * 2); + for (int32 i = 0; i < phone_level_feat_stdvector.size(); i++) { + SubVector row(feats, i); + row.AddVec(1.0, phone_level_feat_stdvector[i]); + } + feat_writer.Write(key, feats); + gop_writer.Write(key, posterior_gop); + num_done++; + } + + KALDI_LOG << "Processed " << num_done << " prob matrices."; + return (num_done != 0 ? 
0 : 1); + } catch (const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index 06edf8d5976..15a1edfd255 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -1289,5 +1289,16 @@ void ChangeReorderingOfAlignment(const TransitionModel &trans_model, } } +void GetPdfToPhonesMap(const TransitionModel &trans_model, + std::vector > *pdf2phones) { + pdf2phones->clear(); + pdf2phones->resize(trans_model.NumPdfs()); + for (int32 i = 0; i < trans_model.NumTransitionIds(); i++) { + int32 trans_id = i + 1; + int32 pdf_id = trans_model.TransitionIdToPdf(trans_id); + int32 phone = trans_model.TransitionIdToPhone(trans_id); + (*pdf2phones)[pdf_id].insert(phone); + } +} } // namespace kaldi diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h index a8ad846949e..4415927df4e 100644 --- a/src/hmm/hmm-utils.h +++ b/src/hmm/hmm-utils.h @@ -329,6 +329,12 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep, void ChangeReorderingOfAlignment(const TransitionModel &trans_model, std::vector *alignment); + +// GetPdfToPhonesMap creates a map which maps each pdf-id into its +// corresponding monophones. +void GetPdfToPhonesMap(const TransitionModel &trans_model, + std::vector > *pdf2phones); + /// @} end "addtogroup hmm_group" } // end namespace kaldi