From d6dfe9e64b88d4b1a0033a16a5bcdee00583a7ce Mon Sep 17 00:00:00 2001 From: Hang Lyu Date: Tue, 10 Jan 2017 01:01:49 -0500 Subject: [PATCH] Remove sinusoid detection code and old SGMM code/scripts fix delete the commented-out invocations about sgmm and some others --- egs/aurora4/s5/local/run_sgmm.sh | 113 - egs/babel/s5/local/decode_helper.sh | 9 - egs/babel/s5b/local/decode_helper.sh | 9 - egs/babel/s5c/local/decode_helper.sh | 9 - egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh | 359 --- egs/gp/s1/path.sh | 2 +- egs/gp/s1/steps/decode_sgmm_deltas.sh | 162 -- egs/gp/s1/steps/train_sgmm_deltas.sh | 270 --- egs/gp/s5/path.sh | 2 +- egs/gp/s5/run.sh | 6 +- egs/lre07/v2/path.sh | 2 +- egs/rm/s5/local/run_pitch.sh | 1 - egs/rm/s5/local/run_sgmm.sh | 39 - egs/rm/s5/run.sh | 3 +- egs/sprakbanken/s5/local/run_sgmm.sh | 112 - egs/swbd/s5/local/run_sgmm.sh | 38 - egs/swbd/s5/run.sh | 1 - egs/swbd/s5/run_edin.sh | 2 +- egs/swbd/s5b/local/run_sgmm.sh | 38 - egs/vystadial_cz/online_demo/path.sh | 2 +- egs/wsj/s5/local/run_sgmm.sh | 112 - egs/wsj/s5/run.sh | 3 - egs/wsj/s5/steps/align_sgmm.sh | 198 -- egs/wsj/s5/steps/align_sgmm2.sh | 4 +- egs/wsj/s5/steps/decode_sgmm.sh | 266 -- egs/wsj/s5/steps/decode_sgmm2_fromlats.sh | 2 +- egs/wsj/s5/steps/decode_sgmm2_rescore.sh | 4 +- egs/wsj/s5/steps/decode_sgmm_fromlats.sh | 277 --- egs/wsj/s5/steps/decode_sgmm_rescore.sh | 108 - egs/wsj/s5/steps/make_denlats_sgmm.sh | 189 -- egs/wsj/s5/steps/tandem/align_sgmm.sh | 236 -- egs/wsj/s5/steps/tandem/decode_sgmm.sh | 303 --- egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh | 199 -- egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh | 193 -- egs/wsj/s5/steps/tandem/train_sgmm.sh | 315 --- egs/wsj/s5/steps/train_mmi_sgmm.sh | 156 -- egs/wsj/s5/steps/train_sgmm.sh | 280 --- src/Doxyfile | 4 +- src/Makefile | 19 +- src/decoder/Makefile | 2 +- src/doc/kaldi_for_dummies.dox | 2 +- src/feat/Makefile | 7 +- src/feat/sinusoid-detection-test.cc | 452 ---- src/feat/sinusoid-detection.cc | 945 -------- src/feat/sinusoid-detection.h | 436 ---- src/featbin/Makefile | 4 +- src/featbin/detect-sinusoids.cc | 113 - src/sgmm/Makefile | 18 - src/sgmm/am-sgmm-test.cc | 278 --- src/sgmm/am-sgmm.cc | 1395 ----------- src/sgmm/am-sgmm.h | 420 ---- src/sgmm/decodable-am-sgmm.cc | 72 - src/sgmm/decodable-am-sgmm.h | 119 - src/sgmm/estimate-am-sgmm-ebw.cc | 654 ----- src/sgmm/estimate-am-sgmm-ebw.h | 217 -- src/sgmm/estimate-am-sgmm-multi-test.cc | 154 -- src/sgmm/estimate-am-sgmm-multi.cc | 746 ------ src/sgmm/estimate-am-sgmm-multi.h | 146 -- src/sgmm/estimate-am-sgmm-test.cc | 161 -- src/sgmm/estimate-am-sgmm.cc | 2135 ----------------- src/sgmm/estimate-am-sgmm.h | 475 ---- src/sgmm/fmllr-sgmm-test.cc | 233 -- src/sgmm/fmllr-sgmm.cc | 554 ----- src/sgmm/fmllr-sgmm.h | 192 -- src/sgmm/sgmm-clusterable.cc | 280 --- src/sgmm/sgmm-clusterable.h | 112 - src/sgmmbin/Makefile | 31 - src/sgmmbin/init-ubm.cc | 95 - src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc | 216 -- src/sgmmbin/sgmm-acc-stats-ali.cc | 191 -- src/sgmmbin/sgmm-acc-stats-gpost.cc | 174 -- src/sgmmbin/sgmm-acc-stats.cc | 211 -- src/sgmmbin/sgmm-acc-stats2.cc | 217 -- src/sgmmbin/sgmm-acc-tree-stats.cc | 185 -- src/sgmmbin/sgmm-align-compiled.cc | 179 -- src/sgmmbin/sgmm-build-tree.cc | 201 -- src/sgmmbin/sgmm-calc-distances.cc | 74 - src/sgmmbin/sgmm-cluster-phones.cc | 148 -- src/sgmmbin/sgmm-comp-prexform.cc | 84 - src/sgmmbin/sgmm-copy.cc | 74 - src/sgmmbin/sgmm-decode-faster.cc | 218 -- src/sgmmbin/sgmm-est-ebw.cc | 118 - src/sgmmbin/sgmm-est-fmllr-gpost.cc | 261 -- src/sgmmbin/sgmm-est-fmllr.cc | 318 --- src/sgmmbin/sgmm-est-fmllrbasis.cc | 93 - src/sgmmbin/sgmm-est-multi.cc | 233 -- src/sgmmbin/sgmm-est-spkvecs-gpost.cc | 223 -- src/sgmmbin/sgmm-est-spkvecs.cc | 257 -- src/sgmmbin/sgmm-est.cc | 172 -- src/sgmmbin/sgmm-gselect.cc | 125 - src/sgmmbin/sgmm-info.cc | 110 - src/sgmmbin/sgmm-init-from-tree-stats.cc | 147 -- src/sgmmbin/sgmm-init.cc | 111 - src/sgmmbin/sgmm-latgen-faster.cc | 271 --- src/sgmmbin/sgmm-latgen-simple.cc | 232 -- src/sgmmbin/sgmm-mixup.cc | 145 -- src/sgmmbin/sgmm-normalize.cc | 85 - src/sgmmbin/sgmm-post-to-gpost.cc | 190 -- src/sgmmbin/sgmm-rescore-lattice.cc | 165 -- src/sgmmbin/sgmm-sum-accs.cc | 69 - src/sgmmbin/sgmm-sum-tree-stats.cc | 100 - src/sgmmbin/sgmm-write-ubm.cc | 71 - src/tree/clusterable-classes.h | 4 - 103 files changed, 32 insertions(+), 20135 deletions(-) delete mode 100755 egs/aurora4/s5/local/run_sgmm.sh delete mode 100755 egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh delete mode 100755 egs/gp/s1/steps/decode_sgmm_deltas.sh delete mode 100755 egs/gp/s1/steps/train_sgmm_deltas.sh delete mode 100755 egs/rm/s5/local/run_sgmm.sh delete mode 100755 egs/sprakbanken/s5/local/run_sgmm.sh delete mode 100755 egs/swbd/s5/local/run_sgmm.sh delete mode 100755 egs/swbd/s5b/local/run_sgmm.sh delete mode 100755 egs/wsj/s5/local/run_sgmm.sh delete mode 100755 egs/wsj/s5/steps/align_sgmm.sh delete mode 100755 egs/wsj/s5/steps/decode_sgmm.sh delete mode 100755 egs/wsj/s5/steps/decode_sgmm_fromlats.sh delete mode 100755 egs/wsj/s5/steps/decode_sgmm_rescore.sh delete mode 100755 egs/wsj/s5/steps/make_denlats_sgmm.sh delete mode 100755 egs/wsj/s5/steps/tandem/align_sgmm.sh delete mode 100755 egs/wsj/s5/steps/tandem/decode_sgmm.sh delete mode 100755 egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh delete mode 100755 egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh delete mode 100755 egs/wsj/s5/steps/tandem/train_sgmm.sh delete mode 100755 egs/wsj/s5/steps/train_mmi_sgmm.sh delete mode 100755 egs/wsj/s5/steps/train_sgmm.sh delete mode 100644 src/feat/sinusoid-detection-test.cc delete mode 100644 src/feat/sinusoid-detection.cc delete mode 100644 src/feat/sinusoid-detection.h delete mode 100644 src/featbin/detect-sinusoids.cc delete mode 100644 src/sgmm/Makefile delete mode 100644 src/sgmm/am-sgmm-test.cc delete mode 100644 src/sgmm/am-sgmm.cc delete mode 100644 src/sgmm/am-sgmm.h delete mode 100644 src/sgmm/decodable-am-sgmm.cc delete mode 100644 src/sgmm/decodable-am-sgmm.h delete mode 100644 src/sgmm/estimate-am-sgmm-ebw.cc delete mode 100644 src/sgmm/estimate-am-sgmm-ebw.h delete mode 100644 src/sgmm/estimate-am-sgmm-multi-test.cc delete mode 100644 src/sgmm/estimate-am-sgmm-multi.cc delete mode 100644 src/sgmm/estimate-am-sgmm-multi.h delete mode 100644 src/sgmm/estimate-am-sgmm-test.cc delete mode 100644 src/sgmm/estimate-am-sgmm.cc delete mode 100644 src/sgmm/estimate-am-sgmm.h delete mode 100644 src/sgmm/fmllr-sgmm-test.cc delete mode 100644 src/sgmm/fmllr-sgmm.cc delete mode 100644 src/sgmm/fmllr-sgmm.h delete mode 100644 src/sgmm/sgmm-clusterable.cc delete mode 100644 src/sgmm/sgmm-clusterable.h delete mode 100644 src/sgmmbin/Makefile delete mode 100644 src/sgmmbin/init-ubm.cc delete mode 100644 src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc delete mode 100644 src/sgmmbin/sgmm-acc-stats-ali.cc delete mode 100644 src/sgmmbin/sgmm-acc-stats-gpost.cc delete mode 100644 src/sgmmbin/sgmm-acc-stats.cc delete mode 100644 src/sgmmbin/sgmm-acc-stats2.cc delete mode 100644 src/sgmmbin/sgmm-acc-tree-stats.cc delete mode 100644 src/sgmmbin/sgmm-align-compiled.cc delete mode 100644 src/sgmmbin/sgmm-build-tree.cc delete mode 100644 src/sgmmbin/sgmm-calc-distances.cc delete mode 100644 src/sgmmbin/sgmm-cluster-phones.cc delete mode 100644 src/sgmmbin/sgmm-comp-prexform.cc delete mode 100644 src/sgmmbin/sgmm-copy.cc delete mode 100644 src/sgmmbin/sgmm-decode-faster.cc delete mode 100644 src/sgmmbin/sgmm-est-ebw.cc delete mode 100644 src/sgmmbin/sgmm-est-fmllr-gpost.cc delete mode 100644 src/sgmmbin/sgmm-est-fmllr.cc delete mode 100644 src/sgmmbin/sgmm-est-fmllrbasis.cc delete mode 100644 src/sgmmbin/sgmm-est-multi.cc delete mode 100644 src/sgmmbin/sgmm-est-spkvecs-gpost.cc delete mode 100644 src/sgmmbin/sgmm-est-spkvecs.cc delete mode 100644 src/sgmmbin/sgmm-est.cc delete mode 100644 src/sgmmbin/sgmm-gselect.cc delete mode 100644 src/sgmmbin/sgmm-info.cc delete mode 100644 src/sgmmbin/sgmm-init-from-tree-stats.cc delete mode 100644 src/sgmmbin/sgmm-init.cc delete mode 100644 src/sgmmbin/sgmm-latgen-faster.cc delete mode 100644 src/sgmmbin/sgmm-latgen-simple.cc delete mode 100644 src/sgmmbin/sgmm-mixup.cc delete mode 100644 src/sgmmbin/sgmm-normalize.cc delete mode 100644 src/sgmmbin/sgmm-post-to-gpost.cc delete mode 100644 src/sgmmbin/sgmm-rescore-lattice.cc delete mode 100644 src/sgmmbin/sgmm-sum-accs.cc delete mode 100644 src/sgmmbin/sgmm-sum-tree-stats.cc delete mode 100644 src/sgmmbin/sgmm-write-ubm.cc diff --git a/egs/aurora4/s5/local/run_sgmm.sh b/egs/aurora4/s5/local/run_sgmm.sh deleted file mode 100755 index 62be4d83774..00000000000 --- a/egs/aurora4/s5/local/run_sgmm.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/bash - -# This script is invoked from ../run.sh -# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity. - -. cmd.sh - -# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for -# training, but this shouldn't have much effect. - -( - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1; - - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1; - - steps/train_sgmm.sh --cmd "$train_cmd" \ - 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \ - exp/ubm5b/final.ubm exp/sgmm5a || exit 1; - - ( - utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 - ) & - - steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \ - --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1; - steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \ - data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ - data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ - exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter & - done - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ - --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ - exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter & - done - -) & - - -( -# The next commands are the same thing on all the si284 data. - -# SGMM system on the si284 data [sgmm5b] - steps/train_ubm.sh --cmd "$train_cmd" \ - 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1; - - steps/train_sgmm.sh --cmd "$train_cmd" \ - 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ - exp/ubm5b/final.ubm exp/sgmm5b || exit 1; - - ( - utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93 - steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ - exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92 - - utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1; - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ - exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93 - steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ - exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92 - ) & - - steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \ - --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284 - - steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \ - data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ - data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1 - - for iter in 1 2 3 4; do - for test in dev93 eval92; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \ - exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter & - - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter & - done - done -) & - - - -# Train quinphone SGMM system. - -steps/train_sgmm.sh --cmd "$train_cmd" \ - --context-opts "--context-width=5 --central-position=2" \ - 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ - exp/ubm5b/final.ubm exp/sgmm5c || exit 1; - -# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93. -steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93 - diff --git a/egs/babel/s5/local/decode_helper.sh b/egs/babel/s5/local/decode_helper.sh index 3be49854038..59b2fdad3c9 100755 --- a/egs/babel/s5/local/decode_helper.sh +++ b/egs/babel/s5/local/decode_helper.sh @@ -18,15 +18,6 @@ elif [ "$1" == "FMLLR" ]; then utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1 steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1 -elif [ "$1" == "SGMM" ]; then - utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1 - - steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \ - $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1; - - steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR\ - $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1; - fi diff --git a/egs/babel/s5b/local/decode_helper.sh b/egs/babel/s5b/local/decode_helper.sh index 3be49854038..59b2fdad3c9 100755 --- a/egs/babel/s5b/local/decode_helper.sh +++ b/egs/babel/s5b/local/decode_helper.sh @@ -18,15 +18,6 @@ elif [ "$1" == "FMLLR" ]; then utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1 steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1 -elif [ "$1" == "SGMM" ]; then - utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1 - - steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \ - $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1; - - steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR\ - $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1; - fi diff --git a/egs/babel/s5c/local/decode_helper.sh b/egs/babel/s5c/local/decode_helper.sh index 3be49854038..59b2fdad3c9 100755 --- a/egs/babel/s5c/local/decode_helper.sh +++ b/egs/babel/s5c/local/decode_helper.sh @@ -18,15 +18,6 @@ elif [ "$1" == "FMLLR" ]; then utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1 steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1 -elif [ "$1" == "SGMM" ]; then - utils/mkgraph.sh $LANGDIR $MODELDIR $MODELDIR/graph || exit 1 - - steps/decode_sgmm.sh --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR \ - $MODELDIR/graph $DEVDIR $MODELDIR/decode || exit 1; - - steps/decode_sgmm.sh --use-fmllr true --nj 20 --cmd "$decode_cmd" --transform-dir $TRANSFORMDIR\ - $MODELDIR/graph $DEVDIR $MODELDIR/decode_fmllr || exit 1; - fi diff --git a/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh b/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh deleted file mode 100755 index dfe1f211d6c..00000000000 --- a/egs/gp/s1/local/gp_train_multi_sgmm_deltas.sh +++ /dev/null @@ -1,359 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This is Subspace Gaussian Mixture Model (SGMM) training-- -# see "The subspace Gaussian mixture model--A structured model for speech recognition" -# by D. Povey et al, Computer Speech and Language, 2011. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used -# retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[0-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -function est_alimodel () { -# If we have speaker vectors, we need an alignment model. This function gets -# the Gaussian-level alignments with the speaker vectors but accumulates stats -# without any speaker vectors; we re-estimate M, w, c and S to get a model -# that's compatible with not having speaker vectors. Note that the transitions -# are not updated since the decoding graph will be shared with the normal model. - local lx=$1 - for L in $LANGUAGES; do - wdir=$dir/$L - local lspkdim=`sgmm-info $wdir/$lx.mdl | grep speaker | awk '{print $NF}'` - if [ "$lspkdim" -le 0 ]; then - echo "est_alimodel: No speaker space in model '$wdir/$lx.mdl'. Returning." - return - fi - done - - local y=0; - local lflags=MwcS # First time don't update v - while [ $y -lt $numiters_alimdl ]; do - [ $y -gt 0 ] && lflags=vMwcS - echo "Pass $y of building alignment model, flags = '$lflags'" - local lmulti_est_opts='' # model, acc, model-out, occs-out tuples - for L in $LANGUAGES; do - ( - data=data/$L/train - lang=data/$L/lang - wdir=$dir/$L - local cur_alimdl=$wdir/tmp$y.alimdl - [ $y -eq 0 ] && cur_alimdl=$wdir/$lx.mdl - feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - gselect_opt="--gselect=ark,s,cs:gunzip -c $wdir/TASK_ID.gselect.gz|" - spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs" - - submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/acc_ali${lx}_$y.TASK_ID.log \ - $sjopts ali-to-post "ark:gunzip -c $wdir/TASK_ID.ali.gz|" ark:- \| \ - sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \ - --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk $wdir/$lx.mdl \ - "$feats" ark,s,cs:- ark:- \| \ - sgmm-acc-stats-gpost --update-flags=$lflags $cur_alimdl "$feats" \ - ark,s,cs:- $wdir/$y.TASK_ID.aliacc \ - || { touch $dir/err; \ - error_exit "$L; Align model iter $y: Error accumulating stats"; } - - # Summing accs is quite fast; run locally - sgmm-sum-accs $wdir/sum.aliacc $wdir/$y.*.aliacc || \ - { touch $dir/err; \ - error_exit "$L; Align model iter $y: Error summing stats"; } - )& # Accumulate in parallel for different languages - wdir=$dir/$L - local cur_alimdl=$wdir/tmp$y.alimdl - [ $y -eq 0 ] && cur_alimdl=$wdir/$lx.mdl - lmulti_est_opts="$lmulti_est_opts $cur_alimdl $wdir/sum.aliacc $wdir/tmp$[$y+1].alimdl $wdir/tmp$[$y+1].occs" - done - wait - - submit_jobs.sh "$qcmd" --log=$dir/log/update_ali.$y.log $sjopts \ - sgmm-est-multi --update-flags=$lflags --remove-speaker-space=true \ - $lmulti_est_opts \ - || error_exit "Error estimating alignment models on iter $y"; - - rm -f $dir/??/$y.*.aliacc $dir/??/sum.aliacc || exit 1; - [ $y -gt 0 ] && rm $dir/??/tmp$y.{alimdl,occs} - y=$[$y+1] - done - - for L in $LANGUAGES; do - mv $dir/$L/tmp$y.alimdl $dir/$L/$lx.alimdl - done -} - -nj=4 # Default number of jobs -stage=-5 # Default starting stage (start with tree building) -qcmd="" # Options for the submit_jobs.sh script -sjopts="" # Options for the submit_jobs.sh script -LANGUAGES='GE PO SP SW' # Languages processed - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG 40 39 exp/ubm3c/final.ubm exp/sgmm3c\n\n -Options:\n - --help\t\tPrint this message and exit\n - --lang STR\tList of languages to process (default = '$LANGUAGES')\n - --num-jobs INT\tNumber of parallel jobs to run (default=$nj).\n - --qcmd STR\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --sjopts STR\tOptions for the 'submit_jobs.sh' script\n - --stage INT\tStarting stage (e.g. -4 for SGMM init; 2 for iter 2; default=$stage)\n -"; - -echo "$PROG $@" -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --lang) LANGUAGES="$2"; shift 2 ;; - --num-jobs) - shift; nj=`readint $1`; - [ $nj -lt 1 ] && error_exit "--num-jobs arg '$nj' not positive."; - shift ;; - --qcmd) - shift; qcmd=" --qcmd=${1}"; shift ;; - --sjopts) - shift; sjopts="$1"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as num-leaves - esac -done - -if [ $# != 4 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -# This is SGMM with speaker vectors, on top of LDA+[something] features. -# Any speaker-specific transforms are obtained from the alignment directory. -# To be run from .. - -phndim=$1 -spkdim=$2 -ubm=$3 -dir=$4 - -[ -f $ubm ] || error_exit "UBM file '$ubm' does not exist" -mkdir -p $dir/log || error_exit "Cannot create '$dir/log'" - -# (1): Model initialization; training graph and initial alignment generation. -for L in $LANGUAGES; do -( - data=data/$L/train - lang=data/$L/lang - alidir=exp/$L/tri2a_ali - wdir=$dir/$L - oov_sym=`cat $lang/oov.txt` - mkdir -p $wdir/log || error_exit "Cannot create working directory '$wdir'" - - # Initialize the model (removed the --spk-space-dim option) - if [ $stage -le -5 ]; then - echo "$L: Initializing model" - submit_jobs.sh "$qcmd" --log=$wdir/log/init_sgmm.log $sjopts \ - sgmm-init --phn-space-dim=$phndim $lang/topo $wdir/tree $ubm \ - $wdir/0.mdl || { touch $dir/err; error_exit "$L: SGMM init failed."; } - fi - - # Make training graphs - if [ $stage -le -4 ]; then - echo "$L: Compiling training graphs" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/mkgraphs.TASK_ID.log \ - $sjopts compile-train-graphs $wdir/tree $wdir/0.mdl $lang/L.fst \ - "ark:sym2int.pl --map-oov '$oov_sym' --ignore-first-field $lang/words.txt < $data/split$nj/TASK_ID/text |" \ - "ark:|gzip -c >$wdir/TASK_ID.fsts.gz" \ - || { touch $dir/err; error_exit "$L: Error compiling training graphs"; } - fi - - if [ $stage -le -3 ]; then - echo "$L: Converting alignments" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/convert.TASK_ID.log \ - $sjopts convert-ali $alidir/final.mdl $wdir/0.mdl $wdir/tree \ - "ark:gunzip -c $alidir/TASK_ID.ali.gz|" \ - "ark:|gzip -c >$wdir/TASK_ID.ali.gz" \ - || { touch $dir/err; error_exit "$L: Convert alignment failed."; } - fi - - if [ $stage -le -2 ]; then - echo "$L: Computing cepstral mean and variance statistics" - submit_jobs.sh "$qcmd" --njobs=$nj $sjopts --log=$wdir/log/cmvn.TASK_ID.log \ - compute-cmvn-stats --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \ - scp:$data/split$nj/TASK_ID/feats.scp ark:$wdir/TASK_ID.cmvn \ - || { touch $dir/err; error_exit "$L: Computing CMN/CVN stats failed."; } - fi - - feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - - if [ $stage -le -1 ]; then - echo "$L: Doing Gaussian selection" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/gselectTASK_ID.log \ - $sjopts sgmm-gselect $wdir/0.mdl "$feats" "ark,t:|gzip -c > $wdir/TASK_ID.gselect.gz" \ - || { touch $dir/err; error_exit "$L: Error doing Gaussian selection"; } - fi -)& # Run the language-specific initializations in parallel -done -wait -[ -f $dir/err ] && { rm $dir/err; error_exit "Error initializing models."; } - -# Language independent constants -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -numiters_alimdl=3 # Number of iterations for estimating alignment model. -incsub_interval=8 # increase substates every 8 iterations -# total substates after each such increment -total_substates=( 5000 7000 9000 12000 16000 20000 25000 30000 35000 40000 ) -# For a given number of substates, iterate for $incsub_interval iterations -numiters=$[(${#total_substates[@]}+1)*$incsub_interval] -realign_interval=4 # realign every 4 iterations -spkvec_start=8 # use speaker subspace *after* 8 iterations -spkvec_interval=2 # reestimate the speaker vectors every 2 iterations -randprune=0.1 - -# Initially don't have speaker vectors, but change this after we estimate them. -spkvecs_gen=0 - -x=0 -while [ $x -lt $numiters ]; do - if [ $x -eq 0 ]; then - flags=v # On first iter, don't update M or N. - elif [ $spkdim -gt 0 -a $[$x%2] -eq 0 -a $x -gt $spkvec_start ]; then - # Update N on odd iterations after 1st spkvec iter, if we have spk-space. - flags=NwSvct - else # Else update M but not N. - flags=MwSvct - fi - - if [ $stage -le $x ]; then - echo "Pass $x: update flags = '$flags' " - multi_est_opts='' # Will contain model, acc, model-out, occs-out tuples - for L in $LANGUAGES; do - ( - data=data/$L/train - lang=data/$L/lang - wdir=$dir/$L - feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$wdir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - gselect_opt="--gselect=ark,s,cs:gunzip -c $wdir/TASK_ID.gselect.gz|" - if [ $spkdim -gt 0 -a $spkvecs_gen -eq 1 ]; then - spkvecs_opt="--spk-vecs=ark:$wdir/TASK_ID.vecs" - else - spkvecs_opt='' - fi - silphonelist=`cat $lang/silphones.csl` -# numsubstates=`cat $wdir/numleaves` # Initial #-substates. - - if [ $[$x%$realign_interval] -eq 0 -a $x -gt 0 ]; then - echo "$L; iter $x: Aligning data" - submit_jobs.sh "$qcmd" $sjopts --log=$wdir/log/align.$x.TASK_ID.log \ - --njobs=$nj sgmm-align-compiled $spkvecs_opt $scale_opts \ - "$gselect_opt" --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \ - --beam=8 --retry-beam=40 $wdir/$x.mdl \ - "ark:gunzip -c $wdir/TASK_ID.fsts.gz|" "$feats" \ - "ark:|gzip -c >$wdir/TASK_ID.ali.gz" || \ - { touch $dir/err; error_exit "$L, it $x: Error realigning data"; } - fi - - if [ $spkdim -gt 0 -a $x -gt $spkvec_start \ - -a $[$x%$spkvec_interval] -eq 0 ]; then - echo "$L; iter $x: Computing speaker vectors" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/spkvecs.$x.TASK_ID.log \ - $sjopts ali-to-post "ark:gunzip -c $wdir/TASK_ID.ali.gz|" ark:- \| \ - weight-silence-post 0.01 $silphonelist $wdir/$x.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \ - $spkvecs_opt "$gselect_opt" --rand-prune=$randprune $wdir/$x.mdl \ - "$feats" ark,s,cs:- ark:$wdir/tmpTASK_ID.vecs || \ - { touch $dir/err; error_exit "$L, it $x: Error computing spkvecs"; } - for n in `seq 1 $nj`; do - mv $wdir/tmp${n}.vecs $wdir/${n}.vecs; - done - spkvecs_gen=1 - fi - - submit_jobs.sh "$qcmd" --njobs=$nj --log=$wdir/log/acc.$x.TASK_ID.log \ - $sjopts sgmm-acc-stats --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \ - --update-flags=$flags --rand-prune=$randprune $spkvecs_opt \ - "$gselect_opt" $wdir/$x.mdl "$feats" \ - "ark,s,cs:ali-to-post 'ark:gunzip -c $wdir/TASK_ID.ali.gz|' ark:-|" \ - $wdir/$x.TASK_ID.acc || \ - { touch $dir/err; error_exit "$L, it $x: Error accumulating stats"; } - - # Summing accs is quite fast; run locally - sgmm-sum-accs $wdir/sum.acc $wdir/$x.*.acc || \ - { touch $dir/err; error_exit "$L, it $x: Error summing stats"; } - ) & # Accumulate in parallel for different languages - wdir=$dir/$L - multi_est_opts="$multi_est_opts $wdir/$x.mdl $wdir/sum.acc $wdir/$[$x+1].mdl $wdir/$[$x+1].occs" - done - wait - [ -f $dir/err ] && \ - { rm $dir/err; error_exit "Iter $x: Error in accumulation"; } - - add_dim_opts='' - if [ $x -eq $spkvec_start ]; then - add_dim_opts="--increase-spk-dim=$spkdim --increase-phn-dim=$phndim" - elif [ $x -eq $[$spkvec_start*2] ]; then - add_dim_opts="--increase-spk-dim=$spkdim --increase-phn-dim=$phndim" - fi - split_opts='' - if [ $[$x%$incsub_interval] -eq 1 -a $x -gt 1 ]; then - index=$[($x/$incsub_interval)-1] - numsubstates=${total_substates[$index]} - split_opts="--split-substates=$numsubstates" - fi - - submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log $sjopts \ - sgmm-est-multi --update-flags=$flags $split_opts $add_dim_opts \ - $multi_est_opts || error_exit "Error in pass $x estimation." - - # If using speaker vectors, estimate alignment model without spkvecs - if [ $[$x%$incsub_interval] -eq 0 -a $x -gt 0 ]; then - chmod -w $dir/??/$x.mdl $dir/??/$x.occs # Preserve for scoring - [ $spkdim -gt 0 ] && est_alimodel $x; - else - rm -f $dir/??/$x.mdl $dir/??/$x.occs - fi - rm -f $dir/??/$x.*.acc $dir/??/sum.acc - fi # End of current stage - x=$[$x+1]; -done - -for L in $LANGUAGES; do - ( - wdir=$dir/$L - rm -f $wdir/final.mdl $wdir/final.occs; - chmod -w $wdir/$x.mdl $wdir/$x.occs # Preserve for scoring - ln -s $wdir/$x.mdl $wdir/final.mdl; - ln -s $wdir/$x.occs $wdir/final.occs; - # If using speaker vectors, estimate alignment model without spkvecs - [ $spkdim -gt 0 ] && est_alimodel $wdir/$x.mdl; - rm -f $wdir/final.alimdl; - ln -sf $wdir/$x.alimdl $wdir/final.alimdl; - - # Print out summary of the warning messages. - for x in $wdir/log/*.log; do - n=`grep WARNING $x | wc -l`; - if [ $n -ne 0 ]; then echo "$n warnings in $x"; fi; - done - ) -done - -echo Done diff --git a/egs/gp/s1/path.sh b/egs/gp/s1/path.sh index a38149ac899..cee9bacbde9 100644 --- a/egs/gp/s1/path.sh +++ b/egs/gp/s1/path.sh @@ -7,7 +7,7 @@ KALDIROOT=/exports/home/aghoshal/kaldi/trunk KALDISRC=$KALDIROOT/src KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin -KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/lm +KALDIBIN=$KALDIBIN:$KALDISRC/sgmm2bin:$KALDISRC/lm FSTBIN=$KALDIROOT/tools/openfst/bin LMBIN=$KALDIROOT/tools/irstlm/bin diff --git a/egs/gp/s1/steps/decode_sgmm_deltas.sh b/egs/gp/s1/steps/decode_sgmm_deltas.sh deleted file mode 100755 index 0e15ef5aef5..00000000000 --- a/egs/gp/s1/steps/decode_sgmm_deltas.sh +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Decoding script for SGMM using standard MFCC/PLP + delta + acceleration -# features. - -# assumes you are using speaker vectors [for no vectors, see -# decode_sgmm_novec_lda_etc.sh, if it exists already]. -# if this includes speaker-specific transforms, you have to provide an "old" -# decoding directory where the transforms are located. The data decoded in -# that directory must be split up in the same way as the current directory. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function file_exists () { - [ -f $1 ] || error_exit "$PROG: no such file '$1'" -} - -function readposint () { # Strictly speaking, reading non-negative integers - local retval=${1/#*=/}; # In case --switch=ARG format was used - [[ "$retval" =~ ^[0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not a non-negative integer." - echo $retval -} - -beam=13.0 -nj=1 # Default total number of jobs -jobid=0 # Default job number -qcmd="" # Options for the submit_jobs.sh script -sjopts="" # Options for the submit_jobs.sh script -use_spkvecs='' # Not expecting a model with speaker vectors, by default. - -PROG=`basename $0`; -usage="Usage: $PROG [options] []\n -e.g.: $PROG -j 10 0 exp/sgmm3c/graph_tgpr data/test_dev93 exp/sgmm3c/decode_dev93_tgpr exp/tri2b/decode_dev93_tgpr\n\n -Options:\n - --help\t\tPrint this message and exit.\n - --beam FLOAT\tDecoding beam (default=$beam).\n - -j INT INT\tNumber of parallel jobs to run (default=$nj) and current jobid.\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --sjopts STRING\tOptions for the 'submit_jobs.sh' script.\n - --with-spkvecs\tModel has speaker vectors; do 2-pass decoding.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --beam) beam=$2; shift 2 ;; - -j) nj=`readposint $2`; jobid=`readposint $3`; shift 3 ;; - --qcmd) qcmd=" --qcmd=${2}"; shift 2 ;; - --sjopts) sjopts="$2"; shift 2 ;; - --with-spkvecs) use_spkvecs=1; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as num-leaves - esac -done - -if [ $# -lt 3 -o $# -gt 4 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -graphdir=$1 -data=$2 -dir=$3 -transdir=$4 -acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. - -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. - -mkdir -p $dir - -if [ $nj -gt 1 ]; then - mydata=$data/split$nj/$jobid -else - mydata=$data -fi - -requirements="$mydata/feats.scp $srcdir/final.mdl $graphdir/HCLG.fst" -[ -z "$use_spkvecs" ] || requirements=$requirements" $srcdir/final.alimdl" -for f in $requirements; do - file_exists $f -done - -if [ ! -z "$transdir" ]; then # "$transdir" nonempty.. - file_exists $transdir/$n.trans -fi - -feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | add-deltas ark:- ark:- |" - -[ ! -z "$transdir" ] && feats="$feats transform-feats --utt2spk=ark:$mydata/utt2spk ark:$transdir/$jobid.trans ark:- ark:- |" - - -# Do Gaussian selection, since we'll have two decoding passes and don't want to -# redo this. Note: it doesn't make a difference if we use final.mdl or -# final.alimdl, they have the same UBM. -sgmm-gselect $srcdir/final.mdl "$feats" "ark:|gzip -c >$dir/$jobid.gselect.gz" \ - 2>$dir/gselect$jobid.log \ - || error_exit "Error in Gaussian selection."; -gselect_opt="--gselect=ark:gunzip -c $dir/$jobid.gselect.gz|" - -target_lat="$dir/lat.$jobid.gz" -[ -z "$use_spkvecs" ] || target_lat="$dir/pre_lat.$jobid.gz" -align_model="$srcdir/final.mdl" -[ -z "$use_spkvecs" ] || align_model="$srcdir/final.alimdl" - -# Generate a state-level lattice for rescoring, with the alignment model and no -# speaker vectors. - -sgmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \ - --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt "$gselect_opt" $align_model \ - $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $target_lat" \ - 2> $dir/decode_pass1.$jobid.log \ - || error_exit "Error in 1st-pass decoding."; - -# Do a second pass "decoding" if using speaker vectors. -if [ ! -z "$use_spkvecs" ]; then - silphonelist=`cat $graphdir/silphones.csl` || exit 1 - ( lattice-determinize --acoustic-scale=$acwt --prune=true --beam=4.0 \ - "ark:gunzip -c $dir/pre_lat.$jobid.gz|" ark:- \ - | lattice-to-post --acoustic-scale=$acwt ark:- ark:- \ - | weight-silence-post 0.0 $silphonelist $srcdir/final.alimdl ark:- ark:- \ - | sgmm-post-to-gpost "$gselect_opt" $srcdir/final.alimdl "$feats" ark:- \ - ark:- \ - | sgmm-est-spkvecs-gpost --spk2utt=ark:$mydata/spk2utt $srcdir/final.mdl \ - "$feats" ark:- "ark:$dir/$jobid.vecs" - ) 2> $dir/vecs.$jobid.log \ - || error_exit "Error estimating speaker vectors."; - - # Now rescore the state-level lattices with the adapted features and the - # corresponding model. Prune and determinize the lattices to limit their size. - - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$mydata/utt2spk \ - --spk-vecs=ark:$dir/$jobid.vecs $srcdir/final.mdl \ - "ark:gunzip -c $dir/pre_lat.$jobid.gz|" "$feats" \ - "ark:|lattice-determinize --acoustic-scale=$acwt --prune=true --beam=6.0 ark:- ark:- | gzip -c > $dir/lat.$jobid.gz" \ - 2>$dir/rescore.$jobid.log \ - || error_exit "Error in 2nd-pass rescoring."; - - rm $dir/pre_lat.$jobid.gz - # The top-level decoding script rescores "lat.$jobid.gz" to get final output. -fi - diff --git a/egs/gp/s1/steps/train_sgmm_deltas.sh b/egs/gp/s1/steps/train_sgmm_deltas.sh deleted file mode 100755 index e68a1757308..00000000000 --- a/egs/gp/s1/steps/train_sgmm_deltas.sh +++ /dev/null @@ -1,270 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This is Subspace Gaussian Mixture Model (SGMM) training-- -# see "The subspace Gaussian mixture model--A structured model for speech recognition" -# by D. Povey et al, Computer Speech and Language, 2011. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -nj=4 # Default number of jobs -stage=-4 # Default starting stage (start with tree building) -qcmd="" # Options for the submit_jobs.sh script -sjopts="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG 10000 40 39 data/train data/lang exp/tri2a_ali exp/ubm3c/final.ubm exp/sgmm3c\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$nj).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --sjopts STRING\tOptions for the 'submit_jobs.sh' script\n - --stage INT\tStarting stage (e.g. -4 for SGMM init; 2 for iter 2; default=$stage)\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; nj=`readint $1`; - [ $nj -lt 1 ] && error_exit "--num-jobs arg '$nj' not positive."; - shift ;; - --qcmd) - shift; qcmd=" --qcmd=${1}"; shift ;; - --sjopts) - shift; sjopts="$1"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as num-leaves - esac -done - -if [ $# != 8 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -# This is SGMM with speaker vectors, on top of LDA+[something] features. -# Any speaker-specific transforms are obtained from the alignment directory. -# To be run from .. - -totsubstates=$1 -phndim=$2 -spkdim=$3 -data=$4 -lang=$5 -alidir=$6 -ubm=$7 -dir=$8 - -mkdir -p $dir || exit 1; - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" - -numiters=25 # Total number of iterations -numiters_alimdl=3 # Number of iterations for estimating alignment model. -maxiterinc=15 # Last iter to increase #substates on. -realign_iters="5 10 15"; -spkvec_iters="5 8 12 17" -add_dim_iters="6 8 10 12"; # Iters on which to increase phn dim and/or spk dim, - # if necessary, In most cases, either none of these or only the first of these - # will have any effect (we increase in increments of [feature dim]) - -oov_sym=`cat $lang/oov.txt` -silphonelist=`cat $lang/silphones.csl` - -numsubstates=`cat $dir/numleaves` # Initial #-substates. -# per-iter increment for #substates -incsubstates=$[($totsubstates-$numsubstates)/$maxiterinc] - -# Initially don't have speaker vectors, but change this after we estimate them. -spkvecs_opt= -gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/TASK_ID.gselect.gz|" - -randprune=0.1 -mkdir -p $dir/log - -featspart="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk ark:$alidir/TASK_ID.cmvn scp:$data/split$nj/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - -if [ ! -f $ubm ]; then - echo "No UBM in $ubm" - exit 1; -fi - -if [ $stage -le -4 ]; then - submit_jobs.sh "$qcmd" --log=$dir/log/init_sgmm.log $sjopts \ - sgmm-init --phn-space-dim=$phndim --spk-space-dim=$spkdim $lang/topo \ - $dir/tree $ubm $dir/0.mdl || error_exit "SGMM init failed." -fi - -if [ $stage -le -3 ]; then -# Make training graphs (this is split in $nj parts). - echo "Compiling training graphs" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/compile_graphsTASK_ID.log \ - $sjopts compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \ - "ark:sym2int.pl --map-oov '$oov_sym' --ignore-first-field $lang/words.txt < $data/split$nj/TASK_ID/text |" \ - "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \ - || error_exit "Error compiling training graphs" -fi - -if [ $stage -le -2 ]; then - echo "Doing Gaussian selection" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/gselectTASK_ID.log \ - $sjopts sgmm-gselect $dir/0.mdl "$featspart" "ark,t:|gzip -c > $dir/TASK_ID.gselect.gz" \ - || error_exit "Error doing Gaussian selection" -fi - - -if [ $stage -le -1 ]; then - echo "Converting alignments" # don't bother parallelizing; very fast. - for n in `seq 1 $nj`; do - convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree \ - "ark:gunzip -c $alidir/$n.ali.gz|" "ark:|gzip -c >$dir/$n.ali.gz" \ - 2>$dir/log/convert.$n.log - done -fi - -x=0 -while [ $x -lt $numiters ]; do - if [ $x -eq 0 ]; then - flags=vwcSt # On first iter, don't update M or N. - elif [ $spkdim -gt 0 -a $[$x%2] -eq 1 -a \ - $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then - # Update N on odd iterations after 1st spkvec iter, if we have spk-space. - flags=vNwcSt - else # Else update M but not N. - flags=vMwcSt - fi - - if [ $stage -le $x ]; then - echo "Pass $x: update flags = '$flags' " - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/align.$x.TASK_ID.log \ - $sjopts sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \ - --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk --beam=8 --retry-beam=40 \ - $dir/$x.mdl "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error realigning data on iter $x" - fi - - if [ $spkdim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then - submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/spkvecs.$x.TASK_ID.log \ - $sjopts ali-to-post "ark:gunzip -c $dir/TASK_ID.ali.gz|" ark:- \| \ - weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$data/split$nj/TASK_ID/spk2utt \ - $spkvecs_opt "$gselect_opt" --rand-prune=$randprune $dir/$x.mdl \ - "$featspart" ark,s,cs:- ark:$dir/tmpTASK_ID.vecs \ - || error_exit "Error computing speaker vectors on iter $x" - for n in `seq 1 $nj`; do - mv $dir/tmp${n}.vecs $dir/${n}.vecs; - done - spkvecs_opt="--spk-vecs=ark:$dir/TASK_ID.vecs" - fi - - submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/acc.$x.TASK_ID.log \ - $sjopts sgmm-acc-stats --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk \ - --update-flags=$flags --rand-prune=$randprune $spkvecs_opt \ - "$gselect_opt" $dir/$x.mdl "$featspart" \ - "ark,s,cs:ali-to-post 'ark:gunzip -c $dir/TASK_ID.ali.gz|' ark:-|" \ - $dir/$x.TASK_ID.acc || error_exit "Error accumulating stats on iter $x" - - add_dim_opts= - if echo $add_dim_iters | grep -w $x >/dev/null; then - add_dim_opts="--increase-phn-dim=$phndim --increase-spk-dim=$spkdim" - fi - - submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log $sjopts \ - sgmm-est --update-flags=$flags --split-substates=$numsubstates \ - $add_dim_opts --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \ - "sgmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl \ - || error_exit "Error in pass $x estimation." - - rm -f $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs - fi - - if [ $x -lt $maxiterinc ]; then - numsubstates=$[$numsubstates+$incsubstates] - fi - x=$[$x+1]; -done - -( cd $dir; rm final.mdl final.occs 2>/dev/null; - ln -s $x.mdl final.mdl; - ln -s $x.occs final.occs ) - -if [ $spkdim -gt 0 ]; then - # If we have speaker vectors, we need an alignment model. - # The point of this last phase of accumulation is to get Gaussian-level - # alignments with the speaker vectors but accumulate stats without - # any speaker vectors; we re-estimate M, w, c and S to get a model - # that's compatible with not having speaker vectors. - - # We do this for a few iters, in this recipe. - cur_alimdl=$dir/$x.mdl - y=0; - while [ $y -lt $numiters_alimdl ]; do - echo "Pass $y of building alignment model" - if [ $y -eq 0 ]; then - flags=MwcS # First time don't update v... - else - flags=vMwcS # don't update transitions-- will probably share graph with normal model. - fi - - if [ $stage -le $[$y+100] ]; then - submit_jobs.sh "$qcmd" --njobs=$nj --log=$dir/log/acc_ali.$y.TASK_ID.log \ - $sjopts ali-to-post "ark:gunzip -c $dir/TASK_ID.ali.gz|" ark:- \| \ - sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \ - --utt2spk=ark:$data/split$nj/TASK_ID/utt2spk $dir/$x.mdl \ - "$featspart" ark,s,cs:- ark:- \| \ - sgmm-acc-stats-gpost --update-flags=$flags $cur_alimdl "$featspart" \ - ark,s,cs:- $dir/$y.TASK_ID.aliacc \ - || error_exit "Error accumulating stats for alignment model on iter $y" - - submit_jobs.sh "$qcmd" --log=$dir/log/update_ali.$y.log $sjopts \ - sgmm-est --update-flags=$flags --remove-speaker-space=true \ - $cur_alimdl "sgmm-sum-accs - $dir/$y.*.aliacc|" $dir/$[$y+1].alimdl \ - || error_exit "Error estimating alignment model on iter $y"; - rm $dir/$y.*.aliacc || exit 1; - [ $y -gt 0 ] && rm $dir/$y.alimdl - fi - cur_alimdl=$dir/$[$y+1].alimdl - y=$[$y+1] - done - (cd $dir; rm final.alimdl 2>/dev/null; ln -s $y.alimdl final.alimdl ) -fi - -# Print out summary of the warning messages. -for x in $dir/log/*.log; do - n=`grep WARNING $x | wc -l`; - if [ $n -ne 0 ]; then echo $n warnings in $x; fi; -done - -echo Done diff --git a/egs/gp/s5/path.sh b/egs/gp/s5/path.sh index af75fa50c1b..e9f7a8337bc 100644 --- a/egs/gp/s5/path.sh +++ b/egs/gp/s5/path.sh @@ -9,7 +9,7 @@ KALDI_ROOT=/homes/eva/q/qghoshal/src/kaldi/trunk KALDISRC=$KALDI_ROOT/src KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin -KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/lm +KALDIBIN=$KALDIBIN:$KALDISRC/sgmm2bin:$KALDISRC/lm FSTBIN=$KALDI_ROOT/tools/openfst/bin LMBIN=$KALDI_ROOT/tools/irstlm/bin diff --git a/egs/gp/s5/run.sh b/egs/gp/s5/run.sh index e563bdff0d1..8054d02988d 100755 --- a/egs/gp/s5/run.sh +++ b/egs/gp/s5/run.sh @@ -347,12 +347,12 @@ for L in $GP_LANGUAGES; do num_states=$(grep "^$L" conf/sgmm.conf | cut -f2) num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3) mkdir -p exp/$L/sgmm2a - steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \ + steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 --spk-dim 0 \ $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri1_ali \ exp/$L/ubm2a/final.ubm exp/$L/sgmm2a >& exp/$L/sgmm2a/train.log mkdir -p exp/$L/sgmm2b - steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 \ + steps/train_sgmm2.sh --cmd "$train_cmd" --cluster-thresh 100 \ $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali \ exp/$L/ubm2a/final.ubm exp/$L/sgmm2b >& exp/$L/sgmm2b/train.log ) & @@ -370,7 +370,7 @@ for L in $GP_LANGUAGES; do $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/$sgmm $graph_dir - steps/decode_sgmm.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ + steps/decode_sgmm2.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ exp/$L/$sgmm/decode_dev_${lm_suffix} ) & done # loop over LMs diff --git a/egs/lre07/v2/path.sh b/egs/lre07/v2/path.sh index 7cf73af8c53..d55f970d1fb 100755 --- a/egs/lre07/v2/path.sh +++ b/egs/lre07/v2/path.sh @@ -1,3 +1,3 @@ export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH export LC_ALL=C diff --git a/egs/rm/s5/local/run_pitch.sh b/egs/rm/s5/local/run_pitch.sh index ed17b628f47..7ff2bd975e1 100755 --- a/egs/rm/s5/local/run_pitch.sh +++ b/egs/rm/s5/local/run_pitch.sh @@ -208,7 +208,6 @@ done # local/run_raw_fmllr.sh # You don't have to run all 3 of the below, e.g. you can just run the run_sgmm2.sh -#local/run_sgmm.sh local/run_sgmm2.sh #local/run_sgmm2x.sh diff --git a/egs/rm/s5/local/run_sgmm.sh b/egs/rm/s5/local/run_sgmm.sh deleted file mode 100755 index 3a9ce297ada..00000000000 --- a/egs/rm/s5/local/run_sgmm.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -. cmd.sh - -## SGMM on top of LDA+MLLT+SAT features. -if [ ! -f exp/ubm4a/final.mdl ]; then - steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 400 data/train data/lang exp/tri3b_ali exp/ubm4a || exit 1; -fi -steps/train_sgmm.sh --cmd "$train_cmd" 2500 7500 data/train data/lang exp/tri3b_ali exp/ubm4a/final.ubm exp/sgmm4a || exit 1; - -utils/mkgraph.sh data/lang exp/sgmm4a exp/sgmm4a/graph || exit 1; - -steps/decode_sgmm.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ - --transform-dir exp/tri3b/decode exp/sgmm4a/graph data/test exp/sgmm4a/decode || exit 1; - -steps/decode_sgmm.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ - --transform-dir exp/tri3b/decode exp/sgmm4a/graph data/test exp/sgmm4a/decode_fmllr || exit 1; - - # Now we'll align the SGMM system to prepare for discriminative training. - steps/align_sgmm.sh --nj 8 --cmd "$train_cmd" --transform-dir exp/tri3b \ - --use-graphs true --use-gselect true data/train data/lang exp/sgmm4a exp/sgmm4a_ali || exit 1; - steps/make_denlats_sgmm.sh --nj 8 --sub-split 20 --cmd "$decode_cmd" --transform-dir exp/tri3b \ - data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri3b --boost 0.2 \ - data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats exp/sgmm4a_mmi_b0.2 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri3b/decode data/lang data/test exp/sgmm4a/decode exp/sgmm4a_mmi_b0.2/decode_it$iter & - done - -wait -steps/decode_combine.sh data/test data/lang exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode || exit 1; -steps/decode_combine.sh data/test data/lang exp/sgmm4a/decode exp/tri3b_mmi/decode exp/combine_4a_3b/decode || exit 1; -# combining the sgmm run and the best MMI+fMMI run. -steps/decode_combine.sh data/test data/lang exp/sgmm4a/decode exp/tri3b_fmmi_c/decode_it5 exp/combine_4a_3b_fmmic5/decode || exit 1; - -steps/decode_combine.sh data/test data/lang exp/sgmm4a_mmi_b0.2/decode_it4 exp/tri3b_fmmi_c/decode_it5 exp/combine_4a_mmi_3b_fmmic5/decode || exit 1; - diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh index 00bac326a80..aa838ceda89 100755 --- a/egs/rm/s5/run.sh +++ b/egs/rm/s5/run.sh @@ -233,8 +233,7 @@ done # local/run_raw_fmllr.sh -# You don't have to run all 3 of the below, e.g. you can just run the run_sgmm2.sh -#local/run_sgmm.sh +# You don't have to run all 2 of the below, e.g. you can just run the run_sgmm2.sh local/run_sgmm2.sh #local/run_sgmm2x.sh diff --git a/egs/sprakbanken/s5/local/run_sgmm.sh b/egs/sprakbanken/s5/local/run_sgmm.sh deleted file mode 100755 index 27d8449896f..00000000000 --- a/egs/sprakbanken/s5/local/run_sgmm.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash - -# This script is invoked from ../run.sh -# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity. - -. cmd.sh - -# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for -# training, but this shouldn't have much effect. - -( - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1; - - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1; - - steps/train_sgmm.sh --cmd "$train_cmd" \ - 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \ - exp/ubm5a/final.ubm exp/sgmm5a || exit 1; - - ( - utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 - ) & - - steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \ - --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1; - steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \ - data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ - data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ - exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter & - done - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ - --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ - exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter & - done - -) & - - -( -# The next commands are the same thing on all the si284 data. - -# SGMM system on the si284 data [sgmm5b] - steps/train_ubm.sh --cmd "$train_cmd" \ - 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1; - - steps/train_sgmm.sh --cmd "$train_cmd" \ - 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ - exp/ubm5b/final.ubm exp/sgmm5b || exit 1; - - ( - utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93 - steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ - exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92 - - utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1; - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ - exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93 - steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ - exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92 - ) & - - steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \ - --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284 - - steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \ - data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ - data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1 - - for iter in 1 2 3 4; do - for test in dev93 eval92; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \ - exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter & - - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter & - done - done -) & - - - -# Train quinphone SGMM system. - -steps/train_sgmm.sh --cmd "$train_cmd" \ - --context-opts "--context-width=5 --central-position=2" \ - 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ - exp/ubm5b/final.ubm exp/sgmm5c || exit 1; - -# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93. -steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93 diff --git a/egs/swbd/s5/local/run_sgmm.sh b/egs/swbd/s5/local/run_sgmm.sh deleted file mode 100755 index da9af425fd8..00000000000 --- a/egs/swbd/s5/local/run_sgmm.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -. cmd.sh - - -# Build a SGMM system on just the 100k_nodup data, on top of LDA+MLLT+SAT. -if [ ! -f exp/ubm5a/final.ubm ]; then - steps/train_ubm.sh --cmd "$train_cmd" 700 data/train_100k_nodup data/lang \ - exp/tri4a_ali_100k_nodup exp/ubm5a || exit 1; -fi -steps/train_sgmm.sh --cmd "$train_cmd" \ - 4500 40000 data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup \ - exp/ubm5a/final.ubm exp/sgmm5a || exit 1; - -utils/mkgraph.sh data/lang_test exp/sgmm5a exp/sgmm5a/graph || exit 1; - -steps/decode_sgmm.sh --cmd "$decode_cmd" --config conf/decode.config \ - --nj 30 --transform-dir exp/tri4a/decode_eval2000 \ - exp/sgmm5a/graph data/eval2000 exp/sgmm5a/decode_eval2000 - - # Now discriminatively train the SGMM system on 100k_nodup data. -steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4a_ali_100k_nodup \ - --use-graphs true --use-gselect true data/train_100k_nodup data/lang exp/sgmm5a exp/sgmm5a_ali_100k_nodup - - # Took the beam down to 10 to get acceptable decoding speed. -steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --beam 9.0 --lattice-beam 6 --cmd "$decode_cmd" \ - --transform-dir exp/tri4a_ali_100k_nodup \ - data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup - -steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4a_ali_100k_nodup --boost 0.1 \ - data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup exp/sgmm5a_mmi_b0.1 - -for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4a/decode_eval2000 data/lang_test data/eval2000 exp/sgmm5a/decode_eval2000 \ - exp/sgmm5a_mmi_b0.1/decode_eval2000_it$iter & -done - diff --git a/egs/swbd/s5/run.sh b/egs/swbd/s5/run.sh index 7286938b290..d61b818fe1b 100755 --- a/egs/swbd/s5/run.sh +++ b/egs/swbd/s5/run.sh @@ -161,7 +161,6 @@ steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ -#local/run_sgmm.sh local/run_sgmm2.sh # Building a larger SAT system. diff --git a/egs/swbd/s5/run_edin.sh b/egs/swbd/s5/run_edin.sh index 5778d017529..8aff7e40c66 100755 --- a/egs/swbd/s5/run_edin.sh +++ b/egs/swbd/s5/run_edin.sh @@ -340,7 +340,7 @@ done # TODO(arnab): add SGMM and hybrid -# local/run_sgmm.sh +# local/run_sgmm2.sh # # Recipe with DNN system on top of fMLLR features # local/run_hybrid.sh diff --git a/egs/swbd/s5b/local/run_sgmm.sh b/egs/swbd/s5b/local/run_sgmm.sh deleted file mode 100755 index da9af425fd8..00000000000 --- a/egs/swbd/s5b/local/run_sgmm.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -. cmd.sh - - -# Build a SGMM system on just the 100k_nodup data, on top of LDA+MLLT+SAT. -if [ ! -f exp/ubm5a/final.ubm ]; then - steps/train_ubm.sh --cmd "$train_cmd" 700 data/train_100k_nodup data/lang \ - exp/tri4a_ali_100k_nodup exp/ubm5a || exit 1; -fi -steps/train_sgmm.sh --cmd "$train_cmd" \ - 4500 40000 data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup \ - exp/ubm5a/final.ubm exp/sgmm5a || exit 1; - -utils/mkgraph.sh data/lang_test exp/sgmm5a exp/sgmm5a/graph || exit 1; - -steps/decode_sgmm.sh --cmd "$decode_cmd" --config conf/decode.config \ - --nj 30 --transform-dir exp/tri4a/decode_eval2000 \ - exp/sgmm5a/graph data/eval2000 exp/sgmm5a/decode_eval2000 - - # Now discriminatively train the SGMM system on 100k_nodup data. -steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4a_ali_100k_nodup \ - --use-graphs true --use-gselect true data/train_100k_nodup data/lang exp/sgmm5a exp/sgmm5a_ali_100k_nodup - - # Took the beam down to 10 to get acceptable decoding speed. -steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --beam 9.0 --lattice-beam 6 --cmd "$decode_cmd" \ - --transform-dir exp/tri4a_ali_100k_nodup \ - data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup - -steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4a_ali_100k_nodup --boost 0.1 \ - data/train_100k_nodup data/lang exp/sgmm5a_ali_100k_nodup exp/sgmm5a_denlats_100k_nodup exp/sgmm5a_mmi_b0.1 - -for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4a/decode_eval2000 data/lang_test data/eval2000 exp/sgmm5a/decode_eval2000 \ - exp/sgmm5a_mmi_b0.1/decode_eval2000_it$iter & -done - diff --git a/egs/vystadial_cz/online_demo/path.sh b/egs/vystadial_cz/online_demo/path.sh index e582fdc47e8..f54d95d60a8 100755 --- a/egs/vystadial_cz/online_demo/path.sh +++ b/egs/vystadial_cz/online_demo/path.sh @@ -29,7 +29,7 @@ SILENCE=models/silence.csl kaldisrc=`pwd`/../../../src openfst=`pwd`/../../../tools/openfst/ -export PATH=$kaldisrc/bin:$kaldisrc/fgmmbin:$kaldisrc/gmmbin:$kaldisrc/nnetbin:$kaldisrc/sgmm2bin:$kaldisrc/featbin:$kaldisrc/fstbin:$kaldisrc/latbin:$kaldisrc/onlinebin:$kaldisrc/sgmmbin:$kaldisrc/onl-rec:$openfst/bin:"$PATH" +export PATH=$kaldisrc/bin:$kaldisrc/fgmmbin:$kaldisrc/gmmbin:$kaldisrc/nnetbin:$kaldisrc/sgmm2bin:$kaldisrc/featbin:$kaldisrc/fstbin:$kaldisrc/latbin:$kaldisrc/onlinebin:$kaldisrc/onl-rec:$openfst/bin:"$PATH" export LD_LIBRARY_PATH=$kaldisrc/onl-rec:$kaldisrc/pykaldi/kaldi:$openfst/lib:$openfst/lib/fst:$LD_LIBRARY_PATH export PYTHONPATH=$kaldisrc/pykaldi:$kaldisrc/pykaldi/pyfst:$PYTHONPATH diff --git a/egs/wsj/s5/local/run_sgmm.sh b/egs/wsj/s5/local/run_sgmm.sh deleted file mode 100755 index 27d8449896f..00000000000 --- a/egs/wsj/s5/local/run_sgmm.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash - -# This script is invoked from ../run.sh -# It contains some SGMM-related scripts that I am breaking out of the main run.sh for clarity. - -. cmd.sh - -# SGMM system on si84 data [sgmm5a]. Note: the system we aligned from used the si284 data for -# training, but this shouldn't have much effect. - -( - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_si84 data/lang exp/tri4b exp/tri4b_ali_si84 || exit 1; - - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/train_si84 data/lang exp/tri4b_ali_si84 exp/ubm5a || exit 1; - - steps/train_sgmm.sh --cmd "$train_cmd" \ - 3500 10000 data/train_si84 data/lang exp/tri4b_ali_si84 \ - exp/ubm5a/final.ubm exp/sgmm5a || exit 1; - - ( - utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5a exp/sgmm5a/graph_tgpr - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - exp/sgmm5a/graph_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 - ) & - - steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si84 \ - --use-graphs true --use-gselect true data/train_si84 data/lang exp/sgmm5a exp/sgmm5a_ali_si84 || exit 1; - steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \ - data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ - data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ - exp/sgmm5a_mmi_b0.1/decode_tgpr_dev93_it$iter & - done - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \ - --update-opts "--cov-min-value=0.9" data/train_si84 data/lang exp/sgmm5a_ali_si84 exp/sgmm5a_denlats_si84 exp/sgmm5a_mmi_b0.1_m0.9 - - for iter in 1 2 3 4; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_dev93 data/lang_test_tgpr data/test_dev93 exp/sgmm5a/decode_tgpr_dev93 \ - exp/sgmm5a_mmi_b0.1_m0.9/decode_tgpr_dev93_it$iter & - done - -) & - - -( -# The next commands are the same thing on all the si284 data. - -# SGMM system on the si284 data [sgmm5b] - steps/train_ubm.sh --cmd "$train_cmd" \ - 600 data/train_si284 data/lang exp/tri4b_ali_si284 exp/ubm5b || exit 1; - - steps/train_sgmm.sh --cmd "$train_cmd" \ - 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ - exp/ubm5b/final.ubm exp/sgmm5b || exit 1; - - ( - utils/mkgraph.sh data/lang_test_tgpr exp/sgmm5b exp/sgmm5b/graph_tgpr - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - exp/sgmm5b/graph_tgpr data/test_dev93 exp/sgmm5b/decode_tgpr_dev93 - steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_eval92 \ - exp/sgmm5b/graph_tgpr data/test_eval92 exp/sgmm5b/decode_tgpr_eval92 - - utils/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm5b exp/sgmm5b/graph_bd_tgpr || exit 1; - steps/decode_sgmm.sh --nj 10 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ - exp/sgmm5b/graph_bd_tgpr data/test_dev93 exp/sgmm5b/decode_bd_tgpr_dev93 - steps/decode_sgmm.sh --nj 8 --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ - exp/sgmm5b/graph_bd_tgpr data/test_eval92 exp/sgmm5b/decode_bd_tgpr_eval92 - ) & - - steps/align_sgmm.sh --nj 30 --cmd "$train_cmd" --transform-dir exp/tri4b_ali_si284 \ - --use-graphs true --use-gselect true data/train_si284 data/lang exp/sgmm5b exp/sgmm5b_ali_si284 - - steps/make_denlats_sgmm.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \ - data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 - - steps/train_mmi_sgmm.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \ - data/train_si284 data/lang exp/sgmm5b_ali_si284 exp/sgmm5b_denlats_si284 exp/sgmm5b_mmi_b0.1 - - for iter in 1 2 3 4; do - for test in dev93 eval92; do - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm5b/decode_tgpr_${test} \ - exp/sgmm5b_mmi_b0.1/decode_tgpr_${test}_it$iter & - - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm5b/decode_bd_tgpr_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter & - done - done -) & - - - -# Train quinphone SGMM system. - -steps/train_sgmm.sh --cmd "$train_cmd" \ - --context-opts "--context-width=5 --central-position=2" \ - 5500 25000 data/train_si284 data/lang exp/tri4b_ali_si284 \ - exp/ubm5b/final.ubm exp/sgmm5c || exit 1; - -# Decode from lattices in exp/sgmm5a/decode_tgpr_dev93. -steps/decode_sgmm_fromlats.sh --cmd "$decode_cmd" --transform-dir exp/tri4b/decode_tgpr_dev93 \ - data/test_dev93 data/lang_test_tgpr exp/sgmm5a/decode_tgpr_dev93 exp/sgmm5c/decode_tgpr_dev93 diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index ca13c1704f2..fb004117658 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -361,9 +361,6 @@ local/run_mmi_tri4b.sh #local/run_nnet2.sh -## Segregated some SGMM builds into a separate file. -#local/run_sgmm.sh - # You probably want to run the sgmm2 recipe as it's generally a bit better: local/run_sgmm2.sh diff --git a/egs/wsj/s5/steps/align_sgmm.sh b/egs/wsj/s5/steps/align_sgmm.sh deleted file mode 100755 index 782157f5ebe..00000000000 --- a/egs/wsj/s5/steps/align_sgmm.sh +++ /dev/null @@ -1,198 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -# Computes training alignments and (if needed) speaker-vectors, given an -# SGMM system. If the system is built on top of SAT, you should supply -# transforms with the --transform-dir option. - -# If you supply the --use-graphs option, it will use the training -# graphs from the source directory. - -# Begin configuration section. -stage=0 -nj=4 -cmd=run.pl -use_graphs=false # use graphs from srcdir -use_gselect=false # use gselect info from srcdir [regardless, we use - # Gaussian-selection info, we might have to compute it though.] -gselect=15 # Number of Gaussian-selection indices for SGMMs. -# Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -beam=10 -retry_beam=40 -transform_dir= # directory to find fMLLR transforms in. -# End configuration options. - -echo "$0 $@" # Print the command line for logging - -[ -f path.sh ] && . ./path.sh # source the path. -. parse_options.sh || exit 1; - -if [ $# != 4 ]; then - echo "usage: steps/align_sgmm.sh " - echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\" - echo " exp/sgmm4a exp/sgmm5a_ali" - echo "main options (for others, see top of script file)" - echo " --config # config containing options" - echo " --nj # number of parallel jobs" - echo " --use-graphs true # use graphs in src-dir" - echo " --transform-dir # directory to find fMLLR transforms" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - exit 1; -fi - -data=$1 -lang=$2 -srcdir=$3 -dir=$4 - -oov=`cat $lang/oov.int` || exit 1; -silphonelist=`cat $lang/phones/silence.csl` || exit 1; -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` -sdata=$data/split$nj - -mkdir -p $dir/log -cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. -cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. -echo $nj > $dir/num_jobs -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; - -utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -cp $srcdir/{tree,final.mdl} $dir || exit 1; -[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir -cp $srcdir/final.occs $dir; - -## Set up features. -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $dir - ;; - *) echo "Invalid feature type $feat_type" && exit 1; -esac -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; - [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ - && echo "$0: #jobs mismatch with transform-dir." && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then - echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," - echo " but you are not providing the --transform-dir option during alignment." -fi -## - -## Set up model and alignment model. -mdl=$srcdir/final.mdl -if [ -f $srcdir/final.alimdl ]; then - alimdl=$srcdir/final.alimdl -else - alimdl=$srcdir/final.mdl -fi -[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; - -## Work out where we're getting the graphs from. -if $use_graphs; then - [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ - echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; - [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; - graphdir=$srcdir - ln.pl $srcdir/fsts.*.gz $dir -else - graphdir=$dir - if [ $stage -le 0 ]; then - echo "$0: compiling training graphs" - tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; - $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ - "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; - fi -fi - -## Work out where we're getting the Gaussian-selection info from -if $use_gselect; then - [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ - echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1; - [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1; - graphdir=$srcdir - gselect_opt="--gselect=ark,s,cs:gunzip -c $srcdir/gselect.JOB.gz|" - ln.pl $srcdir/gselect.*.gz $dir -else - graphdir=$dir - if [ $stage -le 1 ]; then - echo "$0: computing Gaussian-selection info" - # Note: doesn't matter whether we use $alimdl or $mdl, they will - # have the same gselect info. - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect --full-gmm-nbest=$gselect $alimdl \ - "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; - fi - gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" -fi - - -if [ $alimdl == $mdl ]; then - # Speaker-independent decoding-- just one pass. Not normal. - T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1; - [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1; - - if [ $stage -le 2 ]; then - echo "$0: aligning data in $data using model $mdl (no speaker-vectors)" - $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ - sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \ - "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - echo "$0: done aligning data." - exit 0; -fi - -# Continue with system with speaker vectors. -if [ $stage -le 2 ]; then - echo "$0: aligning data in $data using model $alimdl" - $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ - sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \ - "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; -fi - -if [ $stage -le 3 ]; then - echo "$0: computing speaker vectors (1st pass)" - $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \ - ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ - weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ - sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \ - sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ - $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1; -fi - -if [ $stage -le 4 ]; then - echo "$0: computing speaker vectors (2nd pass)" - $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \ - ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ - weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ - --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1; - rm $dir/pre_vecs.* -fi - -if [ $stage -le 5 ]; then - echo "$0: doing final alignment." - $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ - sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \ - --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ - $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; -fi - -rm $dir/pre_ali.*.gz - -echo "$0: done aligning data." - -utils/summarize_warnings.pl $dir/log - -exit 0; diff --git a/egs/wsj/s5/steps/align_sgmm2.sh b/egs/wsj/s5/steps/align_sgmm2.sh index 8f68a2f7a08..d2f829f7e3e 100755 --- a/egs/wsj/s5/steps/align_sgmm2.sh +++ b/egs/wsj/s5/steps/align_sgmm2.sh @@ -30,8 +30,8 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# != 4 ]; then - echo "usage: steps/align_sgmm.sh " - echo "e.g.: steps/align_sgmm.sh --transform-dir exp/tri3b data/train data/lang \\" + echo "usage: steps/align_sgmm2.sh " + echo "e.g.: steps/align_sgmm2.sh --transform-dir exp/tri3b data/train data/lang \\" echo " exp/sgmm4a exp/sgmm5a_ali" echo "main options (for others, see top of script file)" echo " --config # config containing options" diff --git a/egs/wsj/s5/steps/decode_sgmm.sh b/egs/wsj/s5/steps/decode_sgmm.sh deleted file mode 100755 index 2faf2c10e0f..00000000000 --- a/egs/wsj/s5/steps/decode_sgmm.sh +++ /dev/null @@ -1,266 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. - -# This script does decoding with an SGMM system, with speaker vectors. -# If the SGMM system was -# built on top of fMLLR transforms from a conventional system, you should -# provide the --transform-dir option. - -# Begin configuration section. -stage=1 -alignment_model= -transform_dir= # dir to find fMLLR transforms. -nj=4 # number of decoding jobs. -acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. -cmd=run.pl -beam=15.0 -gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: - # the first_pass_gselect variable is used for the 1st pass of - # decoding and can be tighter. -first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in - # the 1st pass of decoding (lattice generation). -max_active=7000 - -#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming -# in the other scripts -lattice_beam=6.0 # Beam we use in lattice generation. -vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for - # speaker-vector computation. Can be quite tight (actually we could - # probably just do best-path. -use_fmllr=false -fmllr_iters=10 -fmllr_min_count=1000 -skip_scoring=false -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: steps/decode_sgmm.sh [options] " - echo " e.g.: steps/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" - echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr" - echo "main options (for others, see top of script file)" - echo " --transform-dir # directory of previous decoding" - echo " # where we can find transforms for SAT systems." - echo " --alignment-model # Model for the first-pass decoding." - echo " --config # config containing options" - echo " --nj # number of parallel jobs" - echo " --cmd # Command to run in parallel with" - echo " --beam # Decoding beam; default 13.0" - exit 1; -fi - -graphdir=$1 -data=$2 -dir=$3 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. - -for f in $graphdir/HCLG.fst $data/feats.scp $srcdir/final.mdl; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - -sdata=$data/split$nj; -silphonelist=`cat $graphdir/phones/silence.csl` || exit 1 -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` -gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" -gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" - -mkdir -p $dir/log -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -echo $nj > $dir/num_jobs - - -## Set up features. -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; - [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ - && echo "$0: #jobs mismatch with transform-dir." && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then - echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi -## - -## Calculate FMLLR pre-transforms if needed. We are doing this here since this -## step is requried by models both with and without speaker vectors -if $use_fmllr; then - if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then - echo "$0: computing pre-transform for fMLLR computation." - sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; - fi -fi - -## Save Gaussian-selection info to disk. -# Note: we can use final.mdl regardless of whether there is an alignment model-- -# they use the same UBM. -if [ $stage -le 1 ]; then - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ - "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; -fi - -## Work out name of alignment model. ## -if [ -z "$alignment_model" ]; then - if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; - else alignment_model=$srcdir/final.mdl; fi -fi -[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; - -# Generate state-level lattice which we can rescore. This is done with the -# alignment model and no speaker-vectors. -if [ $stage -le 2 ]; then - if [ -f "$graphdir/num_pdfs" ]; then - [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $alignment_model | grep pdfs | awk '{print $NF}'` ] || \ - { echo "Mismatch in number of pdfs with $alignment_model"; exit 1; } - fi - $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ - sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ - --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \ - $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; -fi - -## Check if the model has speaker vectors -spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` - -if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: - -# Estimate speaker vectors (1st pass). Prune before determinizing -# because determinization can take a while on un-pruned lattices. -# Note: the sgmm-post-to-gpost stage is necessary because we have -# a separate alignment-model and final model, otherwise we'd skip it -# and use sgmm-est-spkvecs. - if [ $stage -le 3 ]; then - $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ - sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ - sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ - $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; - fi - -# Estimate speaker vectors (2nd pass). Since we already have spk vectors, -# at this point we need to rescore the lattice to get the correct posteriors. - if [ $stage -le 4 ]; then - $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --speedup=true --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ - $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; - fi - rm $dir/pre_vecs.* - - if $use_fmllr; then - # Estimate fMLLR transforms (note: these may be on top of any - # fMLLR transforms estimated with the baseline GMM system. - if [ $stage -le 5 ]; then # compute fMLLR transforms. - echo "$0: computing fMLLR transforms." - $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --speedup=true --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ - --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ - $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; - fi - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" - fi - -# Now rescore the state-level lattices with the adapted features and the -# corresponding model. Prune and determinize the lattices to limit -# their size. - if [ $stage -le 6 ]; then - $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ - $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - fi - rm $dir/pre_lat.*.gz - -else ### For models without speaker vectors: - - if $use_fmllr; then - # Estimate fMLLR transforms (note: these may be on top of any - # fMLLR transforms estimated with the baseline GMM system. - if [ $stage -le 5 ]; then # compute fMLLR transforms. - echo "$0: computing fMLLR transforms." - $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --speedup=true --utt2spk=ark:$sdata/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ - --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ - $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; - fi - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" - fi - -# Now rescore the state-level lattices with the adapted features and the -# corresponding model. Prune and determinize the lattices to limit -# their size. - if [ $stage -le 6 ] && $use_fmllr; then - $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \ - $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - rm $dir/pre_lat.*.gz - else # If no adaptation needed, determinize the lattice. - $cmd JOB=1:$nj $dir/log/determinize.JOB.log \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam \ - "ark:gunzip -c $dir/pre_lat.JOB.gz|" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - rm $dir/pre_lat.*.gz - fi - -fi - -if [ $stage -le 7 ]; then - steps/diagnostic/analyze_lats.sh --cmd "$cmd" $graphdir $dir -fi - -if [ $stage -le 8 ]; then - if ! $skip_scoring ; then - [ ! -x local/score.sh ] && \ - echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - echo "score best paths" - local/score.sh --cmd "$cmd" $data $graphdir $dir || - { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } - #echo "score confidence and timing with sclite" - #local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir - fi -fi -echo "Decoding done." -exit 0; diff --git a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh index 7a3a4f6bd48..c84e597192e 100755 --- a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh +++ b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh @@ -43,7 +43,7 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 4 ]; then - echo "Usage: steps/decode_sgmm_fromlats.sh [options] " + echo "Usage: steps/decode_sgmm2_fromlats.sh [options] " echo "" echo "main options (for others, see top of script file)" echo " --transform-dir # directory of previous decoding" diff --git a/egs/wsj/s5/steps/decode_sgmm2_rescore.sh b/egs/wsj/s5/steps/decode_sgmm2_rescore.sh index a37a47350d7..c258ad00067 100755 --- a/egs/wsj/s5/steps/decode_sgmm2_rescore.sh +++ b/egs/wsj/s5/steps/decode_sgmm2_rescore.sh @@ -26,8 +26,8 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 4 ]; then - echo "Usage: steps/decode_sgmm_rescore.sh [options] " - echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" + echo "Usage: steps/decode_sgmm2_rescore.sh [options] " + echo " e.g.: steps/decode_sgmm2_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr" echo "main options (for others, see top of script file)" echo " --transform-dir # directory of previous decoding" diff --git a/egs/wsj/s5/steps/decode_sgmm_fromlats.sh b/egs/wsj/s5/steps/decode_sgmm_fromlats.sh deleted file mode 100755 index bb1dacd113f..00000000000 --- a/egs/wsj/s5/steps/decode_sgmm_fromlats.sh +++ /dev/null @@ -1,277 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. - -# This script does decoding with an SGMM system, with speaker vectors. -# If the SGMM system was -# built on top of fMLLR transforms from a conventional system, you should -# provide the --transform-dir option. -# This script does not use a decoding graph, but instead you provide -# a previous decoding directory with lattices in it. This script will only -# make use of the word sequences in the lattices; it limits the decoding -# to those sequences. You should also provide a "lang" directory from -# which this script will use the G.fst and L.fst. - -# Begin configuration section. -stage=1 -alignment_model= -transform_dir= # dir to find fMLLR transforms. -acwt=0.08333 # Just a default value, used for adaptation and beam-pruning.. -batch_size=75 # Limits memory blowup in compile-train-graphs-fsts -cmd=run.pl -beam=20.0 -gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: - # the first_pass_gselect variable is used for the 1st pass of - # decoding and can be tighter. -first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in - # the 1st pass of decoding (lattice generation). -max_active=7000 - -#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming -# in the other scripts -lattice_beam=8.0 # Beam we use in lattice generation. -vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for - # speaker-vector computation. Can be quite tight (actually we could - # probably just do best-path. -use_fmllr=false -fmllr_iters=10 -fmllr_min_count=1000 -scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" -skip_scoring=false -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# -ne 4 ]; then - echo "Usage: steps/decode_sgmm_fromlats.sh [options] " - echo "" - echo "main options (for others, see top of script file)" - echo " --transform-dir # directory of previous decoding" - echo " # where we can find transforms for SAT systems." - echo " --alignment-model # Model for the first-pass decoding." - echo " --config # config containing options" - echo " --cmd # Command to run in parallel with" - echo " --beam # Decoding beam; default 13.0" - exit 1; -fi - -data=$1 -lang=$2 -olddir=$3 -dir=$4 -srcdir=`dirname $dir` - -for f in $data/feats.scp $lang/G.fst $lang/L_disambig.fst $lang/phones/disambig.int \ - $srcdir/final.mdl $srcdir/tree $olddir/lat.1.gz; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - -nj=`cat $olddir/num_jobs` || exit 1; -sdata=$data/split$nj; -silphonelist=`cat $lang/phones/silence.csl` || exit 1 -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` -gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" -gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" - -mkdir -p $dir/log -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -echo $nj > $dir/num_jobs - - -## Set up features - -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" -if [ -z "$transform_dir" ] && [ -f $olddir/trans.1 ]; then - transform_dir=$olddir -fi - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; - [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ - && echo "$0: #jobs mismatch with transform-dir." && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then - echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi - -## Calculate FMLLR pre-transforms if needed. We are doing this here since this -## step is requried by models both with and without speaker vectors -if $use_fmllr; then - if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then - echo "$0: computing pre-transform for fMLLR computation." - sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; - fi -fi - -## Save Gaussian-selection info to disk. -# Note: we can use final.mdl regardless of whether there is an alignment model-- -# they use the same UBM. -if [ $stage -le 1 ]; then - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ - "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; -fi - -## Work out name of alignment model. ## -if [ -z "$alignment_model" ]; then - if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; - else alignment_model=$srcdir/final.mdl; fi -fi -[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; - -# Generate state-level lattice which we can rescore. This is done with the -# alignment model and no speaker-vectors. -if [ $stage -le 2 ]; then - $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ - lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ - fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ - fstdeterminizestar ark:- ark:- \| \ - compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ - --batch-size=$batch_size $scale_opts \ - $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ - sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ - --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ - --word-symbol-table=$lang/words.txt "$gselect_opt_1stpass" $alignment_model \ - "ark:-" "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; -fi - -## Check if the model has speaker vectors -spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` - -if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: - -# Estimate speaker vectors (1st pass). Prune before determinizing -# because determinization can take a while on un-pruned lattices. -# Note: the sgmm-post-to-gpost stage is necessary because we have -# a separate alignment-model and final model, otherwise we'd skip it -# and use sgmm-est-spkvecs. - if [ $stage -le 3 ]; then - $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ - sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ - sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata/JOB/spk2utt \ - $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; - fi - -# Estimate speaker vectors (2nd pass). Since we already have spk vectors, -# at this point we need to rescore the lattice to get the correct posteriors. - if [ $stage -le 4 ]; then - $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ - $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; - fi - rm $dir/pre_vecs.* - - if $use_fmllr; then - # Estimate fMLLR transforms (note: these may be on top of any - # fMLLR transforms estimated with the baseline GMM system. - if [ $stage -le 5 ]; then # compute fMLLR transforms. - echo "$0: computing fMLLR transforms." - $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ - --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ - $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; - fi - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" - fi - -# Now rescore the state-level lattices with the adapted features and the -# corresponding model. Prune and determinize the lattices to limit -# their size. - if [ $stage -le 6 ]; then - $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ - $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - fi - rm $dir/pre_lat.*.gz - -else ### For models without speaker vectors: - - if $use_fmllr; then - # Estimate fMLLR transforms (note: these may be on top of any - # fMLLR transforms estimated with the baseline GMM system. - if [ $stage -le 5 ]; then # compute fMLLR transforms. - echo "$0: computing fMLLR transforms." - $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --utt2spk=ark:$sdata/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-fmllr --spk2utt=ark:$sdata/JOB/spk2utt "$gselect_opt" \ - --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ - $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; - fi - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" - fi - -# Now rescore the state-level lattices with the adapted features and the -# corresponding model. Prune and determinize the lattices to limit -# their size. - if [ $stage -le 6 ] && $use_fmllr; then - $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata/JOB/utt2spk \ - $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - rm $dir/pre_lat.*.gz - else # Already done with decoding if no adaptation needed. - for n in `seq 1 $nj`; do - mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz - done - fi - -fi - -# The output of this script is the files "lat.*.gz"-- we'll rescore this at -# different acoustic scales to get the final output. - - -if [ $stage -le 7 ]; then - if ! $skip_scoring ; then - [ ! -x local/score.sh ] && \ - echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - echo "score best paths" - local/score.sh --cmd "$cmd" $data $lang $dir || - { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } - # echo "score confidence and timing with sclite" - # local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $lang $dir - fi -fi -echo "Decoding done." -exit 0; diff --git a/egs/wsj/s5/steps/decode_sgmm_rescore.sh b/egs/wsj/s5/steps/decode_sgmm_rescore.sh deleted file mode 100755 index 398c8931e7f..00000000000 --- a/egs/wsj/s5/steps/decode_sgmm_rescore.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. - -# This script does decoding with an SGMM system, by rescoring lattices -# generated from a previous SGMM system. The directory with the lattices -# is assumed to contain speaker vectors, if used. Basically it rescores -# the lattices one final time, using the same setup as the final decoding -# pass of the source dir. The assumption is that the model may have -# been discriminatively trained. - -# If the system was built on top of fMLLR transforms from a conventional system, -# you should provide the --transform-dir option. - -# Begin configuration section. -transform_dir= # dir to find fMLLR transforms. -cmd=run.pl -iter=final -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# -ne 4 ]; then - echo "Usage: steps/decode_sgmm_rescore.sh [options] " - echo " e.g.: steps/decode_sgmm_rescore.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" - echo " exp/sgmm3a/graph_tgpr data/test_dev93 exp/sgmm3a/decode_dev93_tgpr exp/sgmm3a_mmi/decode_dev93_tgpr" - echo "main options (for others, see top of script file)" - echo " --transform-dir # directory of previous decoding" - echo " # where we can find transforms for SAT systems." - echo " --config # config containing options" - echo " --cmd # Command to run in parallel with" - echo " --iter # iteration of model to use (default: final)" - exit 1; -fi - -graphdir=$1 -data=$2 -olddir=$3 -dir=$4 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. - -for f in $graphdir/words.txt $data/feats.scp $olddir/lat.1.gz $olddir/gselect.1.gz \ - $srcdir/$iter.mdl; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - -nj=`cat $olddir/num_jobs` || exit 1; -sdata=$data/split$nj; -gselect_opt="--gselect=ark,s,cs:gunzip -c $olddir/gselect.JOB.gz|" -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` - -mkdir -p $dir/log -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -echo $nj > $dir/num_jobs - -if [ -f $olddir/vecs.1 ]; then - echo "$0: using speaker vectors from $olddir" - spkvecs_opt="--spk-vecs=ark:$olddir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" -else - echo "$0: no speaker vectors found." - spkvecs_opt= -fi - - -## Set up features. -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; - [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ - && echo "$0: #jobs mismatch with transform-dir." && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then - echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi - -if [ -f $olddir/trans.1 ]; then - echo "$0: using (in addition to any previous transforms) transforms from $olddir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$olddir/trans.JOB ark:- ark:- |" -fi -## - -# Rescore the state-level lattices with the model provided. Just -# one command in this script. -echo "$0: rescoring lattices with SGMM model in $srcdir/$iter.mdl" -$cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt \ - $srcdir/$iter.mdl "ark:gunzip -c $olddir/lat.JOB.gz|" "$feats" \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - -[ ! -x local/score.sh ] && \ - echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; -local/score.sh --cmd "$cmd" $data $graphdir $dir - -exit 0; diff --git a/egs/wsj/s5/steps/make_denlats_sgmm.sh b/egs/wsj/s5/steps/make_denlats_sgmm.sh deleted file mode 100755 index fbd59378c9c..00000000000 --- a/egs/wsj/s5/steps/make_denlats_sgmm.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# 2014 Guoguo Chen - -# Create denominator lattices for MMI/MPE training, with SGMM models. If the -# features have fMLLR transforms you have to supply the --transform-dir option. -# It gets any speaker vectors from the "alignment dir" ($alidir). Note: this is -# possibly a slight mismatch because the speaker vectors come from supervised -# adaptation. - -# Begin configuration section. -nj=4 -cmd=run.pl -sub_split=1 -beam=13.0 -lattice_beam=7.0 -acwt=0.1 -max_active=5000 -transform_dir= -max_mem=20000000 # This will stop the processes getting too large. -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# != 4 ]; then - echo "Usage: steps/make_denlats_sgmm.sh [options] " - echo " e.g.: steps/make_denlats_sgmm.sh data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats" - echo "Works for (delta|lda) features, and (with --transform-dir option) such features" - echo " plus transforms." - echo "" - echo "Main options (for others, see top of script file)" - echo " --config # config containing options" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --sub-split # e.g. 40; use this for " - echo " # large databases so your jobs will be smaller and" - echo " # will (individually) finish reasonably soon." - echo " --transform-dir # directory to find fMLLR transforms." - exit 1; -fi - -data=$1 -lang=$2 -alidir=$3 # could also be $srcdir, but only if no vectors supplied. -dir=$4 - -sdata=$data/split$nj -splice_opts=`cat $alidir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` -mkdir -p $dir/log -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -echo $nj > $dir/num_jobs - -utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; - -oov=`cat $lang/oov.int` || exit 1; - -mkdir -p $dir - -cp -RH $lang $dir/ - -# Compute grammar FST which corresponds to unigram decoding graph. -new_lang="$dir/"$(basename "$lang") -echo "$0: Making unigram grammar FST in $new_lang" -cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ - awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ - utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ - || exit 1; - -# mkgraph.sh expects a whole directory "lang", so put everything in one directory... -# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and -# final.mdl from $alidir; the output HCLG.fst goes in $dir/graph. - -echo "$0: Compiling decoding graph in $dir/dengraph" -if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then - echo "$0: Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." -else - utils/mkgraph.sh $new_lang $alidir $dir/dengraph || exit 1; -fi - -if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" - cp $alidir/final.mat $dir - ;; - *) echo "$0: Invalid feature type $feat_type" && exit 1; -esac - -if [ ! -z "$transform_dir" ]; then # add transforms to features... - echo "$0: using fMLLR transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." - [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ - && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; - [ -f $alidir/final.mat ] && ! cmp $transform_dir/final.mat $alidir/final.mat && \ - echo "$0: LDA transforms differ between $alidir and $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" -else - echo "$0: Assuming you don't have a SAT system, since no --transform-dir option supplied " -fi - -if [ -f $alidir/gselect.1.gz ]; then - gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|" -else - echo "$0: no such file $alidir/gselect.1.gz" && exit 1; -fi - -if [ -f $alidir/vecs.1 ]; then - spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" -else - if [ -f $alidir/final.alimdl ]; then - echo "$0: You seem to have an SGMM system with speaker vectors," - echo "yet we can't find speaker vectors. Perhaps you supplied" - echo "the model director instead of the alignment directory?" - exit 1; - fi -fi - -# if this job is interrupted by the user, we want any background jobs to be -# killed too. -cleanup() { - local pids=$(jobs -pr) - [ -n "$pids" ] && kill $pids -} -trap "cleanup" INT QUIT TERM EXIT - -if [ $sub_split -eq 1 ]; then - $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ - sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \ - --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ - --max-mem=$max_mem --max-active=$max_active \ - --word-symbol-table=$lang/words.txt $alidir/final.mdl \ - $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; -else - # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim - # to have at most two jobs running at each time. The idea is that if we have - # stragglers from one job, we can be processing another one at the same time. - rm $dir/.error 2>/dev/null - - prev_pid= - for n in `seq $[nj+1]`; do - if [ $n -gt $nj ]; then - this_pid= - elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $alidir/final.mdl ]; then - echo "$0: Not processing subset $n as already done (delete $dir/.done.$n if not)"; - this_pid= - else - sdata2=$data/split$nj/$n/split${sub_split}utt; - split_data.sh --per-utt $sdata/$n $sub_split || exit 1; - mkdir -p $dir/log/$n - mkdir -p $dir/part - feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g` - spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"` - gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"` - $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ - sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \ - --beam=$beam --lattice-beam=$lattice_beam \ - --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \ - --word-symbol-table=$lang/words.txt $alidir/final.mdl \ - $dir/dengraph/HCLG.fst "$feats_subset" \ - "ark:|gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error & - this_pid=$! - fi - if [ ! -z "$prev_pid" ]; then # Wait for the previous job to merge lattices. - wait $prev_pid - [ -f $dir/.error ] && \ - echo "$0: error generating denominator lattices" && exit 1; - rm $dir/.merge_error 2>/dev/null - echo "$0: Merging archives for data subset $prev_n" - for k in `seq $sub_split`; do - gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; - done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; - [ -f $dir/.merge_error ] && \ - echo "$0: Merging lattices for subset $prev_n failed" && exit 1; - rm $dir/lat.$prev_n.*.gz - touch $dir/.done.$prev_n - fi - prev_n=$n - prev_pid=$this_pid - done -fi - - -echo "$0: done generating denominator lattices with SGMMs." diff --git a/egs/wsj/s5/steps/tandem/align_sgmm.sh b/egs/wsj/s5/steps/tandem/align_sgmm.sh deleted file mode 100755 index bb3ba79bd9f..00000000000 --- a/egs/wsj/s5/steps/tandem/align_sgmm.sh +++ /dev/null @@ -1,236 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Korbinian Riedhammer -# Apache 2.0 - -# Computes training alignments and (if needed) speaker-vectors, given an -# SGMM system. If the system is built on top of SAT, you should supply -# transforms with the --transform-dir option. - -# If you supply the --use-graphs option, it will use the training -# graphs from the source directory. - -# Begin configuration section. -stage=0 -nj=4 -cmd=run.pl -use_graphs=false # use graphs from srcdir -use_gselect=false # use gselect info from srcdir [regardless, we use - # Gaussian-selection info, we might have to compute it though.] -gselect=15 # Number of Gaussian-selection indices for SGMMs. -# Begin configuration. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -beam=10 -retry_beam=40 -transform_dir= # directory to find fMLLR transforms in. -# End configuration options. - -echo "$0 $@" # Print the command line for logging - -[ -f path.sh ] && . ./path.sh # source the path. -. parse_options.sh || exit 1; - -if [ $# != 5 ]; then - echo "usage: steps/tandem/align_sgmm.sh " - echo "e.g.: steps/tandem/align_sgmm.sh --transform-dir exp/tri3b data1/train data1/lang \\" - echo " exp/sgmm4a exp/sgmm5a_ali" - echo "main options (for others, see top of script file)" - echo " --config # config containing options" - echo " --nj # number of parallel jobs" - echo " --use-graphs true # use graphs in src-dir" - echo " --transform-dir # directory to find fMLLR transforms" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - exit 1; -fi - -data1=$1 -data2=$2 -lang=$3 -srcdir=$4 -dir=$5 - -oov=`cat $lang/oov.int` || exit 1; -silphonelist=`cat $lang/phones/silence.csl` || exit 1; - -mkdir -p $dir/log -echo $nj > $dir/num_jobs - -utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -## Set up features. - -sdata1=$data1/split$nj -sdata2=$data2/split$nj -[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1; -[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1; - -cp $srcdir/{tree,final.mdl} $dir || exit 1; -[ -f $srcdir/final.alimdl ] && cp $srcdir/final.alimdl $dir -cp $srcdir/final.occs $dir; - -## Set up features. -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. -normft2=`cat $srcdir/normft2 2>/dev/null` - -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi - -case $feat_type in - delta) - echo "$0: feature type is $feat_type" - ;; - lda) - echo "$0: feature type is $feat_type" - cp $srcdir/{lda,final}.mat $dir/ || exit 1; - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac - -# set up feature stream 1; this are usually spectral features, so we will add -# deltas or splice them -feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |" - -if [ "$feat_type" == "delta" ]; then - feats1="$feats1 add-deltas ark:- ark:- |" -elif [ "$feat_type" == "lda" ]; then - feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |" -fi - -# set up feature stream 2; this are usually bottleneck or posterior features, -# which may be normalized if desired -feats2="scp:$sdata2/JOB/feats.scp" - -if [ "$normft2" == "true" ]; then - feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |" -fi - -# assemble tandem features -feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |" - -# add transformation, if applicable -if [ "$feat_type" == "lda" ]; then - feats="$feats transform-feats $dir/final.mat ark:- ark:- |" -fi - -# splicing/normalization options -cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null - -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; - [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ - && echo "$0: #jobs mismatch with transform-dir." && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then - echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," - echo " but you are not providing the --transform-dir option during alignment." -fi -## - -## Set up model and alignment model. -mdl=$srcdir/final.mdl -if [ -f $srcdir/final.alimdl ]; then - alimdl=$srcdir/final.alimdl -else - alimdl=$srcdir/final.mdl -fi -[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; - -## Work out where we're getting the graphs from. -if $use_graphs; then - [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ - echo "$0: you specified --use-graphs true, but #jobs mismatch." && exit 1; - [ ! -f $srcdir/fsts.1.gz ] && echo "No graphs in $srcdir" && exit 1; - graphdir=$srcdir - ln.pl $srcdir/fsts.*.gz $dir -else - graphdir=$dir - if [ $stage -le 0 ]; then - echo "$0: compiling training graphs" - tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata1/JOB/text|"; - $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ - "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; - fi -fi - -## Work out where we're getting the Gaussian-selection info from -if $use_gselect; then - [ "$nj" != "`cat $srcdir/num_jobs`" ] && \ - echo "$0: you specified --use-gselect true, but #jobs mismatch." && exit 1; - [ ! -f $srcdir/gselect.1.gz ] && echo "No gselect info in $srcdir" && exit 1; - graphdir=$srcdir - gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|" - ln.pl $srcdir/gselect.*.gz $dir -else - graphdir=$dir - if [ $stage -le 1 ]; then - echo "$0: computing Gaussian-selection info" - # Note: doesn't matter whether we use $alimdl or $mdl, they will - # have the same gselect info. - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect --full-gmm-nbest=$gselect $alimdl \ - "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; - fi - gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|" -fi - - -if [ $alimdl == $mdl ]; then - # Speaker-independent decoding-- just one pass. Not normal. - T=`sgmm-info $mdl | grep 'speaker vector space' | awk '{print $NF}'` || exit 1; - [ "$T" -ne 0 ] && echo "No alignment model, yet speaker vector space nonempty" && exit 1; - - if [ $stage -le 2 ]; then - echo "$0: aligning data in $data using model $mdl (no speaker-vectors)" - $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ - sgmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $alimdl \ - "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - echo "$0: done aligning data." - exit 0; -fi - -# Continue with system with speaker vectors. -if [ $stage -le 2 ]; then - echo "$0: aligning data in $data using model $alimdl" - $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ - sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam $alimdl \ - "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; -fi - -if [ $stage -le 3 ]; then - echo "$0: computing speaker vectors (1st pass)" - $cmd JOB=1:$nj $dir/log/spk_vecs1.JOB.log \ - ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ - weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ - sgmm-post-to-gpost "$gselect_opt" $alimdl "$feats" ark:- ark:- \| \ - sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata1/JOB/spk2utt \ - $mdl "$feats" ark,s,cs:- ark:$dir/pre_vecs.JOB || exit 1; -fi - -if [ $stage -le 4 ]; then - echo "$0: computing speaker vectors (2nd pass)" - $cmd JOB=1:$nj $dir/log/spk_vecs2.JOB.log \ - ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ - weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" \ - --spk-vecs=ark:$dir/pre_vecs.JOB $mdl "$feats" ark,s,cs:- ark:$dir/vecs.JOB || exit 1; - rm $dir/pre_vecs.* -fi - -if [ $stage -le 5 ]; then - echo "$0: doing final alignment." - $cmd JOB=1:$nj $dir/log/align_pass2.JOB.log \ - sgmm-align-compiled $scale_opts "$gselect_opt" --beam=$beam --retry-beam=$retry_beam \ - --utt2spk=ark:$sdata1/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ - $mdl "ark:gunzip -c $graphdir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; -fi - -rm $dir/pre_ali.*.gz - -echo "$0: done aligning data." - -utils/summarize_warnings.pl $dir/log - -exit 0; diff --git a/egs/wsj/s5/steps/tandem/decode_sgmm.sh b/egs/wsj/s5/steps/tandem/decode_sgmm.sh deleted file mode 100755 index c980bf13f4f..00000000000 --- a/egs/wsj/s5/steps/tandem/decode_sgmm.sh +++ /dev/null @@ -1,303 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# Korbinian Riedhammer - -# This script does decoding with an SGMM system, with speaker vectors. -# If the SGMM system was -# built on top of fMLLR transforms from a conventional system, you should -# provide the --transform-dir option. - -# Begin configuration section. -stage=1 -alignment_model= -transform_dir= # dir to find fMLLR transforms. -nj=4 # number of decoding jobs. -acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. -cmd=run.pl -beam=15.0 -gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note: - # the first_pass_gselect variable is used for the 1st pass of - # decoding and can be tighter. -first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in - # the 1st pass of decoding (lattice generation). -max_active=7000 - -#WARNING: This option is renamed lattice_beam (it was renamed to follow the naming -# in the other scripts -lattice_beam=8.0 # Beam we use in lattice generation. -vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for - # speaker-vector computation. Can be quite tight (actually we could - # probably just do best-path. -use_fmllr=false -fmllr_iters=10 -fmllr_min_count=1000 -skip_scoring=false -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# -ne 4 ]; then - echo "Usage: steps/tandem/decode_sgmm.sh [options] " - echo " e.g.: steps/tandem/decode_sgmm.sh --transform-dir exp/tri3b/decode_dev93_tgpr \\" - echo " exp/sgmm3a/graph_tgpr {mfcc,bottleneck}/data/test_dev93 exp/sgmm3a/decode_dev93_tgpr" - echo "main options (for others, see top of script file)" - echo " --transform-dir # directory of previous decoding" - echo " # where we can find transforms for SAT systems." - echo " --alignment-model # Model for the first-pass decoding." - echo " --config # config containing options" - echo " --nj # number of parallel jobs" - echo " --cmd # Command to run in parallel with" - echo " --beam # Decoding beam; default 13.0" - exit 1; -fi - -graphdir=$1 -data1=$2 -data2=$3 -dir=$4 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. - -for f in $graphdir/HCLG.fst $data1/feats.scp $data2/feats.scp $srcdir/final.mdl; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - -silphonelist=`cat $graphdir/phones/silence.csl` || exit 1 -gselect_opt="--gselect=ark:gunzip -c $dir/gselect.JOB.gz|" -gselect_opt_1stpass="$gselect_opt copy-gselect --n=$first_pass_gselect ark:- ark:- |" - -mkdir -p $dir/log -echo $nj > $dir/num_jobs - -sdata1=$data1/split$nj; -sdata2=$data2/split$nj; -[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1; -[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1; - - -## Set up features. - -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. -normft2=`cat $srcdir/normft2 2>/dev/null` - -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi - -case $feat_type in - delta) - echo "$0: feature type is $feat_type" - ;; - lda) - echo "$0: feature type is $feat_type" - cp $srcdir/{lda,final}.mat $dir/ - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac - -# set up feature stream 1; this are usually spectral features, so we will add -# deltas or splice them -feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |" - -if [ "$feat_type" == "delta" ]; then - feats1="$feats1 add-deltas ark:- ark:- |" -elif [ "$feat_type" == "lda" ]; then - feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |" -fi - -# set up feature stream 2; this are usually bottleneck or posterior features, -# which may be normalized if desired -feats2="scp:$sdata2/JOB/feats.scp" - -if [ "$normft2" == "true" ]; then - echo "Using cmvn for feats2" - feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |" -fi - -# assemble tandem features -feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |" - -# add transformation, if applicable -if [ "$feat_type" == "lda" ]; then - feats="$feats transform-feats $dir/final.mat ark:- ark:- |" -fi - -# splicing/normalization options -cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null - -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" && exit 1; - [ "$nj" -ne "`cat $transform_dir/num_jobs`" ] \ - && echo "$0: #jobs mismatch with transform-dir." && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -elif grep 'transform-feats --utt2spk' $srcdir/log/acc.0.1.log 2>/dev/null; then - echo "$0: **WARNING**: you seem to be using an SGMM system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi -## - - -## Calculate FMLLR pre-transforms if needed. We are doing this here since this -## step is requried by models both with and without speaker vectors -if $use_fmllr; then - if [ ! -f $srcdir/final.fmllr_mdl ] || [ $srcdir/final.fmllr_mdl -ot $srcdir/final.mdl ]; then - echo "$0: computing pre-transform for fMLLR computation." - sgmm-comp-prexform $srcdir/final.mdl $srcdir/final.occs $srcdir/final.fmllr_mdl || exit 1; - fi -fi - -## Save Gaussian-selection info to disk. -# Note: we can use final.mdl regardless of whether there is an alignment model-- -# they use the same UBM. -if [ $stage -le 1 ]; then - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect --full-gmm-nbest=$gselect $srcdir/final.mdl \ - "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1; -fi - -## Work out name of alignment model. ## -if [ -z "$alignment_model" ]; then - if [ -f "$srcdir/final.alimdl" ]; then alignment_model=$srcdir/final.alimdl; - else alignment_model=$srcdir/final.mdl; fi -fi -[ ! -f "$alignment_model" ] && echo "$0: no alignment model $alignment_model " && exit 1; - -# Generate state-level lattice which we can rescore. This is done with the -# alignment model and no speaker-vectors. -if [ $stage -le 2 ]; then - $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ - sgmm-latgen-faster --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ - --acoustic-scale=$acwt --determinize-lattice=false --allow-partial=true \ - --word-symbol-table=$graphdir/words.txt "$gselect_opt_1stpass" $alignment_model \ - $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/pre_lat.JOB.gz" || exit 1; -fi - -## Check if the model has speaker vectors -spkdim=`sgmm-info $srcdir/final.mdl | grep 'speaker vector' | awk '{print $NF}'` - -if [ $spkdim -gt 0 ]; then ### For models with speaker vectors: - -# Estimate speaker vectors (1st pass). Prune before determinizing -# because determinization can take a while on un-pruned lattices. -# Note: the sgmm-post-to-gpost stage is necessary because we have -# a separate alignment-model and final model, otherwise we'd skip it -# and use sgmm-est-spkvecs. - if [ $stage -le 3 ]; then - $cmd JOB=1:$nj $dir/log/vecs_pass1.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $alignment_model ark:- ark:- \| \ - sgmm-post-to-gpost "$gselect_opt" $alignment_model "$feats" ark:- ark:- \| \ - sgmm-est-spkvecs-gpost --spk2utt=ark:$sdata1/JOB/spk2utt \ - $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/pre_vecs.JOB" || exit 1; - fi - -# Estimate speaker vectors (2nd pass). Since we already have spk vectors, -# at this point we need to rescore the lattice to get the correct posteriors. - if [ $stage -le 4 ]; then - $cmd JOB=1:$nj $dir/log/vecs_pass2.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --spk-vecs=ark:$dir/pre_vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/pre_vecs.JOB \ - $srcdir/final.mdl "$feats" ark,s,cs:- "ark:$dir/vecs.JOB" || exit 1; - fi - rm $dir/pre_vecs.* - - if $use_fmllr; then - # Estimate fMLLR transforms (note: these may be on top of any - # fMLLR transforms estimated with the baseline GMM system. - if [ $stage -le 5 ]; then # compute fMLLR transforms. - echo "$0: computing fMLLR transforms." - $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --spk-vecs=ark:$dir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-fmllr --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" --spk-vecs=ark:$dir/vecs.JOB \ - --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ - $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; - fi - feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" - fi - -# Now rescore the state-level lattices with the adapted features and the -# corresponding model. Prune and determinize the lattices to limit -# their size. - if [ $stage -le 6 ]; then - $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata1/JOB/utt2spk --spk-vecs=ark:$dir/vecs.JOB \ - $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - fi - rm $dir/pre_lat.*.gz - -else ### For models without speaker vectors: - - if $use_fmllr; then - # Estimate fMLLR transforms (note: these may be on top of any - # fMLLR transforms estimated with the baseline GMM system. - if [ $stage -le 5 ]; then # compute fMLLR transforms. - echo "$0: computing fMLLR transforms." - $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ - gunzip -c $dir/pre_lat.JOB.gz \| \ - sgmm-rescore-lattice --utt2spk=ark:$sdata1/JOB/utt2spk \ - "$gselect_opt" $srcdir/final.mdl ark:- "$feats" ark:- \| \ - lattice-prune --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$vecs_beam ark:- ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - weight-silence-post 0.0 $silphonelist $srcdir/final.mdl ark:- ark:- \| \ - sgmm-est-fmllr --spk2utt=ark:$sdata1/JOB/spk2utt "$gselect_opt" \ - --fmllr-iters=$fmllr_iters --fmllr-min-count=$fmllr_min_count \ - $srcdir/final.fmllr_mdl "$feats" ark,s,cs:- "ark:$dir/trans.JOB" || exit 1; - fi - feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |" - fi - -# Now rescore the state-level lattices with the adapted features and the -# corresponding model. Prune and determinize the lattices to limit -# their size. - if [ $stage -le 6 ] && $use_fmllr; then - $cmd JOB=1:$nj $dir/log/rescore.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" --utt2spk=ark:$sdata1/JOB/utt2spk \ - $srcdir/final.mdl "ark:gunzip -c $dir/pre_lat.JOB.gz|" "$feats" ark:- \| \ - lattice-determinize-pruned --acoustic-scale=$acwt --beam=$lattice_beam ark:- \ - "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; - rm $dir/pre_lat.*.gz - else # Already done with decoding if no adaptation needed. - for n in `seq 1 $nj`; do - mv $dir/pre_lat.${n}.gz $dir/lat.${n}.gz - done - fi - -fi - -# The output of this script is the files "lat.*.gz"-- we'll rescore this at -# different acoustic scales to get the final output. - - -if [ $stage -le 7 ]; then - if ! $skip_scoring ; then - [ ! -x local/score.sh ] && \ - echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - echo "score best paths" - local/score.sh --cmd "$cmd" $data $graphdir $dir || - { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } - # echo "score confidence and timing with sclite" - # local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir - fi -fi -echo "Decoding done." -exit 0; diff --git a/egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh b/egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh deleted file mode 100755 index 6ee4609fb48..00000000000 --- a/egs/wsj/s5/steps/tandem/make_denlats_sgmm.sh +++ /dev/null @@ -1,199 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# Korbinian Riedhammer - -# Create denominator lattices for MMI/MPE training, with SGMM models. If the -# features have fMLLR transforms you have to supply the --transform-dir option. -# It gets any speaker vectors from the "alignment dir" ($srcdir). Note: this is -# possibly a slight mismatch because the speaker vectors come from supervised -# adaptation. - -# Begin configuration section. -nj=4 -cmd=run.pl -sub_split=1 -beam=13.0 -lattice_beam=7.0 -acwt=0.1 -max_active=5000 -transform_dir= -max_mem=20000000 # This will stop the processes getting too large. -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# != 5 ]; then - echo "Usage: steps/tandem/make_denlats_sgmm.sh [options] " - echo " e.g.: steps/tandem/make_denlats_sgmm.sh {mfcc,bottleneck}/data/train data/lang exp/sgmm4a_ali exp/sgmm4a_denlats" - echo "Works for (delta|lda) features, and (with --transform-dir option) such features" - echo " plus transforms." - echo "" - echo "Main options (for others, see top of script file)" - echo " --config # config containing options" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --sub-split # e.g. 40; use this for " - echo " # large databases so your jobs will be smaller and" - echo " # will (individually) finish reasonably soon." - echo " --transform-dir # directory to find fMLLR transforms." - exit 1; -fi - -data1=$1 -data2=$2 -lang=$3 -srcdir=$4 # could also be $srcdir, but only if no vectors supplied. -dir=$5 - -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -normft2=`cat $srcdir/normft2 2>/dev/null` -mkdir -p $dir/log - -utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; - -sdata1=$data1/split$nj -sdata2=$data2/split$nj -[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1; -[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1; - -echo $nj > $dir/num_jobs - -oov=`cat $lang/oov.int` || exit 1; - -mkdir -p $dir - -cp -r $lang $dir/ - -# Compute grammar FST which corresponds to unigram decoding graph. - -cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ - awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ - utils/make_unigram_grammar.pl | fstcompile > $dir/lang/G.fst \ - || exit 1; - -# mkgraph.sh expects a whole directory "lang", so put everything in one directory... -# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and -# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. - -if [ -s $dir/dengraph/HCLG.fst ]; then - echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." -else - utils/mkgraph.sh $dir/lang $srcdir $dir/dengraph || exit 1; -fi - -# Set up features -if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi - -case $feat_type in - delta) - echo "$0: feature type is $feat_type" - ;; - lda) - echo "$0: feature type is $feat_type" - cp $srcdir/{lda,final}.mat $dir/ || exit 1; - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac - -# set up feature stream 1; this are usually spectral features, so we will add -# deltas or splice them -feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |" - -if [ "$feat_type" == "delta" ]; then - feats1="$feats1 add-deltas ark:- ark:- |" -elif [ "$feat_type" == "lda" ]; then - feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |" -fi - -# set up feature stream 2; this are usually bottleneck or posterior features, -# which may be normalized if desired -feats2="scp:$sdata2/JOB/feats.scp" - -if [ "$normft2" == "true" ]; then - feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |" -fi - -# assemble tandem features -feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |" - -# add transformation, if applicable -if [ "$feat_type" == "lda" ]; then - feats="$feats transform-feats $dir/final.mat ark:- ark:- |" -fi - -# splicing/normalization options -cp $srcdir/{splice_opts,normft2,tandem} $dir 2>/dev/null - - -if [ ! -z "$transform_dir" ]; then # add transforms to features... - echo "$0: using fMLLR transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." - [ "`cat $transform_dir/num_jobs`" -ne "$nj" ] \ - && echo "$0: mismatch in number of jobs with $transform_dir" && exit 1; - [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \ - echo "$0: LDA transforms differ between $srcdir and $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" -else - echo "Assuming you don't have a SAT system, since no --transform-dir option supplied " -fi - -if [ -f $srcdir/gselect.1.gz ]; then - gselect_opt="--gselect=ark:gunzip -c $srcdir/gselect.JOB.gz|" -else - echo "$0: no such file $srcdir/gselect.1.gz" && exit 1; -fi - -if [ -f $srcdir/vecs.1 ]; then - spkvecs_opt="--spk-vecs=ark:$srcdir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" -else - if [ -f $srcdir/final.alimdl ]; then - echo "You seem to have an SGMM system with speaker vectors," - echo "yet we can't find speaker vectors. Perhaps you supplied" - echo "the model director instead of the alignment directory?" - exit 1; - fi -fi - -if [ $sub_split -eq 1 ]; then - $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \ - sgmm-latgen-faster $spkvecs_opt "$gselect_opt" --beam=$beam \ - --lattice-beam=$lattice_beam --acoustic-scale=$acwt \ - --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ - $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; -else - for n in `seq $nj`; do - if [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then - echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; - else - ssdata1=$data1/split$nj/$n/split${sub_split}utt; - split_data.sh --per-utt $sdata1/$n $sub_split || exit 1; - ssdata2=$data2/split$nj/$n/split${sub_split}utt; - split_data.sh --per-utt $sdata2/$n $sub_split || exit 1; - mkdir -p $dir/log/$n - mkdir -p $dir/part - feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g` - spkvecs_opt_subset=`echo $spkvecs_opt | sed "s/JOB/$n/g"` - gselect_opt_subset=`echo $gselect_opt | sed "s/JOB/$n/g"` - $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ - sgmm-latgen-faster $spkvecs_opt_subset "$gselect_opt_subset" \ - --beam=$beam --lattice-beam=$lattice_beam \ - --acoustic-scale=$acwt --max-mem=$max_mem --max-active=$max_active \ - --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ - $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1; - echo Merging archives for data subset $n - rm $dir/.error 2>/dev/null; - for k in `seq $sub_split`; do - gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error; - done | gzip -c > $dir/lat.$n.gz || touch $dir/.error; - [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1; - rm $dir/lat.$n.*.gz - touch $dir/.done.$n - fi - done -fi - - -echo "$0: done generating denominator lattices with SGMMs." diff --git a/egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh b/egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh deleted file mode 100755 index 3077fbceef3..00000000000 --- a/egs/wsj/s5/steps/tandem/train_mmi_sgmm.sh +++ /dev/null @@ -1,193 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# Korbinian Riedhammer - -# MMI training (or optionally boosted MMI, if you give the --boost option), -# for SGMMs. 4 iterations (by default) of Extended Baum-Welch update. -# -# Begin configuration section. -cmd=run.pl -num_iters=4 -boost=0.0 -cancel=true # if true, cancel num and den counts on each frame. -acwt=0.1 -stage=0 - -update_opts= -transform_dir= -# End configuration section - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# -ne 6 ]; then - echo "Usage: steps/tandem/train_mmi_sgmm.sh " - echo " e.g.: steps/tandem/train_mmi_sgmm.sh {mfcc,bottleneck}/data1/train_si84 data1/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi" - echo "Main options (for others, see top of script file)" - echo " --boost # (e.g. 0.1), for boosted MMI. (default 0)" - echo " --cancel (true|false) # cancel stats (true by default)" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --config # config containing options" - echo " --stage # stage to do partial re-run from." - echo " --transform-dir # directory to find fMLLR transforms." - exit 1; -fi - -data1=$1 -data2=$2 -lang=$3 -alidir=$4 -denlatdir=$5 -dir=$6 -mkdir -p $dir/log - -utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -for f in $data1/feats.scp $data2/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done -nj=`cat $alidir/num_jobs` || exit 1; -[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \ - echo "$alidir and $denlatdir have different num-jobs" && exit 1; - -mkdir -p $dir/log -echo $nj > $dir/num_jobs - -cp $alidir/{final.mdl,tree} $dir -silphonelist=`cat $lang/phones/silence.csl` || exit 1; - -# Set up features - -sdata1=$data1/split$nj -sdata2=$data2/split$nj -[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1; -[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1; - -splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. -normft2=`cat $alidir/normft2 2>/dev/null` - -if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi - -case $feat_type in - delta) - echo "$0: feature type is $feat_type" - ;; - lda) - echo "$0: feature type is $feat_type" - cp $alidir/{lda,final}.mat $dir/ || exit 1; - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac - -# set up feature stream 1; this are usually spectral features, so we will add -# deltas or splice them -feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |" - -if [ "$feat_type" == "delta" ]; then - feats1="$feats1 add-deltas ark:- ark:- |" -elif [ "$feat_type" == "lda" ]; then - feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |" -fi - -# set up feature stream 2; this are usually bottleneck or posterior features, -# which may be normalized if desired -feats2="scp:$sdata2/JOB/feats.scp" - -if [ "$normft2" == "true" ]; then - feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |" -fi - -# assemble tandem features -feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |" - -# add transformation, if applicable -if [ "$feat_type" == "lda" ]; then - feats="$feats transform-feats $dir/final.mat ark:- ark:- |" -fi - -# splicing/normalization options -cp $alidir/{splice_opts,normft2,tandem} $dir 2>/dev/null - - -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \ - && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -else - echo "$0: no fMLLR transforms." -fi - -if [ -f $alidir/vecs.1 ]; then - echo "$0: using speaker vectors from $alidir" - spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata1/JOB/utt2spk" -else - echo "$0: no speaker vectors." - spkvecs_opt= -fi - -if [ -f $alidir/gselect.1.gz ]; then - echo "$0: using Gaussian-selection info from $alidir" - gselect_opt="--gselect=ark:gunzip -c $alidir/gselect.JOB.gz|" -else - echo "$0: error: no Gaussian-selection info found" && exit 1; -fi - -lats="ark:gunzip -c $denlatdir/lat.JOB.gz|" -if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then - lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |" -fi - - -cur_mdl=$alidir/final.mdl -x=0 -while [ $x -lt $num_iters ]; do - echo "Iteration $x of MMI training" - # Note: the num and den states are accumulated at the same time, so we - # can cancel them per frame. - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - sgmm-rescore-lattice "$gselect_opt" $spkvecs_opt $cur_mdl "$lats" "$feats" ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - sum-post --merge=$cancel --scale1=-1 \ - ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \ - sgmm-acc-stats2 "$gselect_opt" $spkvecs_opt $cur_mdl "$feats" ark,s,cs:- \ - $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1; - - n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`; - [ "$n" -ne $[$nj*2] ] && \ - echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1; - $cmd $dir/log/den_acc_sum.$x.log \ - sgmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1; - rm $dir/den_acc.$x.*.acc - $cmd $dir/log/num_acc_sum.$x.log \ - sgmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1; - rm $dir/num_acc.$x.*.acc - - $cmd $dir/log/update.$x.log \ - sgmm-est-ebw $update_opts $cur_mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1; - fi - cur_mdl=$dir/$[$x+1].mdl - - - # Some diagnostics: the objective function progress and auxiliary-function - # improvement. Note: this code is same as in train_mmi.sh - tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while() { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf - objf=`cat $dir/tmpf | awk '{print $1}'`; - nf=`cat $dir/tmpf | awk '{print $2}'`; - rm $dir/tmpf - impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'` - impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames. - echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log - x=$[$x+1] -done - -echo "MMI training finished" - -rm $dir/final.mdl 2>/dev/null -ln -s $x.mdl $dir/final.mdl - -exit 0; diff --git a/egs/wsj/s5/steps/tandem/train_sgmm.sh b/egs/wsj/s5/steps/tandem/train_sgmm.sh deleted file mode 100755 index 48f392141a1..00000000000 --- a/egs/wsj/s5/steps/tandem/train_sgmm.sh +++ /dev/null @@ -1,315 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# Korbinian Riedhammer - -# SGMM training, with speaker vectors. This script would normally be called on -# top of fMLLR features obtained from a conventional system, but it also works -# on top of any type of speaker-independent features (based on -# deltas+delta-deltas or LDA+MLLT). For more info on SGMMs, see the paper "The -# subspace Gaussian mixture model--A structured model for speech recognition". -# (Computer Speech and Language, 2011). - -# Begin configuration section. -nj=4 -cmd=run.pl -stage=-6 -context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a -# quinphone system. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -num_iters=25 # Total number of iterations -num_iters_alimdl=3 # Number of iterations for estimating alignment model. -max_iter_inc=15 # Last iter to increase #substates on. -realign_iters="5 10 15"; # Iters to realign on. -spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on. -increase_dim_iters="6 8"; # Iters on which to increase phn dim and/or spk dim; - # rarely necessary, and if it is, only the 1st will normally be necessary. -rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training. -phn_dim= # You can use this to set the phonetic subspace dim. [default: feat-dim+1] -spk_dim= # You can use this to set the speaker subspace dim. [default: feat-dim] -power=0.2 # Exponent for number of gaussians according to occurrence counts -beam=8 -retry_beam=40 -cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves -normft2=true -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - - -if [ $# != 8 ]; then - echo "Usage: steps/tandem/train_sgmm.sh " - echo " e.g.: steps/tandem/train_sgmm.sh 3500 10000 {mfcc,bottleneck},data/train_si84 data/lang \\" - echo " exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a" - echo "main options (for others, see top of script file)" - echo " --config # config containing options" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --silence-weight # weight for silence (e.g. 0.5 or 0.0)" - echo " --num-iters <#iters> # Number of iterations of E-M" - exit 1; -fi - - -num_leaves=$1 -totsubstates=$2 -data1=$3 -data2=$4 -lang=$5 -alidir=$6 -ubm=$7 -dir=$8 - -# Check some files. -for f in $data1/feats.scp $data2/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - - -# Set some variables. -oov=`cat $lang/oov.int` -silphonelist=`cat $lang/phones/silence.csl` -numsubstates=$num_leaves # Initial #-substates. -incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates -feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1; -[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric. -[ -z $phn_dim ] && phn_dim=$[$feat_dim+1] -[ -z $spk_dim ] && spk_dim=$feat_dim -nj=`cat $alidir/num_jobs` || exit 1; - -mkdir -p $dir/log -echo $nj > $dir/num_jobs - -utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -sdata1=$data1/split$nj; -sdata2=$data2/split$nj; -[[ -d $sdata1 && $data1/feats.scp -ot $sdata1 ]] || split_data.sh $data1 $nj || exit 1; -[[ -d $sdata2 && $data2/feats.scp -ot $sdata2 ]] || split_data.sh $data2 $nj || exit 1; - -spkvecs_opt= # Empty option for now, until we estimate the speaker vectors. -gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" - -## Set up features. -splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. -normft2=`cat $alidir/normft2 2>/dev/null` - -if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi - -case $feat_type in - delta) - echo "$0: feature type is $feat_type" - ;; - lda) - echo "$0: feature type is $feat_type" - cp $alidir/{lda,final}.mat $dir/ || exit 1; - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac - -# set up feature stream 1; this are usually spectral features, so we will add -# deltas or splice them -feats1="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata1/JOB/utt2spk scp:$sdata1/JOB/cmvn.scp scp:$sdata1/JOB/feats.scp ark:- |" - -if [ "$feat_type" == "delta" ]; then - feats1="$feats1 add-deltas ark:- ark:- |" -elif [ "$feat_type" == "lda" ]; then - feats1="$feats1 splice-feats $splice_opts ark:- ark:- | transform-feats $dir/lda.mat ark:- ark:- |" -fi - -# set up feature stream 2; this are usually bottleneck or posterior features, -# which may be normalized if desired -feats2="scp:$sdata2/JOB/feats.scp" - -if [ "$normft2" == "true" ]; then - feats2="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata2/JOB/utt2spk scp:$sdata2/JOB/cmvn.scp $feats2 ark:- |" -fi - -# assemble tandem features -feats="ark,s,cs:paste-feats '$feats1' '$feats2' ark:- |" - -# add transformation, if applicable -if [ "$feat_type" == "lda" ]; then - feats="$feats transform-feats $dir/final.mat ark:- ark:- |" -fi - -# splicing/normalization options -cp $alidir/{splice_opts,normft2,tandem} $dir 2>/dev/null - -if [ -f $alidir/trans.1 ]; then - echo "$0: using transforms from $alidir" - feats="$feats transform-feats --utt2spk=ark:$sdata1/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" -fi -## - - -if [ $stage -le -6 ]; then - echo "$0: accumulating tree stats" - $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ - acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ - "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; - [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1; - sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1; - rm $dir/*.treeacc -fi - -if [ $stage -le -5 ]; then - echo "$0: Getting questions for tree clustering." - # preparing questions, roots file... - cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; - cat $lang/phones/extra_questions.int >> $dir/questions.int - compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; - - echo "$0: Building the tree" - $cmd $dir/log/build_tree.log \ - build-tree --verbose=1 --max-leaves=$num_leaves \ - --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ - $dir/questions.qst $lang/topo $dir/tree || exit 1; -fi - -if [ $stage -le -4 ]; then - echo "$0: Initializing the model" - # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims - # will be truncated on initialization. - $cmd $dir/log/init_sgmm.log \ - sgmm-init --phn-space-dim=$phn_dim --spk-space-dim=$spk_dim $lang/topo \ - $dir/tree $ubm $dir/0.mdl || exit 1; -fi - -if [ $stage -le -3 ]; then - echo "$0: doing Gaussian selection" - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect $dir/0.mdl "$feats" \ - "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1; -fi - -if [ $stage -le -2 ]; then - echo "$0: compiling training graphs" - text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata1/JOB/text|" - $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl $lang/L.fst \ - "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; -fi - -if [ $stage -le -1 ]; then - echo "$0: Converting alignments" - $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \ - convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; -fi - -x=0 -while [ $x -lt $num_iters ]; do - echo "$0: training pass $x ... " - if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then - echo "$0: re-aligning data" - $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \ - --utt2spk=ark:$sdata1/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \ - $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata1/JOB/spk2utt \ - $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \ - ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1; - fi - spkvecs_opt[$n]="--spk-vecs=ark:$dir/vecs.JOB" - fi - if [ $x -eq 0 ]; then - flags=vwcSt # on the first iteration, don't update projections M or N - elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then - # Update N if we have speaker-vector space and x is odd, - # and we've already updated the speaker vectors... - flags=vNwcSt - else - # otherwise update M. - flags=vMwcSt - fi - - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - sgmm-acc-stats $spkvecs_opt --utt2spk=ark:$sdata1/JOB/utt2spk \ - --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \ - $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \ - $dir/$x.JOB.acc || exit 1; - fi - - # The next option is needed if the user specifies a phone or speaker sub-space - # dimension that's higher than the "normal" one. - increase_dim_opts= - if echo $increase_dim_iters | grep -w $x >/dev/null; then - increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim" - # Note: the command below might have a null effect on some iterations. - if [ $spk_dim -gt $feat_dim ]; then - cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \ - copy-vector --print-args=false --change-dim=$spk_dim \ - ark:$dir/vecs.JOB ark:$dir/vecs_tmp.$JOB '&&' \ - mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1; - fi - fi - - if [ $stage -le $x ]; then - $cmd $dir/log/update.$x.log \ - sgmm-est --update-flags=$flags --split-substates=$numsubstates $increase_dim_opts \ - --power=$power --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.*.acc|" \ - $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null - fi - - if [ $x -lt $max_iter_inc ]; then - numsubstates=$[$numsubstates+$incsubstates] - fi - x=$[$x+1]; -done - -rm $dir/final.mdl $dir/final.occs 2>/dev/null -ln -s $x.mdl $dir/final.mdl -ln -s $x.occs $dir/final.occs - -if [ $spk_dim -gt 0 ]; then - # We need to create an "alignment model" that's been trained - # without the speaker vectors, to do the first-pass decoding with. - # in test time. - - # We do this for a few iters, in this recipe. - final_mdl=$dir/$x.mdl - cur_alimdl=$dir/$x.mdl - while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do - echo "$0: building alignment model (pass $x)" - if [ $x -eq $num_iters ]; then # 1st pass of building alimdl. - flags=MwcS # don't update v the first time. Note-- we never update transitions. - # they wouldn't change anyway as we use the same alignment as previously. - else - flags=vMwcS - fi - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \ - --utt2spk=ark:$sdata1/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \ - sgmm-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \ - $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1; - $cmd $dir/log/update_ali.$x.log \ - sgmm-est --update-flags=$flags --remove-speaker-space=true --power=$power $cur_alimdl \ - "sgmm-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1; - rm $dir/$x.*.aliacc || exit 1; - [ $x -gt $num_iters ] && rm $dir/$x.alimdl - fi - cur_alimdl=$dir/$[$x+1].alimdl - x=$[$x+1] - done - rm $dir/final.alimdl 2>/dev/null - ln -s $x.alimdl $dir/final.alimdl -fi - -utils/summarize_warnings.pl $dir/log - -echo Done diff --git a/egs/wsj/s5/steps/train_mmi_sgmm.sh b/egs/wsj/s5/steps/train_mmi_sgmm.sh deleted file mode 100755 index cb0700e92fc..00000000000 --- a/egs/wsj/s5/steps/train_mmi_sgmm.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. - -# MMI training (or optionally boosted MMI, if you give the --boost option), -# for SGMMs. 4 iterations (by default) of Extended Baum-Welch update. -# -# Begin configuration section. -cmd=run.pl -num_iters=4 -boost=0.0 -cancel=true # if true, cancel num and den counts on each frame. -acwt=0.1 -stage=0 - -update_opts= -transform_dir= -# End configuration section - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. parse_options.sh || exit 1; - -if [ $# -ne 5 ]; then - echo "Usage: steps/train_mmi_sgmm.sh " - echo " e.g.: steps/train_mmi_sgmm.sh data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b_mmi" - echo "Main options (for others, see top of script file)" - echo " --boost # (e.g. 0.1), for boosted MMI. (default 0)" - echo " --cancel (true|false) # cancel stats (true by default)" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --config # config containing options" - echo " --stage # stage to do partial re-run from." - echo " --transform-dir # directory to find fMLLR transforms." - exit 1; -fi - -data=$1 -lang=$2 -alidir=$3 -denlatdir=$4 -dir=$5 -mkdir -p $dir/log - -utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.1.gz; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done -nj=`cat $alidir/num_jobs` || exit 1; -[ "$nj" -ne "`cat $denlatdir/num_jobs`" ] && \ - echo "$alidir and $denlatdir have different num-jobs" && exit 1; - -sdata=$data/split$nj -splice_opts=`cat $alidir/splice_opts 2>/dev/null` -cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` -mkdir -p $dir/log -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -cp $alidir/splice_opts $dir 2>/dev/null -cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. -echo $nj > $dir/num_jobs - -cp $alidir/tree $dir -cp $alidir/final.mdl $dir/0.mdl -cp $alidir/final.alimdl $dir - -silphonelist=`cat $lang/phones/silence.csl` || exit 1; - -# Set up features - -if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" - cp $alidir/final.mat $dir - ;; - *) echo "Invalid feature type $feat_type" && exit 1; -esac - -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -f $transform_dir/trans.1 ] && echo "$0: no such file $transform_dir/trans.1" \ - && exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" -else - echo "$0: no fMLLR transforms." -fi - -if [ -f $alidir/vecs.1 ]; then - echo "$0: using speaker vectors from $alidir" - spkvecs_opt="--spk-vecs=ark:$alidir/vecs.JOB --utt2spk=ark:$sdata/JOB/utt2spk" -else - echo "$0: no speaker vectors." - spkvecs_opt= -fi - -if [ -f $alidir/gselect.1.gz ]; then - echo "$0: using Gaussian-selection info from $alidir" - gselect_opt="--gselect=ark,s,cs:gunzip -c $alidir/gselect.JOB.gz|" -else - echo "$0: error: no Gaussian-selection info found" && exit 1; -fi - -lats="ark:gunzip -c $denlatdir/lat.JOB.gz|" -if [[ "$boost" != "0.0" && "$boost" != 0 ]]; then - lats="$lats lattice-boost-ali --b=$boost --silence-phones=$silphonelist $alidir/final.mdl ark:- 'ark,s,cs:gunzip -c $alidir/ali.JOB.gz|' ark:- |" -fi - -x=0 -while [ $x -lt $num_iters ]; do - echo "Iteration $x of MMI training" - # Note: the num and den states are accumulated at the same time, so we - # can cancel them per frame. - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - sgmm-rescore-lattice --speedup=true "$gselect_opt" $spkvecs_opt $dir/$x.mdl "$lats" "$feats" ark:- \| \ - lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \ - sum-post --merge=$cancel --scale1=-1 \ - ark:- "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- |" ark:- \| \ - sgmm-acc-stats2 "$gselect_opt" $spkvecs_opt $dir/$x.mdl "$feats" ark,s,cs:- \ - $dir/num_acc.$x.JOB.acc $dir/den_acc.$x.JOB.acc || exit 1; - - n=`echo $dir/{num,den}_acc.$x.*.acc | wc -w`; - [ "$n" -ne $[$nj*2] ] && \ - echo "Wrong number of MMI accumulators $n versus 2*$nj" && exit 1; - $cmd $dir/log/den_acc_sum.$x.log \ - sgmm-sum-accs $dir/den_acc.$x.acc $dir/den_acc.$x.*.acc || exit 1; - rm $dir/den_acc.$x.*.acc - $cmd $dir/log/num_acc_sum.$x.log \ - sgmm-sum-accs $dir/num_acc.$x.acc $dir/num_acc.$x.*.acc || exit 1; - rm $dir/num_acc.$x.*.acc - - $cmd $dir/log/update.$x.log \ - sgmm-est-ebw $update_opts $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl || exit 1; - fi - - # Some diagnostics: the objective function progress and auxiliary-function - # improvement. Note: this code is same as in train_mmi.sh - tail -n 50 $dir/log/acc.$x.*.log | perl -e '$acwt=shift @ARGV; while() { if(m/gmm-acc-stats2.+Overall weighted acoustic likelihood per frame was (\S+) over (\S+) frames/) { $tot_aclike += $1*$2; $tot_frames1 += $2; } if(m|lattice-to-post.+Overall average log-like/frame is (\S+) over (\S+) frames. Average acoustic like/frame is (\S+)|) { $tot_den_lat_like += $1*$2; $tot_frames2 += $2; $tot_den_aclike += $3*$2; } } if (abs($tot_frames1 - $tot_frames2) > 0.01*($tot_frames1 + $tot_frames2)) { print STDERR "Frame-counts disagree $tot_frames1 versus $tot_frames2\n"; } $tot_den_lat_like /= $tot_frames2; $tot_den_aclike /= $tot_frames2; $tot_aclike *= ($acwt / $tot_frames1); $num_like = $tot_aclike + $tot_den_aclike; $per_frame_objf = $num_like - $tot_den_lat_like; print "$per_frame_objf $tot_frames1\n"; ' $acwt > $dir/tmpf - objf=`cat $dir/tmpf | awk '{print $1}'`; - nf=`cat $dir/tmpf | awk '{print $2}'`; - rm $dir/tmpf - impr=`grep -w Overall $dir/log/update.$x.log | awk '{x += $10*$12;} END{print x;}'` - impr=`perl -e "print ($impr*$acwt/$nf);"` # We multiply by acwt, and divide by $nf which is the "real" number of frames. - echo "Iteration $x: objf was $objf, MMI auxf change was $impr" | tee $dir/objf.$x.log - x=$[$x+1] -done - -echo "MMI training finished" - -rm $dir/final.mdl 2>/dev/null -ln -s $x.mdl $dir/final.mdl - -exit 0; diff --git a/egs/wsj/s5/steps/train_sgmm.sh b/egs/wsj/s5/steps/train_sgmm.sh deleted file mode 100755 index 0d372be2d84..00000000000 --- a/egs/wsj/s5/steps/train_sgmm.sh +++ /dev/null @@ -1,280 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. - -# SGMM training, with speaker vectors. This script would normally be called on -# top of fMLLR features obtained from a conventional system, but it also works -# on top of any type of speaker-independent features (based on -# deltas+delta-deltas or LDA+MLLT). For more info on SGMMs, see the paper "The -# subspace Gaussian mixture model--A structured model for speech recognition". -# (Computer Speech and Language, 2011). - -# Begin configuration section. -nj=4 -cmd=run.pl -stage=-6 -context_opts= # e.g. set it to "--context-width=5 --central-position=2" for a -# quinphone system. -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -num_iters=25 # Total number of iterations -num_iters_alimdl=3 # Number of iterations for estimating alignment model. -max_iter_inc=15 # Last iter to increase #substates on. -realign_iters="5 10 15"; # Iters to realign on. -spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on. -increase_dim_iters="6 8"; # Iters on which to increase phn dim and/or spk dim; - # rarely necessary, and if it is, only the 1st will normally be necessary. -rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training. -phn_dim= # You can use this to set the phonetic subspace dim. [default: feat-dim+1] -spk_dim= # You can use this to set the speaker subspace dim. [default: feat-dim] -power=0.25 # Exponent for number of gaussians according to occurrence counts -beam=8 -retry_beam=40 -cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - - -if [ $# != 7 ]; then - echo "Usage: steps/train_sgmm.sh " - echo " e.g.: steps/train_sgmm.sh 3500 10000 data/train_si84 data/lang \\" - echo " exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a" - echo "main options (for others, see top of script file)" - echo " --config # config containing options" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --silence-weight # weight for silence (e.g. 0.5 or 0.0)" - echo " --num-iters <#iters> # Number of iterations of E-M" - exit 1; -fi - - -num_leaves=$1 -totsubstates=$2 -data=$3 -lang=$4 -alidir=$5 -ubm=$6 -dir=$7 - -# Check some files. -for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - - -# Set some variables. -oov=`cat $lang/oov.int` -silphonelist=`cat $lang/phones/silence.csl` -numsubstates=$num_leaves # Initial #-substates. -incsubstates=$[($totsubstates-$numsubstates)/$max_iter_inc] # per-iter increment for #substates -feat_dim=`gmm-info $alidir/final.mdl 2>/dev/null | awk '/feature dimension/{print $NF}'` || exit 1; -[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric. -[ -z $phn_dim ] && phn_dim=$[$feat_dim+1] -[ -z $spk_dim ] && spk_dim=$feat_dim -nj=`cat $alidir/num_jobs` || exit 1; -ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; - -mkdir -p $dir/log -echo $nj > $dir/num_jobs -sdata=$data/split$nj; -splice_opts=`cat $alidir/splice_opts 2>/dev/null` -cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` -cp $alidir/splice_opts $dir 2>/dev/null -cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. - -utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1; -cp $lang/phones.txt $dir || exit 1; - -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; - -spkvecs_opt= # Empty option for now, until we estimate the speaker vectors. -gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" - -## Set up features. -if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -echo "$0: feature type is $feat_type" - -case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" - cp $alidir/final.mat $dir - ;; - *) echo "$0: invalid feature type $feat_type" && exit 1; -esac -if [ -f $alidir/trans.1 ]; then - echo "$0: using transforms from $alidir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" -fi -## - - -if [ $stage -le -6 ]; then - echo "$0: accumulating tree stats" - $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ - acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ - "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; - [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-stats" && exit 1; - sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1; - rm $dir/*.treeacc -fi - -if [ $stage -le -5 ]; then - echo "$0: Getting questions for tree clustering." - # preparing questions, roots file... - cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; - cat $lang/phones/extra_questions.int >> $dir/questions.int - compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; - - echo "$0: Building the tree" - $cmd $dir/log/build_tree.log \ - build-tree --verbose=1 --max-leaves=$num_leaves \ - --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ - $dir/questions.qst $lang/topo $dir/tree || exit 1; -fi - -if [ $stage -le -4 ]; then - echo "$0: Initializing the model" - # Note: if phn_dim > feat_dim+1 or spk_dim > feat_dim, these dims - # will be truncated on initialization. - $cmd $dir/log/init_sgmm.log \ - sgmm-init --phn-space-dim=$phn_dim --spk-space-dim=$spk_dim $lang/topo \ - $dir/tree $ubm $dir/0.mdl || exit 1; -fi - -if [ $stage -le -3 ]; then - echo "$0: doing Gaussian selection" - $cmd JOB=1:$nj $dir/log/gselect.JOB.log \ - sgmm-gselect $dir/0.mdl "$feats" \ - "ark,t:|gzip -c >$dir/gselect.JOB.gz" || exit 1; -fi - -if [ $stage -le -2 ]; then - echo "$0: compiling training graphs" - text="ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text|" - $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/0.mdl $lang/L.fst \ - "$text" "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; -fi - -if [ $stage -le -1 ]; then - echo "$0: Converting alignments" - $cmd JOB=1:$nj $dir/log/convert_ali.JOB.log \ - convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.JOB.gz|" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; -fi - -x=0 -while [ $x -lt $num_iters ]; do - echo "$0: training pass $x ... " - if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then - echo "$0: re-aligning data" - $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - sgmm-align-compiled $spkvecs_opt $scale_opts "$gselect_opt" \ - --utt2spk=ark:$sdata/JOB/utt2spk --beam=$beam --retry-beam=$retry_beam \ - $dir/$x.mdl "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ - "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; - fi - if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/spkvecs.$x.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \ - sgmm-est-spkvecs --rand-prune=$rand_prune --spk2utt=ark:$sdata/JOB/spk2utt \ - $spkvecs_opt "$gselect_opt" $dir/$x.mdl "$feats" ark,s,cs:- \ - ark:$dir/tmp_vecs.JOB '&&' mv $dir/tmp_vecs.JOB $dir/vecs.JOB || exit 1; - fi - spkvecs_opt[$n]="--spk-vecs=ark:$dir/vecs.JOB" - fi - if [ $x -eq 0 ]; then - flags=vwcSt # on the first iteration, don't update projections M or N - elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then - # Update N if we have speaker-vector space and x is odd, - # and we've already updated the speaker vectors... - flags=vNwcSt - else - # otherwise update M. - flags=vMwcSt - fi - - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ - sgmm-acc-stats $spkvecs_opt --utt2spk=ark:$sdata/JOB/utt2spk \ - --update-flags=$flags "$gselect_opt" --rand-prune=$rand_prune \ - $dir/$x.mdl "$feats" "ark,s,cs:gunzip -c $dir/ali.JOB.gz | ali-to-post ark:- ark:-|" \ - $dir/$x.JOB.acc || exit 1; - fi - - # The next option is needed if the user specifies a phone or speaker sub-space - # dimension that's higher than the "normal" one. - increase_dim_opts= - if echo $increase_dim_iters | grep -w $x >/dev/null; then - increase_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim" - # Note: the command below might have a null effect on some iterations. - if [ $spk_dim -gt $feat_dim ]; then - cmd JOB=1:$nj $dir/log/copy_vecs.$x.JOB.log \ - copy-vector --print-args=false --change-dim=$spk_dim \ - ark:$dir/vecs.JOB ark:$dir/vecs_tmp.$JOB '&&' \ - mv $dir/vecs_tmp.JOB $dir/vecs.JOB || exit 1; - fi - fi - - if [ $stage -le $x ]; then - $cmd $dir/log/update.$x.log \ - sgmm-est --update-flags=$flags --split-substates=$numsubstates $increase_dim_opts \ - --power=$power --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.*.acc|" \ - $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs 2>/dev/null - fi - - if [ $x -lt $max_iter_inc ]; then - numsubstates=$[$numsubstates+$incsubstates] - fi - x=$[$x+1]; -done - -rm $dir/final.mdl $dir/final.occs 2>/dev/null -ln -s $x.mdl $dir/final.mdl -ln -s $x.occs $dir/final.occs - -if [ $spk_dim -gt 0 ]; then - # We need to create an "alignment model" that's been trained - # without the speaker vectors, to do the first-pass decoding with. - # in test time. - - # We do this for a few iters, in this recipe. - final_mdl=$dir/$x.mdl - cur_alimdl=$dir/$x.mdl - while [ $x -lt $[$num_iters+$num_iters_alimdl] ]; do - echo "$0: building alignment model (pass $x)" - if [ $x -eq $num_iters ]; then # 1st pass of building alimdl. - flags=MwcS # don't update v the first time. Note-- we never update transitions. - # they wouldn't change anyway as we use the same alignment as previously. - else - flags=vMwcS - fi - if [ $stage -le $x ]; then - $cmd JOB=1:$nj $dir/log/acc_ali.$x.JOB.log \ - ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ - sgmm-post-to-gpost $spkvecs_opt "$gselect_opt" \ - --utt2spk=ark:$sdata/JOB/utt2spk $final_mdl "$feats" ark,s,cs:- ark:- \| \ - sgmm-acc-stats-gpost --rand-prune=$rand_prune --update-flags=$flags \ - $cur_alimdl "$feats" ark,s,cs:- $dir/$x.JOB.aliacc || exit 1; - $cmd $dir/log/update_ali.$x.log \ - sgmm-est --update-flags=$flags --remove-speaker-space=true --power=$power $cur_alimdl \ - "sgmm-sum-accs - $dir/$x.*.aliacc|" $dir/$[$x+1].alimdl || exit 1; - rm $dir/$x.*.aliacc || exit 1; - [ $x -gt $num_iters ] && rm $dir/$x.alimdl - fi - cur_alimdl=$dir/$[$x+1].alimdl - x=$[$x+1] - done - rm $dir/final.alimdl 2>/dev/null - ln -s $x.alimdl $dir/final.alimdl -fi - -utils/summarize_warnings.pl $dir/log - -echo Done diff --git a/src/Doxyfile b/src/Doxyfile index f5e874be3ad..bf2dc5197e2 100644 --- a/src/Doxyfile +++ b/src/Doxyfile @@ -453,9 +453,9 @@ WARN_LOGFILE = # the lines after "doc itf" are copied from SUBDIRS in the Makefile. INPUT = doc itf \ - base matrix util feat tree thread gmm transform sgmm \ + base matrix util feat tree thread gmm transform \ fstext hmm lm decoder lat cudamatrix nnet \ - bin fstbin gmmbin fgmmbin sgmmbin featbin \ + bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet2bin nnet3 nnet3bin \ kwsbin ivector ivectorbin diff --git a/src/Makefile b/src/Makefile index 9905be869a0..8bc18b254e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -5,15 +5,15 @@ SHELL := /bin/bash -SUBDIRS = base matrix util feat tree thread gmm transform sgmm \ +SUBDIRS = base matrix util feat tree thread gmm transform \ fstext hmm lm decoder lat kws cudamatrix nnet \ - bin fstbin gmmbin fgmmbin sgmmbin featbin \ + bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 chain nnet3bin nnet2bin kwsbin \ ivector ivectorbin online2 online2bin lmbin chainbin -MEMTESTDIRS = base matrix util feat tree thread gmm transform sgmm \ +MEMTESTDIRS = base matrix util feat tree thread gmm transform \ fstext hmm lm decoder lat nnet kws chain \ - bin fstbin gmmbin fgmmbin sgmmbin featbin \ + bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 nnet2bin nnet3bin sgmm2bin kwsbin \ ivector ivectorbin online2 online2bin lmbin @@ -153,8 +153,8 @@ $(EXT_SUBDIRS) : mklibdir # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ - base matrix util feat tree thread gmm transform sgmm sgmm2 fstext hmm \ +bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ + base matrix util feat tree thread gmm transform sgmm2 fstext hmm \ lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 #2)The libraries have inter-dependencies @@ -166,12 +166,11 @@ feat: base matrix util gmm transform tree thread tree: base util thread matrix gmm: base util matrix tree thread transform: base util matrix gmm tree thread -sgmm: base util matrix gmm tree transform thread hmm sgmm2: base util matrix gmm tree transform thread hmm fstext: base util thread matrix tree hmm: base tree matrix util thread lm: base util thread matrix fstext -decoder: base util thread matrix gmm sgmm hmm tree transform lat +decoder: base util thread matrix gmm hmm tree transform lat lat: base util thread hmm tree matrix cudamatrix: base util thread matrix nnet: base util hmm tree thread matrix cudamatrix @@ -180,8 +179,8 @@ nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix chain fstex chain: lat hmm tree fstext matrix cudamatrix util thread base ivector: base util matrix thread transform tree gmm #3)Dependencies for optional parts of Kaldi -onlinebin: base matrix util feat tree gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread -# python-kaldi-decoding: base matrix util feat tree thread gmm transform sgmm sgmm2 fstext hmm decoder lat online +onlinebin: base matrix util feat tree gmm transform sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread +# python-kaldi-decoding: base matrix util feat tree thread gmm transform sgmm2 fstext hmm decoder lat online online: decoder gmm transform feat matrix util base lat hmm thread tree online2: decoder gmm transform feat matrix util base lat hmm thread tree ivector cudamatrix nnet2 nnet3 chain kws: base util thread hmm tree matrix lat diff --git a/src/decoder/Makefile b/src/decoder/Makefile index fe489d1cb3f..93db701cb7a 100644 --- a/src/decoder/Makefile +++ b/src/decoder/Makefile @@ -11,7 +11,7 @@ OBJFILES = training-graph-compiler.o lattice-simple-decoder.o lattice-faster-dec LIBNAME = kaldi-decoder -ADDLIBS = ../lat/kaldi-lat.a ../sgmm/kaldi-sgmm.a ../hmm/kaldi-hmm.a \ +ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a diff --git a/src/doc/kaldi_for_dummies.dox b/src/doc/kaldi_for_dummies.dox index 75a58011b1d..49c9fb69e42 100644 --- a/src/doc/kaldi_for_dummies.dox +++ b/src/doc/kaldi_for_dummies.dox @@ -413,7 +413,7 @@ b.) \c path.sh
export KALDI_ROOT=`pwd`/../.. # Setting paths to useful tools -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH # Defining audio data directory (modify it for your installation directory!) export DATA_ROOT="/home/{user}/kaldi-trunk/egs/digits/digits_audio" diff --git a/src/feat/Makefile b/src/feat/Makefile index 71a34192347..e987de55b38 100644 --- a/src/feat/Makefile +++ b/src/feat/Makefile @@ -6,13 +6,12 @@ include ../kaldi.mk TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \ feature-functions-test pitch-functions-test feature-sdc-test \ - resample-test online-feature-test sinusoid-detection-test \ - signal-test + resample-test online-feature-test signal-test OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \ feature-spectrogram.o mel-computations.o wave-reader.o \ - pitch-functions.o resample.o online-feature.o sinusoid-detection.o \ - signal.o feature-window.o + pitch-functions.o resample.o online-feature.o signal.o \ + feature-window.o LIBNAME = kaldi-feat diff --git a/src/feat/sinusoid-detection-test.cc b/src/feat/sinusoid-detection-test.cc deleted file mode 100644 index 68148b44ccf..00000000000 --- a/src/feat/sinusoid-detection-test.cc +++ /dev/null @@ -1,452 +0,0 @@ -// feat/sinusoid-detection-test.cc - -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include - -#include "base/kaldi-math.h" -#include "feat/sinusoid-detection.h" - - -namespace kaldi { - -// this function is used for testing AddSinusoid. -void AddSinusoidSimple(BaseFloat samp_freq, - const Sinusoid &sinusoid, - VectorBase *signal) { - for (int32 i = 0; i < signal->Dim(); i++) - (*signal)(i) += sinusoid.amplitude * - cos(M_2PI * sinusoid.freq / samp_freq * i + sinusoid.phase); -} - -void UnitTestAddSinusoid() { - BaseFloat samp_freq = 560.1; - int32 length = 511; - Vector orig(length); - orig.SetRandn(); - Vector orig2(orig); - Sinusoid sinusoid(49.20, 2.111, 1.5); - - AddSinusoid(samp_freq, sinusoid, &orig); - AddSinusoidSimple(samp_freq, sinusoid, &orig2); - AssertEqual(orig, orig2); -} - - - -void UnitTestQuadraticMaximizeEqualSpaced() { - for (int32 n = 0; n < 50; n++) { - - // Let the cubic function be y = a x^2 + b x + c, and let - // y0,y1,y2 be its values evaluated at x = [0, 1, 2]; we - // want it evaluated at arbitrary x. - - BaseFloat a = -0.5 + RandUniform(), b = -0.5 + RandUniform(), c = -0.5 + RandUniform(); - BaseFloat y[3]; - for (int32 i = 0; i < 3; i++) { - BaseFloat x = i; - y[i] = a * x * x + b * x + c; - } - BaseFloat x_max, y_max; - SinusoidDetector::QuadraticMaximizeEqualSpaced(y[0], y[1], y[2], &x_max, &y_max); - - for (int32 m = 0; m <= 10; m++) { - BaseFloat x_test = 0.1 * m; - BaseFloat y_test = a * x_test * x_test + b * x_test + c; - KALDI_ASSERT(y_test <= y_max + 1.0e-05); - } - } -} - -void UnitTestQuadraticMaximize() { - for (int32 n = 0; n < 50; n++) { - - // Let the cubic function be y = a x^2 + b x + c, and let - // y0,y1,y2 be its values evaluated at x = [0, 1, 2]; we - // want it evaluated at arbitrary x. - - BaseFloat a = -0.5 + RandUniform(), b = -0.5 + RandUniform(), c = -0.5 + RandUniform(), - x = 0.1 + RandUniform() * 0.98; - BaseFloat y[3]; - for (int32 i = 0; i < 3; i++) { - BaseFloat this_x; - if (i == 0) { this_x = 0.0; } - else if (i == 1) { this_x = x; } - else { this_x = 1.0; } - y[i] = a * this_x * this_x + b * this_x + c; - } - BaseFloat x_max, y_max; - SinusoidDetector::QuadraticMaximize(x, y[0], y[1], y[2], &x_max, &y_max); - - for (int32 m = 0; m <= 10; m++) { - BaseFloat x_test = 0.1 * m; - BaseFloat y_test = a * x_test * x_test + b * x_test + c; - if (n < 100 && m == 5) { - KALDI_VLOG(2) << "Checking y_test <= y_max: " - << y_test << " <= " << y_max << " [x_max = " - << x_max << "]"; - KALDI_ASSERT(y_test <= y_max + 1.0e-05); - } - } - } -} - - -void UnitTestSinusoidDetector() { - BaseFloat samp_freq = 4000 + (rand() % 2000); - int32 num_samp = 128 + rand() % 400; - SinusoidDetector detector(samp_freq, num_samp); - - for (int32 i = 0; i < 40; i++) { - - Vector signal(num_samp); - - // Sinusoid ref_sinusoid(1.3, 312.5, M_PI * 0.0); - // Sinusoid ref_sinusoid(1.3, 324.125, M_PI * 0.5); - - BaseFloat nyquist = samp_freq * 0.5; - BaseFloat freq = nyquist * RandUniform(); - BaseFloat amplitude = RandUniform(); - BaseFloat phase = M_2PI * RandUniform(); - - Sinusoid ref_sinusoid(amplitude, freq, phase); - - AddSinusoid(samp_freq, ref_sinusoid, &signal); - - - BaseFloat orig_energy = VecVec(signal, signal); - KALDI_LOG << "Real frequency is " << freq << ", amplitude " - << amplitude << ", phase " << phase << ", samp-freq " - << samp_freq; - KALDI_LOG << "Total energy of signal (with sinusoid) is " << orig_energy; - - Sinusoid sinusoid; - BaseFloat min_energy = 0.0; - BaseFloat energy = detector.DetectSinusoid(min_energy, - signal, &sinusoid); - - Vector new_signal(signal); - sinusoid.phase += M_PI; // Reverse the phase. - AddSinusoid(samp_freq, sinusoid, &new_signal); - BaseFloat delta_energy = VecVec(signal, signal) - - VecVec(new_signal, new_signal); - KALDI_LOG << "Projected delta energy = " << energy - << " and observed was " << delta_energy; - - BaseFloat remaining_energy = VecVec(new_signal, new_signal); - if (remaining_energy > 0.01 * orig_energy) { - KALDI_WARN << "Energy remaining is " << remaining_energy - << " vs. original " << orig_energy; - BaseFloat relative_freq = freq / nyquist; - BaseFloat inv_num_samp = 1.0 / num_samp; - // We only tolerate this kind of error for very ridiculous frequency, - // close to zero or the Nyquist. - KALDI_ASSERT(relative_freq < inv_num_samp || - relative_freq > 1.0 - inv_num_samp); - } - } -} - -// as UnitTestSinusoidDetector(), but doing it in noisy signals. -void UnitTestSinusoidDetectorNoisy() { - BaseFloat samp_freq = 4000 + (rand() % 2000); - int32 num_samp = 128 + rand() % 400; - SinusoidDetector detector(samp_freq, num_samp); - - for (int32 i = 0; i < 40; i++) { - - Vector signal(num_samp); - - signal.SetRandn(); - - BaseFloat rand_energy = VecVec(signal, signal); - - // Sinusoid ref_sinusoid(1.3, 312.5, M_PI * 0.0); - // Sinusoid ref_sinusoid(1.3, 324.125, M_PI * 0.5); - - BaseFloat nyquist = samp_freq * 0.5; - BaseFloat freq = nyquist * RandUniform(); - BaseFloat amplitude = 10.0 * RandUniform(); - BaseFloat phase = M_2PI * RandUniform(); - - Sinusoid ref_sinusoid(amplitude, freq, phase); - - AddSinusoid(samp_freq, ref_sinusoid, &signal); - - BaseFloat tot_energy = VecVec(signal, signal); - - KALDI_LOG << "Real frequency is " << freq << ", amplitude " - << amplitude << ", phase " << phase << ", samp-freq " - << samp_freq; - KALDI_LOG << "Total energy of signal (with noise + sinusoid) is " << tot_energy; - - Sinusoid sinusoid; - BaseFloat min_energy = 0.0; - BaseFloat energy = detector.DetectSinusoid(min_energy, - signal, &sinusoid); - - Vector new_signal(signal); - sinusoid.phase += M_PI; // reverse the phase. - AddSinusoid(samp_freq, sinusoid, &new_signal); - BaseFloat delta_energy = VecVec(signal, signal) - - VecVec(new_signal, new_signal); - KALDI_LOG << "Projected delta energy = " << energy - << " and observed was " << delta_energy; - - BaseFloat min_energy_diff = 0.99 * (tot_energy - rand_energy); - - if (delta_energy < min_energy_diff) { - KALDI_WARN << "Energy reduction is " << delta_energy - << " vs. expected " << (tot_energy - rand_energy); - BaseFloat relative_freq = freq / nyquist; - BaseFloat inv_num_samp = 1.0 / num_samp; - // We only tolerate this kind of error for very ridiculous frequency, - // close to zero or the Nyquist. - KALDI_ASSERT(relative_freq < inv_num_samp || - relative_freq > 1.0 - inv_num_samp); - } - } -} - - -void AddFreqToSignal(BaseFloat base_freq, - BaseFloat samp_freq, - BaseFloat tolerance, - BaseFloat gain, - VectorBase *signal) { - BaseFloat error_scale = (2 * RandUniform() - 1) * tolerance; - BaseFloat freq = base_freq * (1.0 + error_scale); - KALDI_VLOG(3) << "base-freq = " << base_freq << ", factor = " << error_scale; - for (int32 i = 0; i < signal->Dim(); i++) - (*signal)(i) += gain * sin(i * 2.0 * 3.14159 * freq / samp_freq); -} - - -void GenerateDtmfTestCase( - BaseFloat sampling_rate, - Vector *signal, - std::vector *ref_output) { - // the "ref_output" should correlate with the first of each run of frames with the same label. - - BaseFloat min_duration_secs = 0.04; // min duration of dtmf or non-tone segments. - BaseFloat min_dialtone_duration_secs = 0.1; - BaseFloat frequency_tolerance = 0.035; - BaseFloat dialtone_frequency_tolerance = 0.4 * (440.0 - 425.0) / 440.0; - - int32 num_events = 2 * (5 + rand() % 5) + 1; // odd number. - int32 tot_signal_dim = 0; - - ref_output->resize(num_events); - std::vector > all_signals(num_events); - for (int32 i = 0; i < num_events; i++) { - MultiSinusoidDetectorOutput &this_output = (*ref_output)[i]; - Vector &this_signal = all_signals[i]; - BaseFloat duration_secs = min_duration_secs * (1 + rand() % 3); - int32 num_samp = sampling_rate * duration_secs; - tot_signal_dim += num_samp; - - this_signal.Resize(num_samp); - this_signal.SetRandn(); - - if (i % 2 == 0); // do nothing; - else if (rand() % 2 == 0 && duration_secs >= min_dialtone_duration_secs) { - // dialtone. - BaseFloat freq; - if (rand() % 3 == 0) { freq = 350; } - else if (rand() % 2 == 0) { freq = 440; } - else { freq = 425; } - BaseFloat gain = 10.0 * (1.0 + rand() % 2); - AddFreqToSignal(freq, sampling_rate, dialtone_frequency_tolerance, - gain, &(this_signal)); - this_output.freq1 = freq; - } else { - // dtmf. use a subset of tones as examples. - BaseFloat freq1, freq2; - char c; - if (rand() % 4 == 0) { - c = '8'; freq1 = 852; freq2 = 1336; - } else if (rand() % 3 == 0) { - c = '0'; freq1 = 941; freq2 = 1336; - } else if (rand() % 2 == 0) { - c = '#'; freq1 = 941; freq2 = 1477; - } else { - c = '1'; freq1 = 697; freq2 = 1209; - } - BaseFloat base_gain = 10.0 * (1.0 + (rand() % 3)), - gain_factor = 1.0 + 0.1 * (-2 + rand() % 5), - gain1 = base_gain, gain2 = gain_factor * base_gain; - AddFreqToSignal(freq1, sampling_rate, frequency_tolerance, gain1, - &(this_signal)); - AddFreqToSignal(freq2, sampling_rate, frequency_tolerance, gain2, - &(this_signal)); - this_output.freq1 = freq1; - this_output.freq2 = freq2; - } - } - signal->Resize(tot_signal_dim); - int32 signal_offset = 0; - for (int32 i = 0; i < num_events; i++) { - int32 this_dim = all_signals[i].Dim(); - signal->Range(signal_offset, this_dim).CopyFromVec(all_signals[i]); - signal_offset += this_dim; - } -} - - -/* - -// Just a basic test to check that it produces output. - -void UnitTestToneDetection() { - BaseFloat samp_freq = (rand() % 2) == 0 ? 8000 : 16000; - ToneDetectionConfig config; - - int32 num_frames = 100 + (rand() % 100); - int32 frame_length = static_cast(samp_freq * config.frame_length_secs); - - int32 num_samples = frame_length * num_frames + rand() % frame_length; - Vector signal(num_samples); - signal.SetRandn(); - - ToneDetector tone_detector(config, samp_freq); - - int32 signal_offset = 0; - - std::vector tone_detector_output; - - while (signal_offset < num_samples) { - int32 signal_remaining = num_samples - signal_offset, - chunk_size = std::min((rand() % 200) + 100, - signal_remaining); - SubVector signal_part(signal, signal_offset, chunk_size); - tone_detector.AcceptWaveform(signal_part); - signal_offset += chunk_size; - - if (signal_offset == num_samples) - tone_detector.WaveformFinished(); - while (!tone_detector.Done() && - (rand() % 2 == 0 || signal_offset == num_samples)) { - ToneDetectorOutput *output = new ToneDetectorOutput(); - tone_detector.GetNextFrame(output); - tone_detector_output.push_back(output); - } - } - KALDI_ASSERT(signal_offset == num_samples); - - Vector signal2(signal.Dim()); - signal_offset = 0; - for (int32 i = 0; i < tone_detector_output.size(); i++) { - ToneDetectorOutput *output = tone_detector_output[i]; - signal2.Range(signal_offset, - output->signal.Dim()).CopyFromVec(output->signal); - signal_offset += output->signal.Dim(); - if (output->frame_type != 'n') { - KALDI_ERR << "Frame " << i << " badly classified, should be 'n', is: " - << output->frame_type; - } - delete output; - } - KALDI_ASSERT(signal_offset == num_samples && - signal.ApproxEqual(signal2, 1.0e-10)); - -} - -std::ostringstream & operator << (std::ostringstream &ostr, - const ToneDetectorOutput &output) { - ostr << output.frame_type; - if (output.frame_type == 'd') - ostr << output.dialtone_freq; - ostr << ' '; - return ostr; -} - -*/ - - -// This version of the unit-test generates a signal that has tones in it, and -// runs the detection on that signal. -void UnitTestToneDetection2() { - BaseFloat samp_freq = (rand() % 2) == 0 ? 8000 : 16000; - Vector signal; - std::vector ref_output; - GenerateDtmfTestCase(samp_freq, &signal, &ref_output); - - MultiSinusoidDetectorConfig config; - - int32 num_samples = signal.Dim(); - KALDI_ASSERT(num_samples > 0); - - MultiSinusoidDetector multi_sinusoid_detector(config, samp_freq); - - int32 signal_offset = 0; - - std::vector multi_sinusoid_detector_output; - - while (signal_offset < num_samples) { - int32 signal_remaining = num_samples - signal_offset, - chunk_size = std::min((rand() % 200) + 100, - signal_remaining); - SubVector signal_part(signal, signal_offset, chunk_size); - multi_sinusoid_detector.AcceptWaveform(signal_part); - signal_offset += chunk_size; - - if (signal_offset == num_samples) - multi_sinusoid_detector.WaveformFinished(); - while (!multi_sinusoid_detector.Done() && - (rand() % 2 == 0 || signal_offset == num_samples)) { - MultiSinusoidDetectorOutput *output = new MultiSinusoidDetectorOutput(); - multi_sinusoid_detector.GetNextFrame(output); - multi_sinusoid_detector_output.push_back(output); - } - } - KALDI_ASSERT(signal_offset == num_samples); - - // std::ostringstream str_ref, str_hyp; - //for (size_t i = 0; i < ref_output.size(); i++) - // str_ref << ref_output[i]; - - - for (size_t i = 0; i < multi_sinusoid_detector_output.size(); i++) { - MultiSinusoidDetectorOutput *output = multi_sinusoid_detector_output[i]; - KALDI_LOG << "tot-energy = " << output->tot_energy - << ", freq1 " << output->freq1 << ", energy1 " << output->energy1 - << ", freq2 " << output->freq2 << ", energy2 " << output->energy2; - delete output; - } -} - - - -} // namespace kaldi - -int main() { - using namespace kaldi; - - SetVerboseLevel(4); - - UnitTestToneDetection2(); - UnitTestAddSinusoid(); - UnitTestQuadraticMaximizeEqualSpaced(); - UnitTestQuadraticMaximize(); - for (int32 i = 0; i < 10; i++) { - UnitTestSinusoidDetector(); - UnitTestSinusoidDetectorNoisy(); - } - -} diff --git a/src/feat/sinusoid-detection.cc b/src/feat/sinusoid-detection.cc deleted file mode 100644 index bf6b0b9e4fe..00000000000 --- a/src/feat/sinusoid-detection.cc +++ /dev/null @@ -1,945 +0,0 @@ -// feat/sinusoid-detection.cc - -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "feat/sinusoid-detection.h" -#include "matrix/matrix-functions.h" -#include "feat/resample.h" - -namespace kaldi { - - - -// This function adds the given sinusoid to the signal, as: -// (*signal)(t) += amplitude * cos(2 pi freq/samp_freq t + phase). -void AddSinusoid(BaseFloat samp_freq, - const Sinusoid &sinusoid, - VectorBase *signal) { - // treat "factor" as a complex variable equal to exp(i * 2 pi freq / samp_freq); it's - // the factor by which we multiply on each frame. - BaseFloat factor_real = cos(M_2PI * sinusoid.freq / samp_freq), - factor_im = sin(M_2PI * sinusoid.freq / samp_freq); - BaseFloat *signal_data = signal->Data(); - int32 dim = signal->Dim(), batch_size = 100; - // process frames in batches of size "batch_size", after which we recompute - // the starting point to prevent loss of accuracy due to drift. - for (int32 b = 0; b * batch_size < dim; b++) { - int32 t_offset = b * batch_size, - t_end = std::min(dim, t_offset + batch_size); - double phase = sinusoid.phase + M_2PI * t_offset * sinusoid.freq / samp_freq; - // treat x as a complex variable which initially is equal to amplitude * exp(i * phase), - // but which gets multiplied by "factor" on each frame. - BaseFloat x_real = sinusoid.amplitude * cos(phase), - x_im = sinusoid.amplitude * sin(phase); - for (int32 t = t_offset; t < t_end; t++) { - signal_data[t] += x_real; - ComplexMul(factor_real, factor_im, &x_real, &x_im); // x *= factor. - } - } -} - - -// static -void SinusoidDetector::QuadraticMaximizeEqualSpaced( - BaseFloat y0, BaseFloat y1, BaseFloat y2, - BaseFloat *x_max, BaseFloat *y_max) { - // Let the function be y = a x^2 + b x + c, and - // suppose we have the values of y(0), y(1) and y(2). - // We have y0 = c, y1 = a + b + c, and y2 = 4a + 2b + c, - // so c = y0. - // Also, y2 - 2 y1 = 2a - c, so - // a = (y2 - 2 y1 + c) / 2, and - // b = y1 - a - c. - BaseFloat c = y0, a = y2 - 2 * y1 + c, b = y1 - a - c; - if (a >= 0) { - // The maximum of the function will occur at one of the end points. - if (y0 > y2) { - *x_max = 0; - *y_max = y0; - } else { - *x_max = 2; - *y_max = y2; - } - } else { - // derivative y' = 2a x + b. y' = 0 at x = -b / 2 a. - BaseFloat x = -b / (2.0 * a); - if (x <= 0.0) { - *x_max = 0; - *y_max = y0; - } else if (x >= 2.0) { - *x_max = 0; - *y_max = y2; - } else { - *x_max = x; - *y_max = a * x * x + b * x + c; - } - } -} - -// static -void SinusoidDetector::QuadraticMaximize( - BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2, - BaseFloat *x_max, BaseFloat *y_max) { - // Let the function be y = a x^2 + b x + c, and - // suppose we have the values of y(0), y(x1) and y(1), - // where 0 < x1 < 1. - // We have y0 = c, y1 = x1^2 a + x1 b + c, and y2 = a + b + c, - // so c = y0. - // Also, x1.y2 - y1 = a (x1 - x1^2) + (x1 - 1) c, so - // a = ( (x1 y2 - y1) - (x1 - 1) c) / (x1 - x1^2), and - // b = y2 - a - c. - BaseFloat c = y0, - a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1), - b = y2 - a - c; - - // TODO: remove these lines. - AssertEqual(y1, a * x1 * x1 + b * x1 + c); - AssertEqual(y2, a + b + c); - - if (a >= 0) { - // The maximum of the function will occur at one of the end points. - if (y0 > y2) { - *x_max = 0; - *y_max = y0; - } else { - *x_max = 1.0; - *y_max = y2; - } - } else { - // derivative y' = 2a x + b. y' = 0 at x = -b / 2 a. - BaseFloat x = -b / (2.0 * a); - if (x <= 0.0) { - *x_max = 0.0; - *y_max = y0; - } else if (x >= 1.0) { - *x_max = 1.0; - *y_max = y2; - } else { - *x_max = x; - *y_max = a * x * x + b * x + c; - } - } -} - -//static -BaseFloat SinusoidDetector::QuadraticInterpolate( - BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2, - BaseFloat x) { - // Let the function be y = a x^2 + b x + c, and - // suppose we have the values of y(0), y(x1) and y(1), - // where 0 < x1 < 1. - // We have y0 = c, y1 = x1^2 a + x1 b + c, and y2 = a + b + c, - // so c = y0. - // Also, x1.y2 - y1 = a (x1 - x1^2) + (x1 - 1) c, so - // a = ( (x1 y2 - y1) - (x1 - 1) c) / (x1 - x1^2), and - // b = y2 - a - c. - KALDI_ASSERT(x1 >= 0.0 && x1 <= 1.0); - if (x1 == 0.0) return y0; - else if (x1 == 1.0) return y2; - - BaseFloat c = y0, - a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1), - b = y2 - a - c; - return a * x * x + b * x + c; -} - -// This function does -// (*cos)(t) = cos(2 pi t freq / samp_freq) -// (*sin)(t) = sin(2 pi t freq / samp_freq) -//static -void SinusoidDetector::CreateCosAndSin(BaseFloat samp_freq, - BaseFloat freq, - VectorBase *cos_vec, - VectorBase *sin_vec) { - int32 dim = cos_vec->Dim(), batch_size = 100; - KALDI_ASSERT(dim == sin_vec->Dim()); - BaseFloat *cos_data = cos_vec->Data(), *sin_data = sin_vec->Data(); - BaseFloat factor_real = cos(M_2PI * freq / samp_freq), - factor_im = sin(M_2PI * freq / samp_freq); - - // process frames in batches of size "batch_size", after which we recompute - // the starting point to prevent loss of accuracy due to drift. - for (int32 b = 0; b * batch_size < dim; b++) { - int32 t_offset = b * batch_size, - t_end = std::min(dim, t_offset + batch_size); - double phase = M_2PI * t_offset * freq / samp_freq; - // treat x as a complex variable which initially is equal to amplitude * exp(i * phase), - // but which gets multiplied by "factor" on each frame. - BaseFloat x_real = cos(phase), x_im = sin(phase); - for (int32 t = t_offset; t < t_end; t++) { - cos_data[t] = x_real; - sin_data[t] = x_im; - ComplexMul(factor_real, factor_im, &x_real, &x_im); // x *= factor. - } - } -} - -SinusoidDetector::SinusoidDetector(BaseFloat samp_freq, - int32 num_samp): - samp_freq_(samp_freq), - num_samples_(num_samp), - num_samples_padded_(RoundUpToNearestPowerOfTwo(num_samp)), - fft_(num_samples_padded_), - factor1_(3.1), - factor2_(1.42) { - ComputeCoefficients(); -} - -void SinusoidDetector::SelfTest( - const VectorBase &signal, - const std::vector &info, - BaseFloat final_freq, - BaseFloat final_energy) { - int32 num_bins = num_samples_padded_ * 2 + 1; - - - { - BaseFloat cutoff = 0.0; - for (int32 k = 0; k <= num_bins; k += 4) - cutoff = std::max(cutoff, info[k].energy); - BaseFloat energy_upper_bound = factor1_ * cutoff; - if (final_energy > energy_upper_bound) { - KALDI_WARN << "Self-testing failed [factor1]: " - << final_energy << " > " << energy_upper_bound - << ", num-samples is " << num_samples_ - << ", freq/nyquist = " - << (final_freq / (samp_freq_ * 0.5)) - << "- would require factor1 >= " - << (final_energy / cutoff); - } - } - { - BaseFloat cutoff = 0.0; - for (int32 k = 0; k <= num_bins; k += 2) - if (info[k].valid) - cutoff = std::max(cutoff, info[k].energy); - BaseFloat energy_upper_bound = factor2_ * cutoff; - if (final_energy > energy_upper_bound) { - KALDI_WARN << "Self-testing failed [factor2]: " - << final_energy << " > " << energy_upper_bound - << ", num-samples is " << num_samples_ - << ", freq/nyquist = " - << (final_freq / (samp_freq_ * 0.5)) - << "- would require factor2 >= " - << (final_energy / cutoff); - - } - } - -} - - -BaseFloat SinusoidDetector::OptimizeFrequency( - const std::vector &info, - int32 *bin_out, - BaseFloat *offset_out) const { - - BaseFloat max_energy = 0.0; - *bin_out = -1; - int32 max_freq = num_samples_padded_ * 2; - - // For each bin, we consider the frequency range [bin, bin+1, bin+2], - // and if we have info for all those bins, do a quadratic interpolation to - // find the maximum within the range. - for (int32 bin = 0; bin + 2 <= max_freq; bin++) { - if (info[bin].valid && info[bin+1].valid && info[bin+2].valid) { - // First handle the left side of the bin. - BaseFloat best_x, best_y; - QuadraticMaximizeEqualSpaced(info[bin].energy, info[bin+1].energy, - info[bin+2].energy, &best_x, &best_y); - if (best_y > max_energy) { - max_energy = best_y; - if (best_x <= 1.0) { - *bin_out = bin; - *offset_out = best_x; - } else { - *bin_out = bin + 1; - *offset_out = best_x - 1; - } - } - } - } - return max_energy; -} - - -BaseFloat SinusoidDetector::DetectSinusoid( - BaseFloat min_energy, - const VectorBase &signal, - Sinusoid *sinusoid) { - if (signal(0) == 0.0 && signal.Norm(2.0) == 0.0) - return 0.0; - KALDI_ASSERT(signal.Dim() == num_samples_); - Vector fft(num_samples_padded_); - fft.Range(0, num_samples_).CopyFromVec(signal); - bool forward = true; - fft_.Compute(fft.Data(), forward); - - std::vector info; - ComputeCoarseInfo(fft, &info); - // we now have info for the "coarse" bins. - - // each element b of "bins" will be a multiple of 4: it's possible - // that the best frequency is in the range [b, b+4] - std::vector bins; - FindCandidateBins(min_energy, info, &bins); - - if (bins.empty()) - return 0.0; // not enough energy in signal. - - for (size_t i = 0; i < bins.size(); i++) { - int32 bin = bins[i]; - ComputeBinInfo(signal, bin, &(info[bin])); - } - - std::vector bins2; - FindCandidateBins2(min_energy, info, &bins2); - - for (size_t i = 0; i < bins2.size(); i++) { - int32 bin = bins2[i]; - ComputeBinInfo(signal, bin, &(info[bin])); - } - - // compute energy for the predicted-optimum point, which will usually be - // between bins, with an offset. - int32 bin; - BaseFloat offset; - - BaseFloat opt_energy = OptimizeFrequency(info, &bin, &offset); - - if (opt_energy == 0.0) - return 0.0; - - BaseFloat max_freq = (bin + offset) * samp_freq_ / (num_samples_padded_ * 4); - - KALDI_VLOG(4) << "Best frequency based on interpolation is " - << max_freq << ", best energy is " - << opt_energy << ", bin is " << bin; - - OptimizedInfo final_info; - - FineOptimizeFrequency(signal, bin, offset, &info, &final_info); - - // the following while loop will rarely be accessed. - while (final_info.offset == 0.0 && bin > 0) { - bin--; - FineOptimizeFrequency(signal, bin, 1.0, &info, &final_info); - } - - // the following while loop will rarely be accessed. - while (final_info.offset == 1.0 && bin < num_samples_padded_ * 2) { - bin++; - FineOptimizeFrequency(signal, bin, 0.0, &info, &final_info); - } - - if (bin <= 1 || bin >= num_samples_padded_ * 2 - 2) { - // If we're in the lowest or next-to-lowest bin, or the highest or - // next-to-highest allowed bin (note, "bin" here is a range, and it can - // never have the value num_samples_padded_ * 2), we tend to get more - // estimation error than usual, so do another round of optimization. - FineOptimizeFrequency(signal, bin, final_info.offset, &info, &final_info); - } - - BaseFloat final_freq = (final_info.bin + final_info.offset) * samp_freq_ / (num_samples_padded_ * 4); - KALDI_VLOG(4) << "Final optimized info is: freq " << final_freq - << ", cos coeff " << final_info.cos_coeff << ", sin coeff " - << final_info.sin_coeff << ", energy " << final_info.energy; - - if (GetVerboseLevel() > 1) - SelfTest(signal, info, final_freq, final_info.energy); - - if (final_info.energy >= min_energy) { - sinusoid->amplitude = std::sqrt(final_info.cos_coeff * final_info.cos_coeff - + final_info.sin_coeff * final_info.sin_coeff); - sinusoid->freq = final_freq; - sinusoid->phase = -std::atan2(final_info.sin_coeff, final_info.cos_coeff); - KALDI_VLOG(4) << "Phase is " << sinusoid->phase << ", amplitude is " - << sinusoid->amplitude << ", freq is " << sinusoid->freq; - return final_info.energy; - } else { - return 0.0; - } -} - - -/* - This function computes, the original FFT bins, the amount of energy in - the signal that can be explained by a sinusoid at the corresponding frequency. - - Let f be the continuous-valued frequency. - - Define the vector C_f as - C_f = [ c_0, c_1 ... c_n ] where c_k = cos(2 pi k f / samp_freq). [obviously this notation depends on f]. - and S_f the same thing with sin in place of cos. - - Let the signal, as a vector, be V. - We want to maximize the (positive) energy-difference: - ||V||^2 - || V - c C_f - s S_f ||^2 - where c and s are the coefficients of C_f and S_f. - This quantity can be expanded as follows, where . means dot product. - \delta E = -c^2 C_f.C_f - s^2 S_f.S_f - 2 c s C_f.S_f + 2 c V.C_f + 2 s V.S_f. - which can be written as follows, where . means dot-product and ' means transpose: - \delta E = 2 [c s] v - [c s] M [c s]' - where M = [ C_f.C_f, C_f.S_f, C_f.S_f, S_f.S_f ], - and v = [V.C_f, V.S_f]. - If M is invertible (i.e. for nonzero frequencies), this is maximized by - [c s] = M^-1 v - giving us the value. - \delta E = v' M^{-1} v. - We'll compute the inverse of M in advance, inside ComputeCoefficients(), using - the formula [a b;c d]^-1 = 1/(ad - bc) [d -b; -c a] For zero frequency and at the - Nyquist, M has the value [ a 0; 0 0 ], and we have the same type of expression - limited to the first dim of v, i.e. Minv = [ a^{-1} 0; 0 0 ], a kind of pseudo-inverse. - */ - -void SinusoidDetector::ComputeCoarseInfo( - const Vector &fft, - std::vector *info) const { - info->resize(num_samples_padded_ * 2 + 1); // 4 times resolution of FFT itself. - - const BaseFloat *fft_data = fft.Data(); - - int32 num_bins = num_samples_padded_ / 2 + 1; - for (int32 k = 0; k < num_bins; k++) { - BaseFloat real, im; - if (k == 0) { - real = fft_data[0]; - im = 0.0; - } else if (k == num_samples_padded_ / 2) { - real = fft_data[1]; - im = 0.0; - } else { - real = fft_data[k * 2]; - im = fft_data[k * 2 + 1]; - } - // v1 and v2 are the two components of the vector v in the math above. - BaseFloat v1 = real, v2 = -im; - // Minv_'s row indexes correspond to frequencies with 4 times more - // resolution than the FFT bins. - const BaseFloat *Minv_data = Minv_.RowData(k * 4); - // The Matrix M^{-1} is of the form [a b; b d] - BaseFloat a = Minv_data[0], b = Minv_data[1], d = Minv_data[2]; - // compute \delta E = v' M^{-1} v. - BaseFloat delta_e = v1 * v1 * a + v2 * v2 * d + 2 * v1 * v2 * b; - InfoForBin &this_info = (*info)[k * 4]; - this_info.valid = true; - this_info.cos_dot = real; - this_info.sin_dot = -im; - this_info.energy = delta_e; - } -} - - -void SinusoidDetector::ComputeCoefficients() { - int32 num_samp = num_samples_; - int32 num_freq = num_samples_padded_ * 2 + 1; - cos_.Resize(num_freq, num_samp); - sin_.Resize(num_freq, num_samp); - - Vector cc(num_freq), cs(num_freq); - for (int32 k = 0; k < num_freq; k++) { - BaseFloat freq = k * samp_freq_ / (num_samples_padded_ * 4); - SubVector c(cos_, k), s(sin_, k); - CreateCosAndSin(samp_freq_, freq, &c, &s); - cc(k) = VecVec(c, c); - cs(k) = VecVec(c, s); - } - - M_.Resize(num_freq, 3, kUndefined); - Minv_.Resize(num_freq, 3, kUndefined); - - for (int32 k = 0; k < num_freq; k++) { - // Let the matrix M be [ a b; b d ]. [we don't write c because c == b]. - // We want to compute Minv_. - BaseFloat a = cc(k), b = cs(k), d = num_samples_ - a; - M_(k, 0) = a; - M_(k, 1) = b; - M_(k, 2) = d; - if (k == 0 || k == num_freq - 1) { - // this is a special case; it's not really the inverse of M but it will - - // give us the expression we want; it's like an inverse in just one dimension. - Minv_(k, 0) = 1.0 / a; - Minv_(k, 1) = 0.0; - Minv_(k, 2) = 0.0; - } else { - BaseFloat inv_det = 1.0 / (a * d - b * b); - // check for NaN and inf. - KALDI_ASSERT(inv_det == inv_det && inv_det - inv_det == 0.0); - // use: [a b;c d]^-1 = 1/(ad - bc) [d -b; -c a], special case where c = b. - BaseFloat inv_a = d * inv_det, inv_b = -b * inv_det, inv_d = a * inv_det; - Minv_(k, 0) = inv_a; - Minv_(k, 1) = inv_b; - Minv_(k, 2) = inv_d; - } - } -} - - -// Does fine optimization of the frequency within this bin; returns the -// final energy, the optimized frequency, and the cos and sin coefficients. -void SinusoidDetector::FineOptimizeFrequency( - const VectorBase &signal, - int32 bin, - BaseFloat bin_offset, - std::vector *info_in, - OptimizedInfo *opt_info) const { - std::vector &info = *info_in; - if (!info[bin].valid) ComputeBinInfo(signal, bin, &(info[bin])); - if (!info[bin+1].valid) ComputeBinInfo(signal, bin+1, &(info[bin+1])); - - const BaseFloat epsilon = 0.02, delta = 0.001; - - // If the offset is very close to the edges of the bin, move it - // closer to the center. Otherwise we may have problems with the - // steps below. The initial offset is only used as a starting point - // anyway, so this won't affect the final value much. - if (bin_offset < epsilon) - bin_offset = epsilon; - if (bin_offset > 1.0 - epsilon) - bin_offset = 1.0 - epsilon; - KALDI_VLOG(4) << "Initial bin offset = " << bin_offset << ", bin = " << bin; - - // create cos and sin waves of the specified frequency. - BaseFloat freq = (bin + bin_offset) * samp_freq_ / (num_samples_padded_ * 4); - Vector c(num_samples_, kUndefined), s(num_samples_, kUndefined); - CreateCosAndSin(samp_freq_, freq, &c, &s); - - // these a, b and d values are the elements of the M matrix at this frequency - // "freq", i.e. the matrix M_f [ a b; b d ]. This will be invertible because - // we have ensured that the frequency is not too close to zero or the Nyquist. - BaseFloat a = VecVec(c, c), b = VecVec(c, s), d = num_samples_ - a; - BaseFloat inv_det = 1.0 / (a * d - b * b); - BaseFloat inv_a = d * inv_det, inv_b = -b * inv_det, inv_d = a * inv_det; - - - BaseFloat v1 = VecVec(c, signal), v2 = VecVec(s, signal); - - BaseFloat delta_e = v1 * v1 * inv_a + v2 * v2 * inv_d + 2 * v1 * v2 * inv_b; - - KALDI_VLOG(4) << "Actual energy-change at frequency " << freq << " is " - << delta_e; - // "freq" is frequency somewhere in the middle of the bin. - - BaseFloat final_offset, final_energy; - QuadraticMaximize(bin_offset, info[bin].energy, delta_e, info[bin+1].energy, - &final_offset, &final_energy); - - KALDI_VLOG(4) << "After further optimizing, offset was " << final_offset - << " giving freq " - << ((bin+final_offset) * samp_freq_ / (num_samples_padded_*4)) - << ", with energy " << final_energy; - - // Use interpolation (using a quadratic function) to get the entries of the M matrix - // the the final, tuned frequency. Interpolation on M is better than M^{-1}, as its - // elements are much better behaved as the frequency varies. - const BaseFloat *M_left_data = M_.RowData(bin), - *M_right_data = M_.RowData(bin + 1); - - BaseFloat a_interp = QuadraticInterpolate(bin_offset, M_left_data[0], a, M_right_data[0], - final_offset); - BaseFloat b_interp = QuadraticInterpolate(bin_offset, M_left_data[1], b, M_right_data[1], - final_offset); - BaseFloat d_interp = QuadraticInterpolate(bin_offset, M_left_data[2], d, M_right_data[2], - final_offset); - - // Now get the inverse of the M matrix at the final point. - BaseFloat a_inv_interp, b_inv_interp, d_inv_interp; - - if ((bin == 0 && final_offset < delta) || - (bin == num_samples_padded_ * 2 && final_offset > 1.0 - delta)) { - // If we're extremely close to zero or the Nyquist, we'll have trouble - // inverting M; just invert in the 1st dimension (only have a cos - // component). - a_inv_interp = 1.0 / a_interp; - b_inv_interp = 0.0; - d_inv_interp = 0.0; - } else { - BaseFloat inv_det = 1.0 / (a_interp * d_interp - b_interp * b_interp); - // check for NaN and inf. - KALDI_ASSERT(inv_det == inv_det && inv_det - inv_det == 0.0); - // use: [a b;c d]^-1 = 1/(ad - bc) [d -b; -c a], special case where c = b. - a_inv_interp = d_interp * inv_det; - b_inv_interp = -b_interp * inv_det; - d_inv_interp = a_interp * inv_det; - } - - BaseFloat v1_interp = QuadraticInterpolate(bin_offset, info[bin].cos_dot, v1, - info[bin+1].cos_dot, final_offset); - BaseFloat v2_interp = QuadraticInterpolate(bin_offset, info[bin].sin_dot, v2, - info[bin+1].sin_dot, final_offset); - - opt_info->bin = bin; - opt_info->offset = final_offset; - // Recompute the energy-reduction using the more accurate interpolated values of - // v1 and v2 (the dot-products of the cos and sin with the signal), and - // of M. - opt_info->energy = v1_interp * v1_interp * a_inv_interp + - v2_interp * v2_interp * d_inv_interp + - 2 * v1_interp * v2_interp * b_inv_interp; - // Compute the coefficients of the cos and sin in the optimal sinusoid, as - // M^{-1} v. - opt_info->cos_coeff = a_inv_interp * v1_interp + b_inv_interp * v2_interp; - opt_info->sin_coeff = b_inv_interp * v1_interp + d_inv_interp * v2_interp; -} - -void SinusoidDetector::FindCandidateBins( - BaseFloat min_energy, - const std::vector &info, - std::vector *bins) const { - - int32 max_bin = num_samples_padded_ * 2; - - BaseFloat cutoff = min_energy; - for (int32 k = 0; k <= max_bin; k += 4) { - KALDI_ASSERT(info[k].valid); - cutoff = std::max(cutoff, info[k].energy); - } - - for (int32 k = 0; k < max_bin; k += 4) { - BaseFloat energy_upper_bound = - factor1_ * std::max(info[k].energy, - info[k+4].energy); - if (energy_upper_bound >= cutoff) - bins->push_back(k + 2); - } -} - - -void SinusoidDetector::FindCandidateBins2( - BaseFloat min_energy, - const std::vector &info, - std::vector *bins2) const { - - int32 max_bin = num_samples_padded_ * 2; - - BaseFloat cutoff = min_energy; - for (int32 k = 0; k <= max_bin; k += 2) { - if (info[k].valid) - cutoff = std::max(cutoff, info[k].energy); - } - - for (int32 k = 0; k < max_bin; k += 2) { - if (info[k].valid && info[k+2].valid) { - BaseFloat energy_upper_bound = - factor2_ * std::max(info[k].energy, - info[k+2].energy); - if (energy_upper_bound >= cutoff) - bins2->push_back(k + 1); - } - } -} - - -void SinusoidDetector::ComputeBinInfo( - const VectorBase &signal, - int32 bin, - InfoForBin *info) const { - KALDI_ASSERT(!info->valid); // or wasted time. - info->valid = true; - BaseFloat v1 = info->cos_dot = VecVec(cos_.Row(bin), signal); - BaseFloat v2 = info->sin_dot = VecVec(sin_.Row(bin), signal); - const BaseFloat *Minv_data = Minv_.RowData(bin); - BaseFloat a = Minv_data[0], b = Minv_data[1], d = Minv_data[2]; - // compute \delta E = v' M^{-1} v. - BaseFloat delta_e = v1 * v1 * a + v2 * v2 * d + 2 * v1 * v2 * b; - info->energy = delta_e; -} - - -MultiSinusoidDetector::MultiSinusoidDetector( - const MultiSinusoidDetectorConfig &config, - int32 sampling_freq): - config_(config), - sample_freq_(sampling_freq), - samples_per_frame_subsampled_(0.001 * config.frame_length_ms * - static_cast(config.subsample_freq)), - waveform_finished_(false), - samples_consumed_(0), - resampler_(sampling_freq, config.subsample_freq, - config.subsample_filter_cutoff, config.subsample_filter_zeros), - detector_(config.subsample_freq, samples_per_frame_subsampled_) { - config.Check(); -} - - -void MultiSinusoidDetector::Reset() { - waveform_finished_ = false; - samples_consumed_ = 0; - while(!subsampled_signal_.empty()) { - delete subsampled_signal_.front(); - subsampled_signal_.pop_front(); - } - resampler_.Reset(); -} - -void MultiSinusoidDetector::WaveformFinished() { - KALDI_ASSERT(!waveform_finished_ && - "WaveformFinished() called twice."); - - Vector empty_waveform; - subsampled_signal_.push_back(new Vector()); - bool flush = true; - resampler_.Resample(empty_waveform, flush, - subsampled_signal_.back()); - waveform_finished_ = true; - if (subsampled_signal_.back()->Dim() == 0) { - delete subsampled_signal_.back(); - subsampled_signal_.pop_back(); - } -} - -void MultiSinusoidDetector::AcceptWaveform( - const VectorBase &waveform) { - - - subsampled_signal_.push_back(new Vector()); - bool flush = false; - resampler_.Resample(waveform, flush, - subsampled_signal_.back()); - if (subsampled_signal_.back()->Dim() == 0) { - delete subsampled_signal_.back(); - subsampled_signal_.pop_back(); - } -} - -int32 MultiSinusoidDetector::NumSubsampledSamplesReady(int32 max_samp) const { - KALDI_ASSERT(samples_consumed_ >= 0 && - ((subsampled_signal_.empty() && samples_consumed_ == 0) || - (!subsampled_signal_.empty () && samples_consumed_ < - subsampled_signal_[0]->Dim()))); - - int32 ans = -samples_consumed_; - for (size_t i = 0; i < subsampled_signal_.size(); i++) { - ans += subsampled_signal_[i]->Dim(); - if (ans > max_samp) break; - } - KALDI_ASSERT(ans >= 0); - return std::min(ans, max_samp); -} - -bool MultiSinusoidDetector::Done() const { - int32 samp_ready = NumSubsampledSamplesReady(samples_per_frame_subsampled_); - if ((samp_ready >= samples_per_frame_subsampled_ && !waveform_finished_) || - (samp_ready > 0 && waveform_finished_)) - return false; - else - return true; -} - -void MultiSinusoidDetector::GetNextFrameOfSignal(Vector *frame) { - frame->Resize(samples_per_frame_subsampled_, kUndefined); - - int32 sample_offset = 0, - samples_needed = samples_per_frame_subsampled_; - while (samples_needed > 0 && - !subsampled_signal_.empty()) { - Vector *src = subsampled_signal_.front(); - int32 num_samples_avail = src->Dim() - samples_consumed_; - KALDI_ASSERT(num_samples_avail > 0); - int32 chunk_size = std::min(num_samples_avail, - samples_needed); - frame->Range(sample_offset, chunk_size).CopyFromVec( - src->Range(samples_consumed_, chunk_size)); - sample_offset += chunk_size; - samples_needed -= chunk_size; - samples_consumed_ += chunk_size; - if (samples_consumed_ == src->Dim()) { - samples_consumed_ = 0; - delete src; - subsampled_signal_.pop_front(); - } - } - if (samples_needed > 0) { - KALDI_ASSERT(waveform_finished_ && sample_offset > 0); // or code error. - frame->Range(sample_offset, samples_needed).SetZero(); - } -} - - -void MultiSinusoidDetector::GetNextFrame(MultiSinusoidDetectorOutput *output) { - Vector frame; - GetNextFrameOfSignal(&frame); - // Mean subtraction - frame.Add(-1.0 * frame.Sum() / frame.Dim()); - *output = MultiSinusoidDetectorOutput(); // reset to default. - - BaseFloat signal_energy = VecVec(frame, frame); - output->tot_energy = signal_energy / frame.Dim(); - if (signal_energy == 0.0) return; - - // min_energy1 is the lowest energy we might care about. - BaseFloat min_energy1 = signal_energy * - std::min(config_.two_freq_min_total_energy * 0.5, - config_.one_freq_min_energy); - - Sinusoid sinusoid1; - BaseFloat energy1 = detector_.DetectSinusoid(min_energy1, - frame, - &sinusoid1); - - if (energy1 == 0.0) return; // Nothing detected. - - // we only care about the 2nd sinusoid if - // energy1 + energy2 >= signal_energy * two_freq_min_total_energy, - // and energy2 >= signal_energy * config.two_freq_min_energy. - - BaseFloat min_energy2 = - std::max(signal_energy * config_.two_freq_min_energy, - signal_energy * config_.two_freq_min_total_energy - - energy1); - - BaseFloat energy2; - Sinusoid sinusoid2; - - // If there is enough energy left in the signal that we could - // possibly detect a sinusoid of energy at least min_energy2... - if (min_energy2 <= signal_energy - energy1) { - sinusoid1.phase += M_PI; // reverse the phase. - AddSinusoid(config_.subsample_freq, sinusoid1, &frame); - - - energy2 = detector_.DetectSinusoid(min_energy2, - frame, - &sinusoid2); - - if (energy2 > energy1) { - // The following is just for our information, so we are aware - // when the sinusoid detection gives us the non-optimal sinusoid - // first. - BaseFloat factor = energy2 / energy1; - KALDI_VLOG(2) << "Second sinusoid greater than first by a factor of " - << factor << ". (This means sinusoid detection is not " - << " working ideally)."; - } - - if (DetectedTwoFrequency(signal_energy, - sinusoid1, energy1, - sinusoid2, energy2, - output)) - return; - } else { - energy2 = 0.0; - } - // We don't need the return status of the following; we just return anyway. - DetectedOneFrequency(signal_energy, - sinusoid1, energy1, - sinusoid2, energy2, - output); -} - -// acceptable two-frequency tone. -bool MultiSinusoidDetector::DetectedTwoFrequency( - BaseFloat signal_energy, - const Sinusoid &sinusoid1, - BaseFloat energy1, - const Sinusoid &sinusoid2, - BaseFloat energy2, - MultiSinusoidDetectorOutput *output) { - - if (energy1 + energy2 >= signal_energy * - config_.two_freq_min_total_energy && - std::min(energy1, energy2) >= signal_energy * - config_.two_freq_min_energy && - std::min(sinusoid1.freq, sinusoid2.freq) >= config_.min_freq && - std::max(sinusoid1.freq, sinusoid2.freq) <= config_.max_freq) { - output->freq1 = sinusoid1.freq; - output->energy1 = energy1 / signal_energy; - output->freq2 = sinusoid2.freq; - output->energy2 = energy2 / signal_energy; - if (output->freq1 > output->freq2) { - std::swap(output->freq1, output->freq2); - std::swap(output->energy1, output->energy2); - } - return true; - } else { - return false; - } -} - - -// acceptable two-frequency tone. -bool MultiSinusoidDetector::DetectedOneFrequency( - BaseFloat signal_energy, - const Sinusoid &sinusoid1, - BaseFloat energy1, - const Sinusoid &sinusoid2, - BaseFloat energy2, - MultiSinusoidDetectorOutput *output) { - // If sinusoid detection were performing exactly to spec, we could assume - // energy1 >= energy2, but we don't assume this as it's not guaranteed. - if (energy1 > energy2 && energy1 > signal_energy * - config_.one_freq_min_energy && - sinusoid1.freq >= config_.min_freq && - sinusoid1.freq <= config_.max_freq) { - output->freq1 = sinusoid1.freq; - output->energy1 = energy1 / signal_energy; - output->freq2 = 0.0; - output->energy2 = 0.0; - return true; - } else if (energy2 > energy1 && energy2 > signal_energy * - config_.one_freq_min_energy && - sinusoid2.freq >= config_.min_freq && - sinusoid2.freq <= config_.max_freq) { - output->freq1 = sinusoid2.freq; - output->energy1 = energy2 / signal_energy; - output->freq2 = 0.0; - output->energy2 = 0.0; - return true; - } else { - return false; - } -} - - -void DetectSinusoids(const VectorBase &signal, - MultiSinusoidDetector *detector, - Matrix *output) { - std::vector output_vec; - detector->AcceptWaveform(signal); - detector->WaveformFinished(); - - int32 safety_margin = 10, approx_num_frames = safety_margin + - (signal.Dim() / (detector->SamplingFrequency() * - detector->FrameShiftSecs())); - output_vec.reserve(approx_num_frames); - while (!detector->Done()) { - output_vec.resize(output_vec.size() + 1); - detector->GetNextFrame(&(output_vec.back())); - } - detector->Reset(); - if (output_vec.empty()) { - output->Resize(0, 0); - } else { - output->Resize(output_vec.size(), 5, kUndefined); - for (int32 i = 0; i < output->NumRows(); i++) { - BaseFloat *row_data = output->RowData(i); - MultiSinusoidDetectorOutput &this_output = output_vec[i]; - row_data[0] = this_output.tot_energy; - row_data[1] = this_output.freq1; - row_data[2] = this_output.energy1; - row_data[3] = this_output.freq2; - row_data[4] = this_output.energy2; - } - } -} - - -} // namespace kaldi - diff --git a/src/feat/sinusoid-detection.h b/src/feat/sinusoid-detection.h deleted file mode 100644 index f6addc0b530..00000000000 --- a/src/feat/sinusoid-detection.h +++ /dev/null @@ -1,436 +0,0 @@ -// feat/sinusoid-detection.h - -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_FEAT_SINUSOID_DETECTION_H_ -#define KALDI_FEAT_SINUSOID_DETECTION_H_ - - -#include "base/kaldi-error.h" -#include "matrix/matrix-lib.h" -#include "util/common-utils.h" -#include "feat/resample.h" -#include - -namespace kaldi { -/// @addtogroup feat FeatureExtraction -/// @{ - - -struct Sinusoid { - // this structure used to represent a sinusoid of type amplitude cos (2 pi - // freq t + phase), in the SinusoidDetector code. - BaseFloat amplitude; - BaseFloat freq; - BaseFloat phase; - Sinusoid(BaseFloat a, BaseFloat f, BaseFloat p): - amplitude(a), freq(f), phase(p) { } - Sinusoid() {} -}; - - -// This function adds the given sinusoid to the signal, as: -// (*signal)(t) += amplitude * cos(2 pi freq/samp_freq t + phase). -void AddSinusoid(BaseFloat samp_freq, - const Sinusoid &sinusoid, - VectorBase *signal); - - -class SinusoidDetector { - public: - SinusoidDetector(BaseFloat samp_freq, - int32 num_samp); - - - // Detect the dominant sinusoid component in the signal, as long as the - // energy-reduction of the signal from subtracting that sinuoid would be >= - // "min_energy_change", and return that energy reduction; or zero if no - // candidate was found. - // non-const because the FFT class has a temporary buffer. - BaseFloat DetectSinusoid(BaseFloat min_energy_change, - const VectorBase &signal, - Sinusoid *sinusoid); - - // This function does quadratic interpolation for a function that is known at - // three equally spaced points [x0 x1 x2] = [0 1 2], and we want the x-value - // and corresponding y-value at the maximum of the function within the range - // 0 <= x <= 2. It's public for testing reasons. - static void QuadraticMaximizeEqualSpaced( - BaseFloat y0, BaseFloat y1, BaseFloat y2, - BaseFloat *x, BaseFloat *y); - - - // This function does quadratic interpolation for a function that is known at - // three points x0, x1 and x2 with x0 = 0, 0 < x1 < 1 and x2 = 1, where we - // want the x-value and corresponding y-value at the maximum of the function - // within the range 0 <= x <= 1. It's public for testing reasons. - static void QuadraticMaximize( - BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2, - BaseFloat *x, BaseFloat *y); - - // This function does quadratic interpolation for a function that is known at - // three points x0, x1 and x2 with x0 = 0, 0 <= x1 <= 1 and x2 = 1, where - // we want the value at a specific value x. The corresponding y-value is returned. - static BaseFloat QuadraticInterpolate( - BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2, - BaseFloat x); - - - private: - BaseFloat samp_freq_; - int32 num_samples_; - int32 num_samples_padded_; // Number of samples, after zero-padding to power of 2. - SplitRadixRealFft fft_; // Object used to compute FFT of padded_signal_. - - BaseFloat factor1_; // When we search the range between two FFT bins, we - // assume that the maximum energy-reduction within the - // range may be greater than the maximum of the - // energy-reductions at either side, by at most - // "factor1", with factor1 > 1.0. The analysis is quite - // hard so we determine this factor empirically. Making - // this as small as possible helps us avoid searching too - // many bins. - - BaseFloat factor2_; // As factor1, but for searches within a half-fft-bin - // range. Again determined empirically. After that we - // use quadratic interpolation to find the maximum energy. - - // This matrix, of dimension (num_samples_padded_ * 2 + 1) by - // num_samples_, has in each row, a different frequency of cosine wave. - Matrix cos_; - // This matrix, of dimension (num_samples_padded_ * 2 + 1) by - // num_samples_, has in each row, a different frequency of sine wave. - Matrix sin_; - - // M_ is a precomputed matrix of dimension (num_samples_padded_ * 2 + 1) by 3, - // containing the values x y z of a symmetric matrix [ a b; b c ]. There is - // one of these matrices for each frequency, sampled at one quarter the - // spacing of the FFT bins. There is a long comment next to the definition of - // ComputeCoefficients that describes this. - Matrix M_; - - // Minv_ is the coefficients in the same format as M_, but containing the - // corresponding coefficients of the inverse matrix. There is a long comment - // next to the definition of ComputeCoefficients that describes this. - Matrix Minv_; - - - struct InfoForBin { - bool valid; - BaseFloat cos_dot; // dot product of signal with cosine on left frequency - BaseFloat sin_dot; // dot product of signal with sine on left frequency - BaseFloat energy; // energy. - InfoForBin(): valid(false) { } - }; - - // Info after fine optimization within a bin. - struct OptimizedInfo { - int32 bin; - BaseFloat offset; - BaseFloat energy; - BaseFloat cos_coeff; - BaseFloat sin_coeff; - }; - - // Compute the coefficients and energies at the original FFT bins (every - // fourth entry in "info"). - void ComputeCoarseInfo(const Vector &fft, - std::vector *info) const; - - - // After the coarse-level info is computed using ComputeCoarseInfo, finds a - // set of intermediate bin indexes to compute, that are the midpoints of - // coarse-level bins. - void FindCandidateBins(BaseFloat min_energy, - const std::vector &info, - std::vector *bins) const; - - void FindCandidateBins2(BaseFloat min_energy, - const std::vector &info, - std::vector *bins) const; - - - void ComputeBinInfo(const VectorBase &signal, - int32 bin, InfoForBin *info) const; - - - // For each bin b such that we have valid "info" data for bins b, b+1 and b+2, - // does quadratic interpolation to find the maximum predicted energy in the - // range [b, b+2]. The location of the maximum predicted energy is output to - // "bin_out" and "offset_out", and the corresponding predicted energy is - // returned. - // - // Note: if there are two different frequencies with similar maximum energies - // (e.g. within a factor of probably around 1.2 or so), the fact that - // OptimizeFrequency only returns one maximum may potentially lead to the - // smaller maximum being output. We could have modified this to output - // multiple different maxima, which could have been more accurate in terms of - // being guaranteed to output the best maximum, but this probably wouldn't - // have a measurable impact on our application so we haven't bothered. - BaseFloat OptimizeFrequency( - const std::vector &info, - int32 *bin_out, - BaseFloat *offset_out) const; - - - // This function does - // (*cos)(t) = cos(2 pi t freq / samp_freq) - // (*sin)(t) = sin(2 pi t freq / samp_freq) - static void CreateCosAndSin(BaseFloat samp_freq, - BaseFloat freq, - VectorBase *cos, - VectorBase *sin); - - // Do fine optimization of the frequency within a bin, given a reasonable - // approximate position within it based on interpolation (that should be close - // to the optimum). - void FineOptimizeFrequency( - const VectorBase &signal, - int32 bin, - BaseFloat offset, - std::vector *info, - OptimizedInfo *opt_info) const; - - // Computes the coefficients cos_, sin_, and Minv_. - void ComputeCoefficients(); - - // Calls some self-testing code that prints warnings if - // some of our assumptions were wrong. - void SelfTest(const VectorBase &signal, - const std::vector &info, - BaseFloat final_freq, - BaseFloat final_energy); - -}; - - - -/** - This configuration class is for the frame-by-frame detection of - cases where there are one or two sinusoids that can explain - a lot of the energy in the signal. -*/ -struct MultiSinusoidDetectorConfig { - - // frame length in milliseconds - BaseFloat frame_length_ms; - // frame shift in milliseconds - BaseFloat frame_shift_ms; - - // Proportion of the total energy of the signal that the quieter of - // the two sinusoids must comprise, in order to be counted, if two - // sinusoids are detected. - BaseFloat two_freq_min_energy; - - // Proportion of the total energy of the signal that both sinusoids (if - // two are detected) must comprise, in order to be output. - BaseFloat two_freq_min_total_energy; - - // Proportion of the total energy of the signal that a single sinusoid - // must comprise, in order to be output, if we are considering - // reporting a single sinusoid. Note: detection of two sinusoids - // will take precedence over detection of a single sinusoid. - BaseFloat one_freq_min_energy; - - // Lower end of frequency range that we consider; frequencies outside - // this range are not candidates to appear in the detected output. - BaseFloat min_freq; - // Upper end of frequency range that we consider, see min_freq. - BaseFloat max_freq; - - // Frequency to which we subsample the signal before processing it. - // Must be integer because of how LinearResample code works. - int32 subsample_freq; - - // Filter cut-off frequency used in sub-sampling. - BaseFloat subsample_filter_cutoff; - - // the following is not critical and is not exported to the - // command line. - int32 subsample_filter_zeros; - - MultiSinusoidDetectorConfig(): - frame_length_ms(20), frame_shift_ms(10), - two_freq_min_energy(0.2), two_freq_min_total_energy(0.6), - one_freq_min_energy(0.75), min_freq(300.0), - max_freq(1800.0), subsample_freq(4000), - subsample_filter_cutoff(1900.0), subsample_filter_zeros(5) {} - - void Register(OptionsItf *opts) { - opts->Register("frame-length", &frame_length_ms, - "Frame length in milliseconds"); - opts->Register("frame-shift", &frame_shift_ms, - "Frame shift in milliseconds"); - opts->Register("two-freq-min-energy", &two_freq_min_energy, - "For detecting two-frequency tones, minimum energy that " - "the quieter frequency must have (relative to total " - "enegy of frame)"); - opts->Register("two-freq-min-total-energy", &two_freq_min_total_energy, - "For detecting two-frequency tones, minimum energy that " - "the two frequencies together must have (relative to total " - "energy of frame)"); - opts->Register("one-freq-min-energy", &one_freq_min_energy, "For detecting " - "single-frequency tones, minimum energy that the frequency " - "must have relative to total energy of frame"); - opts->Register("min-freq", &min_freq, "Minimum frequency of sinusoid that " - "will be detected"); - opts->Register("max-freq", &max_freq, "Maximum frequency of sinusoid that " - "will be detected"); - opts->Register("subsample-freq", &subsample_freq, "Frequency at which " - "we subsample the signal"); - opts->Register("subsample-filter-cutoff", &subsample_filter_cutoff, "Filter " - "cut-off frequency used in subsampling"); - } - void Check() const { - KALDI_ASSERT(frame_length_ms > 0 && frame_length_ms >= frame_shift_ms && - min_freq > 0 && max_freq > min_freq && - subsample_filter_cutoff > max_freq && - subsample_freq/2 > subsample_filter_cutoff && - subsample_filter_zeros > 2 && - subsample_filter_cutoff > 0.25 * subsample_freq && - two_freq_min_total_energy > two_freq_min_energy && - two_freq_min_energy <= 0.5 * two_freq_min_total_energy); - BaseFloat samples_per_frame_shift = - frame_shift_ms * 0.001 * subsample_freq; - // The following assert ensures that the frame-shift is an exact - // number of samples, so that the locations of the frames - // don't gradually drift out of sync. - KALDI_ASSERT(fabs(samples_per_frame_shift - - static_cast(samples_per_frame_shift)) < - 0.001); - - } -}; - -struct MultiSinusoidDetectorOutput { - BaseFloat tot_energy; // Total energy per sample of this frame (sum-square of - // signal divided by number of samples... this is after - // downsampling and mean subtraction. - BaseFloat freq1; // Lower frequency detected, or 0 if none detected. - BaseFloat energy1; // Energy of lower frequency divided by total energy, or 0 - // if none detected. - BaseFloat freq2; // Lower frequency detected, or 0 if zero or one - // frequencies detected. - BaseFloat energy2; // Energy of higher frequency divided by total energy, or 0 - // if zero or one freqencies detected. - MultiSinusoidDetectorOutput(): tot_energy(0.0), freq1(0.0), - energy1(0.0), freq2(0.0), energy2(0.0) { } -}; - - -class MultiSinusoidDetector { - public: - - // Initialize sinusoid detector. Sampling frequency must be integer. - MultiSinusoidDetector(const MultiSinusoidDetectorConfig &config, - int32 sampling_freq); - - /// This is how the class acccepts its input. You can put the waveform in - /// piece by piece, if it's an online application. - void AcceptWaveform(const VectorBase &waveform); - - /// The user calls this to announce to the class that the waveform has ended; - /// this forces any pending data to be flushed. - void WaveformFinished(); - - /// Resets the state of the class so you can start processing another waveform. - void Reset(); - - /// This returns true if the class currently has no more data ready to output. - bool Done() const; - - /// Outputs the next frame of output to "frame", which must be non-NULL. - /// It is an error to call this if Done() has returned true, or has not been - /// checked. - void GetNextFrame(MultiSinusoidDetectorOutput *output); - - BaseFloat FrameShiftSecs() const { return 0.001 * config_.frame_shift_ms; } - - BaseFloat SamplingFrequency() const { return sample_freq_; } - - private: - // Gets the next frame of subsampled signal, and consumes the appropriate - // amount of stored data. It is an error to call this if Done() returned - // true. - void GetNextFrameOfSignal(Vector *frame); - - // returns true and sets freq1, freq1, energy1 and energy2 in "output" if we - // successfully detected an acceptable two-frequency tone. - bool DetectedTwoFrequency(BaseFloat signal_energy, - const Sinusoid &sinusoid1, - BaseFloat energy1, - const Sinusoid &sinusoid2, - BaseFloat energy2, - MultiSinusoidDetectorOutput *output); - - // returns true and sets freq1, freq1, energy1 and energy2 in "output" if we - // successfully detected an acceptable one-frequency tone. - bool DetectedOneFrequency(BaseFloat signal_energy, - const Sinusoid &sinusoid1, - BaseFloat energy1, - const Sinusoid &sinusoid2, - BaseFloat energy2, - MultiSinusoidDetectorOutput *output); - - - // Returns std::min(max_samp, sum-of-samples-in-subsampled_signal_). - // (the std::min is for efficiency so we don't have to visit the - // whole list). - int32 NumSubsampledSamplesReady(int32 max_samp) const; - - MultiSinusoidDetectorConfig config_; - int32 sample_freq_; - int32 samples_per_frame_subsampled_; // (samples per frame at subsampled - // rate). - - // True if the user has called WaveformFinished(). - bool waveform_finished_; - - // Pieces of the subsampled signal that are awaiting processing. - // Normally there will be just one element here, but if someone calls - // AcceptWaveform multiple times before getting output, there could - // be more elements. All of these pieces are nonempty. - std::deque* > subsampled_signal_; - - // stores the number of samples consumed from the first member of - // subsampled_signal_. We will always have samples_consumed_ >= 0 and either - // (subsampled_signal_.empty() && samples_consumed_ == 0) or - // samples_consumed_ < subsampled_signal_[0]->Dim(). - int32 samples_consumed_; - - - // This object is used to subsample the signal. - LinearResample resampler_; - - // This object is used to detect sinusoids in the subsampled - // frames. - SinusoidDetector detector_; -}; - -// Detect sinusoids. Signal should be sampled at detector->SamplingFrequency(). -void DetectSinusoids(const VectorBase &signal, - MultiSinusoidDetector *detector, - Matrix *output); - - - - - -/// @} End of "addtogroup feat" -} // namespace kaldi -#endif // KALDI_FEAT_SINUSOID_DETECTION_H_ diff --git a/src/featbin/Makefile b/src/featbin/Makefile index dc2bea215d8..c51867b7d4c 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -14,8 +14,8 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \ apply-cmvn-sliding compute-cmvn-stats-two-channel compute-kaldi-pitch-feats \ process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \ compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \ - wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \ - concat-feats append-post-to-feats post-to-feats + wav-reverberate append-vector-to-feats shift-feats concat-feats \ + append-post-to-feats post-to-feats OBJFILES = diff --git a/src/featbin/detect-sinusoids.cc b/src/featbin/detect-sinusoids.cc deleted file mode 100644 index 6c104d5ab5f..00000000000 --- a/src/featbin/detect-sinusoids.cc +++ /dev/null @@ -1,113 +0,0 @@ -// featbin/detect-sinusoids.cc - -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "feat/sinusoid-detection.h" -#include "feat/wave-reader.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - const char *usage = - "Detect sinusoids (one or two at a time) in waveform input and output\n" - "frame-by-frame information on their frequencies and energies. Useful\n" - "as part of DTMF and dialtone detection. Output is an archive of\n" - "matrices; for each file, there is a row per frame, containing\n" - " \n" - "where the frequencies and energies may be zero if no sufficiently\n" - "dominant sinusoid(s) was/were detected. If two frequencies were\n" - "detected, frequency1 < frequency2. See options for more detail on\n" - "configuration options.\n" - "\n" - "Usage: detect-sinusoids [options] \n" - "e.g.: detect-sinusoids scp:wav.scp ark,t:sinusoids.ark\n"; - - ParseOptions po(usage); - MultiSinusoidDetectorConfig config; - - config.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string wav_rspecifier = po.GetArg(1), - matrix_wspecifier = po.GetArg(2); - - int32 num_done = 0, num_err = 0; - - SequentialTableReader wav_reader(wav_rspecifier); - BaseFloatMatrixWriter matrix_writer(matrix_wspecifier); - - MultiSinusoidDetector *detector = NULL; - - for (; !wav_reader.Done(); wav_reader.Next()) { - const WaveData &wav_data = wav_reader.Value(); - const Matrix &data = wav_data.Data(); - BaseFloat samp_freq = wav_data.SampFreq(); - int32 num_channels = data.NumRows(); - if (num_channels != 1) { - KALDI_WARN << "detect-sinusoids requires data with one " - << "channel. Recording " << wav_reader.Key() << " has " - << num_channels << ". First select one channel of your " - << "data (e.g. using sox)"; - num_err++; - continue; - } - if (samp_freq < config.subsample_freq) { - KALDI_WARN << "Sampling frequency of data " << wav_reader.Key() - << " is too low " << samp_freq << " < " - << config.subsample_freq << ". Reduce --subsample-freq " - << "if you want to run on this data."; - num_err++; - continue; - } - - if (detector == NULL || - samp_freq != detector->SamplingFrequency()) { - delete detector; - detector = new MultiSinusoidDetector(config, samp_freq); - } - - Matrix output; - DetectSinusoids(data.Row(0), detector, &output); - - if (output.NumRows() == 0) { - KALDI_WARN << "No output for " << wav_reader.Key(); - num_err++; - } else { - matrix_writer.Write(wav_reader.Key(), output); - num_done++; - } - } - delete detector; - KALDI_LOG << "Detected sinusoids in " << num_done << " wave files," - << num_err << " with errors."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/sgmm/Makefile b/src/sgmm/Makefile deleted file mode 100644 index 26996a13116..00000000000 --- a/src/sgmm/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -all: - -OPENFST_CXXFLAGS = -OPENFST_LDLIBS = -include ../kaldi.mk - -TESTFILES = am-sgmm-test estimate-am-sgmm-test fmllr-sgmm-test \ - estimate-am-sgmm-multi-test - -OBJFILES = am-sgmm.o estimate-am-sgmm.o fmllr-sgmm.o sgmm-clusterable.o \ - estimate-am-sgmm-ebw.o estimate-am-sgmm-multi.o decodable-am-sgmm.o - -LIBNAME = kaldi-sgmm -ADDLIBS = ../hmm/kaldi-hmm.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a - -include ../makefiles/default_rules.mk diff --git a/src/sgmm/am-sgmm-test.cc b/src/sgmm/am-sgmm-test.cc deleted file mode 100644 index 8b463a29f0e..00000000000 --- a/src/sgmm/am-sgmm-test.cc +++ /dev/null @@ -1,278 +0,0 @@ -// sgmm/am-sgmm-test.cc - -// Copyright 2012 Arnab Ghoshal -// Copyright 2009-2011 Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "gmm/model-test-common.h" -#include "sgmm/am-sgmm.h" -#include "util/kaldi-io.h" - -using kaldi::AmSgmm; -using kaldi::int32; -using kaldi::BaseFloat; -namespace ut = kaldi::unittest; - -// Tests the initialization routines: InitializeFromFullGmm(), CopyFromSgmm() -// and CopyGlobalsInitVecs(). -void TestSgmmInit(const AmSgmm &sgmm) { - using namespace kaldi; - int32 dim = sgmm.FeatureDim(); - kaldi::SgmmGselectConfig config; - config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss()); - - kaldi::Vector feat(dim); - for (int32 d = 0; d < dim; d++) { - feat(d) = kaldi::RandGauss(); - } - kaldi::SgmmPerFrameDerivedVars frame_vars; - frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(), - sgmm.PhoneSpaceDim()); - - std::vector gselect; - sgmm.GaussianSelection(config, feat, &gselect); - SgmmPerSpkDerivedVars empty; - SgmmPerFrameDerivedVars per_frame; - sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0); - - // First, test the CopyFromSgmm() method: - AmSgmm *sgmm1 = new AmSgmm(); - sgmm1->CopyFromSgmm(sgmm, true); - sgmm1->GaussianSelection(config, feat, &gselect); - sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike1, 1e-4); - delete sgmm1; - - AmSgmm *sgmm2 = new AmSgmm(); - sgmm2->CopyFromSgmm(sgmm, false); - sgmm2->ComputeNormalizers(); - sgmm2->GaussianSelection(config, feat, &gselect); - sgmm2->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike2 = sgmm2->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike2, 1e-4); - delete sgmm2; - - // Next, initialize using the UBM from the current model - AmSgmm *sgmm3 = new AmSgmm(); - sgmm3->InitializeFromFullGmm(sgmm.full_ubm(), sgmm.NumPdfs(), - sgmm.PhoneSpaceDim(), sgmm.SpkSpaceDim()); - sgmm3->ComputeNormalizers(); - sgmm3->GaussianSelection(config, feat, &gselect); - sgmm3->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike3 = sgmm3->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike3, 1e-4); - delete sgmm3; - - // Finally, copy the global parameters from the current model - AmSgmm *sgmm4 = new AmSgmm(); - sgmm4->CopyGlobalsInitVecs(sgmm, sgmm.PhoneSpaceDim(), sgmm.SpkSpaceDim(), - sgmm.NumPdfs()); - sgmm4->ComputeNormalizers(); - sgmm4->GaussianSelection(config, feat, &gselect); - sgmm4->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike4 = sgmm4->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike4, 1e-4); - delete sgmm4; -} - -// Tests the Read() and Write() methods, in both binary and ASCII mode, as well -// as Check(), and methods in likelihood computations. -void TestSgmmIO(const AmSgmm &sgmm) { - using namespace kaldi; - int32 dim = sgmm.FeatureDim(); - kaldi::SgmmGselectConfig config; - config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss()); - - kaldi::Vector feat(dim); - for (int32 d = 0; d < dim; d++) { - feat(d) = kaldi::RandGauss(); - } - kaldi::SgmmPerFrameDerivedVars frame_vars; - frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(), - sgmm.PhoneSpaceDim()); - - std::vector gselect; - sgmm.GaussianSelection(config, feat, &gselect); - SgmmPerSpkDerivedVars empty; - SgmmPerFrameDerivedVars per_frame; - sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0); - - // First, non-binary write - sgmm.Write(kaldi::Output("tmpf", false).Stream(), false, - kaldi::kSgmmWriteAll); - - bool binary_in; - AmSgmm *sgmm1 = new AmSgmm(); - // Non-binary read - kaldi::Input ki1("tmpf", &binary_in); - sgmm1->Read(ki1.Stream(), binary_in); - sgmm1->Check(true); - sgmm1->GaussianSelection(config, feat, &gselect); - sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - - BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike1, 1e-4); - - // Next, binary write - sgmm1->Write(kaldi::Output("tmpfb", true).Stream(), true, - kaldi::kSgmmWriteAll); - delete sgmm1; - - AmSgmm *sgmm2 = new AmSgmm(); - // Binary read - kaldi::Input ki2("tmpfb", &binary_in); - sgmm2->Read(ki2.Stream(), binary_in); - sgmm2->Check(true); - sgmm2->GaussianSelection(config, feat, &gselect); - sgmm2->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike2 = sgmm2->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike2, 1e-4); - delete sgmm2; - - unlink("tmpf"); - unlink("tmpfb"); -} - -void TestSgmmSubstates(const AmSgmm &sgmm) { - using namespace kaldi; - int32 target_substates = 2 * sgmm.NumPdfs(); - kaldi::Vector occs(sgmm.NumPdfs()); - for (int32 i = 0; i < occs.Dim(); i++) - occs(i) = std::fabs(kaldi::RandGauss()) * (kaldi::RandUniform()+1); - AmSgmm *sgmm1 = new AmSgmm(); - sgmm1->CopyFromSgmm(sgmm, false); - sgmm1->SplitSubstates(occs, target_substates, 0.01, 0.2, 1000); - sgmm1->ComputeNormalizers(); - sgmm1->Check(true); - int32 dim = sgmm.FeatureDim(); - kaldi::SgmmGselectConfig config; - config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss()); - kaldi::Vector feat(dim); - for (int32 d = 0; d < dim; d++) { - feat(d) = kaldi::RandGauss(); - } - - std::vector gselect; - sgmm.GaussianSelection(config, feat, &gselect); - - SgmmPerSpkDerivedVars empty; - SgmmPerFrameDerivedVars per_frame; - sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0); - - sgmm1->GaussianSelection(config, feat, &gselect); - sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike1, 1e-2); - - delete sgmm1; -} - -void TestSgmmIncreaseDim(const AmSgmm &sgmm) { - using namespace kaldi; - int32 target_phn_dim = static_cast(1.5 * sgmm.PhoneSpaceDim()); - int32 target_spk_dim = sgmm.PhoneSpaceDim() - 1; - - int32 dim = sgmm.FeatureDim(); - kaldi::SgmmGselectConfig config; - config.full_gmm_nbest = std::min(config.full_gmm_nbest, sgmm.NumGauss()); - kaldi::Vector feat(dim); - for (int32 d = 0; d < dim; d++) { - feat(d) = kaldi::RandGauss(); - } - kaldi::SgmmPerFrameDerivedVars frame_vars; - - std::vector gselect; - sgmm.GaussianSelection(config, feat, &gselect); - SgmmPerSpkDerivedVars empty; - SgmmPerFrameDerivedVars per_frame; - sgmm.ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike = sgmm.LogLikelihood(per_frame, 0); - - kaldi::Matrix norm_xform; - kaldi::ComputeFeatureNormalizer(sgmm.full_ubm(), &norm_xform); - AmSgmm *sgmm1 = new AmSgmm(); - sgmm1->CopyFromSgmm(sgmm, false); - sgmm1->Check(true); - sgmm1->IncreasePhoneSpaceDim(target_phn_dim, norm_xform); - sgmm1->ComputeNormalizers(); - sgmm1->Check(true); - - - sgmm1->GaussianSelection(config, feat, &gselect); - sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike1 = sgmm1->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike1, 1e-4); - - sgmm1->IncreaseSpkSpaceDim(target_spk_dim, norm_xform); - sgmm1->Check(true); - sgmm1->GaussianSelection(config, feat, &gselect); - sgmm1->ComputePerFrameVars(feat, gselect, empty, 0.0, &per_frame); - BaseFloat loglike2 = sgmm1->LogLikelihood(per_frame, 0); - kaldi::AssertEqual(loglike, loglike2, 1e-4); - delete sgmm1; -} - -void TestSgmmPreXform(const AmSgmm &sgmm) { - kaldi::Matrix xform, inv_xform; - kaldi::Vector diag_scatter; - kaldi::Vector occs(sgmm.NumPdfs()); - occs.Set(100); - sgmm.ComputeFmllrPreXform(occs, &xform, &inv_xform, &diag_scatter); - int32 dim = xform.NumRows(); - kaldi::SubMatrix a_pre(xform, 0, dim, 0, dim), - a_inv(inv_xform, 0, dim, 0, dim); - kaldi::Vector b_pre(dim), b_inv(dim); - b_pre.CopyColFromMat(xform, dim); - b_inv.CopyColFromMat(inv_xform, dim); - kaldi::Matrix res_mat(dim, dim, kaldi::kSetZero); - res_mat.AddMatMat(1.0, a_pre, kaldi::kNoTrans, a_inv, kaldi::kNoTrans, 0.0); - KALDI_ASSERT(res_mat.IsUnit(1.0e-6)); - kaldi::Vector res_vec(dim, kaldi::kSetZero); - res_vec.AddMatVec(1.0, a_inv, kaldi::kNoTrans, b_pre, 0.0); - res_vec.AddVec(1.0, b_inv); - KALDI_ASSERT(res_vec.IsZero(1.0e-6)); -} - -void UnitTestSgmm() { - size_t dim = 1 + kaldi::RandInt(0, 9); // random dimension of the gmm - size_t num_comp = 3 + kaldi::RandInt(0, 9); // random number of mixtures; - // make sure it's more than one or we get errors initializing the SGMM. - kaldi::FullGmm full_gmm; - ut::InitRandFullGmm(dim, num_comp, &full_gmm); - - size_t num_states = 1; - AmSgmm sgmm; - kaldi::SgmmGselectConfig config; - sgmm.InitializeFromFullGmm(full_gmm, num_states, dim+1, 0); - sgmm.ComputeNormalizers(); - TestSgmmInit(sgmm); - TestSgmmIO(sgmm); - TestSgmmSubstates(sgmm); - TestSgmmIncreaseDim(sgmm); - TestSgmmPreXform(sgmm); -} - -int main() { - for (int i = 0; i < 10; i++) - UnitTestSgmm(); - std::cout << "Test OK.\n"; - return 0; -} diff --git a/src/sgmm/am-sgmm.cc b/src/sgmm/am-sgmm.cc deleted file mode 100644 index 1cd7c6a2b62..00000000000 --- a/src/sgmm/am-sgmm.cc +++ /dev/null @@ -1,1395 +0,0 @@ -// sgmm/am-sgmm.cc - -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; -// Saarland University (Author: Arnab Ghoshal); -// Ondrej Glembek; Yanmin Qian; -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) -// Liang Lu; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "sgmm/am-sgmm.h" -#include "thread/kaldi-thread.h" - -namespace kaldi { -using std::vector; - -// This function needs to be added because std::generate is complaining -// about RandGauss(), which takes an optional arguments. -static inline float _RandGauss() -{ - return RandGauss(); -} - -void AmSgmm::Read(std::istream &in_stream, bool binary) { - int32 num_states, feat_dim, num_gauss; - std::string token; - - ExpectToken(in_stream, binary, ""); - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &num_states); - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &feat_dim); - KALDI_ASSERT(num_states > 0 && feat_dim > 0); - - ReadToken(in_stream, binary, &token); - - while (token != "") { - if (token == "") { - diag_ubm_.Read(in_stream, binary); - } else if (token == "") { - full_ubm_.Read(in_stream, binary); - } else if (token == "") { - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &num_gauss); - SigmaInv_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - SigmaInv_[i].Read(in_stream, binary); - } - } else if (token == "") { - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &num_gauss); - M_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - M_[i].Read(in_stream, binary); - } - } else if (token == "") { - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &num_gauss); - N_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - N_[i].Read(in_stream, binary); - } - } else if (token == "") { - w_.Read(in_stream, binary); - } else if (token == "") { - v_.resize(num_states); - for (int32 j = 0; j < num_states; j++) { - v_[j].Read(in_stream, binary); - } - } else if (token == "") { - c_.resize(num_states); - for (int32 j = 0; j < num_states; j++) { - c_[j].Read(in_stream, binary); - } - } else if (token == "") { - n_.resize(num_states); - for (int32 j = 0; j < num_states; j++) { - n_[j].Read(in_stream, binary); - } - // The following are the Gaussian prior parameters for MAP adaptation of M - // They may be moved to somewhere else eventually. - } else if (token == "") { - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &num_gauss); - M_prior_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - M_prior_[i].Read(in_stream, binary); - } - } else if (token == "") { - row_cov_inv_.Read(in_stream, binary); - } else if (token == "") { - col_cov_inv_.Read(in_stream, binary); - } else { - KALDI_ERR << "Unexpected token '" << token << "' in model file "; - } - ReadToken(in_stream, binary, &token); - } - - if (n_.empty()) { - ComputeNormalizers(); - } -} - -void AmSgmm::Write(std::ostream &out_stream, bool binary, - SgmmWriteFlagsType write_params) const { - int32 num_states = NumPdfs(), - feat_dim = FeatureDim(), - num_gauss = NumGauss(); - - WriteToken(out_stream, binary, ""); - if (!binary) out_stream << "\n"; - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, num_states); - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, feat_dim); - if (!binary) out_stream << "\n"; - - if (write_params & kSgmmBackgroundGmms) { - WriteToken(out_stream, binary, ""); - diag_ubm_.Write(out_stream, binary); - WriteToken(out_stream, binary, ""); - full_ubm_.Write(out_stream, binary); - } - - if (write_params & kSgmmGlobalParams) { - WriteToken(out_stream, binary, ""); - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, num_gauss); - if (!binary) out_stream << "\n"; - for (int32 i = 0; i < num_gauss; i++) { - SigmaInv_[i].Write(out_stream, binary); - } - WriteToken(out_stream, binary, ""); - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, num_gauss); - if (!binary) out_stream << "\n"; - for (int32 i = 0; i < num_gauss; i++) { - M_[i].Write(out_stream, binary); - } - if (N_.size() != 0) { - WriteToken(out_stream, binary, ""); - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, num_gauss); - if (!binary) out_stream << "\n"; - for (int32 i = 0; i < num_gauss; i++) { - N_[i].Write(out_stream, binary); - } - } - WriteToken(out_stream, binary, ""); - w_.Write(out_stream, binary); - - // The following are the Gaussian prior parameters for MAP adaptation of M. - // They may be moved to somewhere else eventually. - if (M_prior_.size() != 0) { - WriteToken(out_stream, binary, ""); - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, num_gauss); - if (!binary) out_stream << "\n"; - for (int32 i = 0; i < num_gauss; i++) { - M_prior_[i].Write(out_stream, binary); - } - - KALDI_ASSERT(row_cov_inv_.NumRows() != 0 && - "Empty row covariance for MAP prior"); - WriteToken(out_stream, binary, ""); - if (!binary) out_stream << "\n"; - row_cov_inv_.Write(out_stream, binary); - - KALDI_ASSERT(col_cov_inv_.NumRows() != 0 && - "Empty column covariance for MAP prior"); - WriteToken(out_stream, binary, ""); - if (!binary) out_stream << "\n"; - col_cov_inv_.Write(out_stream, binary); - } - // end priors for MAP adaptation - } - - if (write_params & kSgmmStateParams) { - WriteToken(out_stream, binary, ""); - for (int32 j = 0; j < num_states; j++) { - v_[j].Write(out_stream, binary); - } - WriteToken(out_stream, binary, ""); - for (int32 j = 0; j < num_states; j++) { - c_[j].Write(out_stream, binary); - } - } - - if (write_params & kSgmmNormalizers) { - WriteToken(out_stream, binary, ""); - if (n_.empty()) - KALDI_WARN << "Not writing normalizers since they are not present."; - else - for (int32 j = 0; j < num_states; j++) - n_[j].Write(out_stream, binary); - } - - WriteToken(out_stream, binary, ""); -} - -void AmSgmm::Check(bool show_properties) { - int32 num_states = NumPdfs(), - num_gauss = NumGauss(), - feat_dim = FeatureDim(), - phn_dim = PhoneSpaceDim(), - spk_dim = SpkSpaceDim(); - - if (show_properties) - KALDI_LOG << "AmSgmm: #states = " << num_states << ", #Gaussians = " - << num_gauss << ", feature dim = " << feat_dim - << ", phone-space dim =" << phn_dim - << ", speaker-space dim =" << spk_dim; - KALDI_ASSERT(num_states > 0 && num_gauss > 0 && feat_dim > 0 && phn_dim > 0); - - std::ostringstream debug_str; - - // First check the diagonal-covariance UBM. - KALDI_ASSERT(diag_ubm_.NumGauss() == num_gauss); - KALDI_ASSERT(diag_ubm_.Dim() == feat_dim); - - // Check the full-covariance UBM. - KALDI_ASSERT(full_ubm_.NumGauss() == num_gauss); - KALDI_ASSERT(full_ubm_.Dim() == feat_dim); - - // Check the globally-shared covariance matrices. - KALDI_ASSERT(SigmaInv_.size() == static_cast(num_gauss)); - for (int32 i = 0; i < num_gauss; i++) { - KALDI_ASSERT(SigmaInv_[i].NumRows() == feat_dim && - SigmaInv_[i](0, 0) > 0.0); // or it wouldn't be +ve definite. - } - - KALDI_ASSERT(M_.size() == static_cast(num_gauss)); - for (int32 i = 0; i < num_gauss; i++) { - KALDI_ASSERT(M_[i].NumRows() == feat_dim && M_[i].NumCols() == phn_dim); - } - - KALDI_ASSERT(w_.NumRows() == num_gauss && w_.NumCols() == phn_dim); - - { // check v, c. - KALDI_ASSERT(v_.size() == static_cast(num_states) && - c_.size() == static_cast(num_states)); - int32 nSubstatesTot = 0; - for (int32 j = 0; j < num_states; j++) { - int32 M_j = NumSubstates(j); - nSubstatesTot += M_j; - KALDI_ASSERT(M_j > 0 && v_[j].NumRows() == M_j && - c_[j].Dim() == M_j && v_[j].NumCols() == phn_dim); - } - debug_str << "Substates: "<< (nSubstatesTot) << ". "; - } - - // check n. - if (n_.size() == 0) { - debug_str << "Normalizers: no. "; - } else { - debug_str << "Normalizers: yes. "; - KALDI_ASSERT(n_.size() == static_cast(num_states)); - for (int32 j = 0; j < num_states; j++) { - KALDI_ASSERT(n_[j].NumRows() == num_gauss && - n_[j].NumCols() == NumSubstates(j)); - } - } - - if (show_properties) - KALDI_LOG << "Subspace GMM model properties: " << debug_str.str(); -} - -void AmSgmm::InitializeFromFullGmm(const FullGmm &full_gmm, - int32 num_states, - int32 phn_subspace_dim, - int32 spk_subspace_dim) { - full_ubm_.CopyFromFullGmm(full_gmm); - diag_ubm_.CopyFromFullGmm(full_gmm); - if (phn_subspace_dim < 1 || phn_subspace_dim > full_gmm.Dim() + 1) { - KALDI_WARN << "Initial phone-subspace dimension must be in [1, " - << full_gmm.Dim() + 1 << "]. Changing from " << phn_subspace_dim - << " to " << full_gmm.Dim() + 1; - phn_subspace_dim = full_gmm.Dim() + 1; - } - if (spk_subspace_dim < 0 || spk_subspace_dim > full_gmm.Dim()) { - KALDI_WARN << "Initial spk-subspace dimension must be in [1, " - << full_gmm.Dim() << "]. Changing from " << spk_subspace_dim - << " to " << full_gmm.Dim(); - spk_subspace_dim = full_gmm.Dim(); - } - w_.Resize(0, 0); - N_.clear(); - c_.clear(); - v_.clear(); - SigmaInv_.clear(); - - KALDI_LOG << "Initializing model"; - Matrix norm_xform; - ComputeFeatureNormalizer(full_gmm, &norm_xform); - InitializeMw(phn_subspace_dim, norm_xform); - if (spk_subspace_dim > 0) InitializeN(spk_subspace_dim, norm_xform); - InitializeVecs(num_states); - KALDI_LOG << "Initializing variances"; - InitializeCovars(); -} - -void AmSgmm::CopyFromSgmm(const AmSgmm &other, - bool copy_normalizers) { - KALDI_LOG << "Copying AmSgmm"; - - // Copy background GMMs - diag_ubm_.CopyFromDiagGmm(other.diag_ubm_); - full_ubm_.CopyFromFullGmm(other.full_ubm_); - - // Copy global params - SigmaInv_ = other.SigmaInv_; - M_ = other.M_; - w_ = other.w_; - N_ = other.N_; - - // Copy state-specific params, but only copy normalizers if requested. - v_ = other.v_; - c_ = other.c_; - if (copy_normalizers) n_ = other.n_; - - KALDI_LOG << "Done."; -} - -void AmSgmm::CopyGlobalsInitVecs(const AmSgmm &other, - int32 phn_subspace_dim, - int32 spk_subspace_dim, - int32 num_pdfs) { - if (phn_subspace_dim < 1 || phn_subspace_dim > other.PhoneSpaceDim()) { - KALDI_WARN << "Initial phone-subspace dimension must be in [1, " - << other.PhoneSpaceDim() << "]. Changing from " << phn_subspace_dim - << " to " << other.PhoneSpaceDim(); - phn_subspace_dim = other.PhoneSpaceDim(); - } - if (spk_subspace_dim < 0 || spk_subspace_dim > other.SpkSpaceDim()) { - KALDI_WARN << "Initial spk-subspace dimension must be in [1, " - << other.SpkSpaceDim() << "]. Changing from " << spk_subspace_dim - << " to " << other.SpkSpaceDim(); - spk_subspace_dim = other.SpkSpaceDim(); - } - - KALDI_LOG << "Initializing model"; - - // Copy background GMMs - diag_ubm_.CopyFromDiagGmm(other.diag_ubm_); - full_ubm_.CopyFromFullGmm(other.full_ubm_); - - // Copy global params - SigmaInv_ = other.SigmaInv_; - int32 num_gauss = diag_ubm_.NumGauss(), - data_dim = other.FeatureDim(); - M_.resize(num_gauss); - w_.Resize(num_gauss, phn_subspace_dim); - for (int32 i = 0; i < num_gauss; i++) { - M_[i].Resize(data_dim, phn_subspace_dim); - M_[i].CopyFromMat(other.M_[i].Range(0, data_dim, 0, phn_subspace_dim), - kNoTrans); - } - w_.CopyFromMat(other.w_.Range(0, num_gauss, 0, phn_subspace_dim), kNoTrans); - - if (spk_subspace_dim > 0) { - N_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - N_[i].Resize(data_dim, spk_subspace_dim); - N_[i].CopyFromMat(other.N_[i].Range(0, data_dim, 0, spk_subspace_dim), - kNoTrans); - } - } else { - N_.clear(); - } - InitializeVecs(num_pdfs); -} - - -void AmSgmm::ComputePerFrameVars(const VectorBase &data, - const std::vector &gselect, - const SgmmPerSpkDerivedVars &spk_vars, - BaseFloat logdet_s, - SgmmPerFrameDerivedVars *per_frame_vars) const { - KALDI_ASSERT(!n_.empty() && "ComputeNormalizers() must be called."); - - if (per_frame_vars->NeedsResizing(gselect.size(), - FeatureDim(), - PhoneSpaceDim())) - per_frame_vars->Resize(gselect.size(), FeatureDim(), PhoneSpaceDim()); - - per_frame_vars->gselect = gselect; - per_frame_vars->xt.CopyFromVec(data); - - for (int32 ki = 0, last = gselect.size(); ki < last; ki++) { - int32 i = gselect[ki]; - per_frame_vars->xti.Row(ki).CopyFromVec(per_frame_vars->xt); - if (spk_vars.v_s.Dim() != 0) - per_frame_vars->xti.Row(ki).AddVec(-1.0, spk_vars.o_s.Row(i)); - } - Vector SigmaInv_xt(FeatureDim()); - for (int32 ki = 0, last = gselect.size(); ki < last; ki++) { - int32 i = gselect[ki]; - SigmaInv_xt.AddSpVec(1.0, SigmaInv_[i], per_frame_vars->xti.Row(ki), 0.0); - // Eq (35): z_{i}(t) = M_{i}^{T} \Sigma_{i}^{-1} x_{i}(t) - per_frame_vars->zti.Row(ki).AddMatVec(1.0, M_[i], kTrans, SigmaInv_xt, 0.0); - // Eq.(36): n_{i}(t) = -0.5 x_{i}^{T} \Sigma_{i}^{-1} x_{i}(t) - per_frame_vars->nti(ki) = -0.5 * VecVec(per_frame_vars->xti.Row(ki), - SigmaInv_xt) + logdet_s; - } -} - -BaseFloat AmSgmm::LogLikelihood(const SgmmPerFrameDerivedVars &per_frame_vars, - int32 j, BaseFloat log_prune) const { - KALDI_ASSERT(j < NumPdfs()); - const vector &gselect = per_frame_vars.gselect; - - - // Eq.(37): log p(x(t), m, i|j) [indexed by j, ki] - // Although the extra memory allocation of storing this as a - // matrix might seem unnecessary, we save time in the LogSumExp() - // via more effective pruning. - Matrix logp_x(gselect.size(), NumSubstates(j)); - - for (int32 ki = 0, last = gselect.size(); ki < last; ki++) { - SubVector logp_xi(logp_x, ki); - int32 i = gselect[ki]; - // for all substates, compute z_{i}^T v_{jm} - logp_xi.AddMatVec(1.0, v_[j], kNoTrans, per_frame_vars.zti.Row(ki), 0.0); - logp_xi.AddVec(1.0, n_[j].Row(i)); // for all substates, add n_{jim} - logp_xi.Add(per_frame_vars.nti(ki)); // for all substates, add n_{i}(t) - } - // Eq. (38): log p(x(t)|j) = log \sum_{m, i} p(x(t), m, i|j) - return logp_x.LogSumExp(log_prune); -} - -BaseFloat -AmSgmm::ComponentPosteriors(const SgmmPerFrameDerivedVars &per_frame_vars, - int32 j, - Matrix *post) const { - KALDI_ASSERT(j < NumPdfs()); - if (post == NULL) KALDI_ERR << "NULL pointer passed as return argument."; - const vector &gselect = per_frame_vars.gselect; - int32 num_gselect = gselect.size(); - post->Resize(num_gselect, NumSubstates(j)); - - // Eq.(37): log p(x(t), m, i|j) = z_{i}^T v_{jm} (for all substates) - post->AddMatMat(1.0, per_frame_vars.zti, kNoTrans, v_[j], kTrans, 0.0); - for (int32 ki = 0; ki < num_gselect; ki++) { - int32 i = gselect[ki]; - // Eq. (37): log p(x(t), m, i|j) += n_{jim} + n_{i}(t) (for all substates) - post->Row(ki).AddVec(1.0, n_[j].Row(i)); - post->Row(ki).Add(per_frame_vars.nti(ki)); - } - - // Eq. (38): log p(x(t)|j) = log \sum_{m, i} p(x(t), m, i|j) - return post->ApplySoftMax(); -} - -struct SubstateCounter { - SubstateCounter(int32 j, int32 num_substates, BaseFloat occ) - : state_index(j), num_substates(num_substates), occupancy(occ) {} - - int32 state_index; - int32 num_substates; - BaseFloat occupancy; - - bool operator < (const SubstateCounter &r) const { - return occupancy/num_substates < r.occupancy/r.num_substates; - } -}; - -void AmSgmm::SplitSubstates(const Vector &state_occupancies, - int32 target_nsubstates, BaseFloat perturb, - BaseFloat power, BaseFloat max_cond) { - // power == p in document. target_nsubstates == T in document. - KALDI_ASSERT(state_occupancies.Dim() == NumPdfs()); - int32 tot_n_substates_old = 0; - int32 phn_dim = PhoneSpaceDim(); - std::priority_queue substate_counts; - vector< SpMatrix > H_i; - SpMatrix sqrt_H_sm; - Vector rand_vec(phn_dim), v_shift(phn_dim); - - for (int32 j = 0; j < NumPdfs(); j++) { - BaseFloat gamma_p = pow(state_occupancies(j), power); - substate_counts.push(SubstateCounter(j, NumSubstates(j), gamma_p)); - tot_n_substates_old += NumSubstates(j); - } - if (target_nsubstates <= tot_n_substates_old || tot_n_substates_old == 0) { - KALDI_WARN << "Cannot split from " << (tot_n_substates_old) << - " to " << (target_nsubstates) << " substates."; - return; - } - - ComputeH(&H_i); // set up that array. - ComputeSmoothingTermsFromModel(H_i, state_occupancies, &sqrt_H_sm, max_cond); - H_i.clear(); - sqrt_H_sm.ApplyPow(-0.5); - - for (int32 n_states = tot_n_substates_old; - n_states < target_nsubstates; n_states++) { - SubstateCounter state_to_split = substate_counts.top(); - substate_counts.pop(); - state_to_split.num_substates++; - substate_counts.push(state_to_split); - } - - while (!substate_counts.empty()) { - int32 j = substate_counts.top().state_index; - int32 tgt_n_substates_j = substate_counts.top().num_substates; - int32 n_substates_j = NumSubstates(j); - substate_counts.pop(); - - if (n_substates_j == tgt_n_substates_j) continue; - - // Resize v[j] and c[j] to fit new substates - Matrix tmp_v_j(v_[j]); - v_[j].Resize(tgt_n_substates_j, phn_dim); - v_[j].Range(0, n_substates_j, 0, phn_dim).CopyFromMat(tmp_v_j); - tmp_v_j.Resize(0, 0); - - Vector tmp_c_j(c_[j]); - c_[j].Resize(tgt_n_substates_j); - c_[j].Range(0, n_substates_j).CopyFromVec(tmp_c_j); - tmp_c_j.Resize(0); - - // Keep splitting substates until obtaining the desired number - for (; n_substates_j < tgt_n_substates_j; n_substates_j++) { - int32 split_substate = std::max_element(c_[j].Data(), c_[j].Data() - + n_substates_j) - c_[j].Data(); - - // c_{jkm} := c_{jmk}' := c_{jkm} / 2 - c_[j](split_substate) = c_[j](n_substates_j) = c_[j](split_substate) / 2; - - // v_{jkm} := +/- split_perturb * H_k^{(sm)}^{-0.5} * rand_vec - std::generate(rand_vec.Data(), rand_vec.Data() + rand_vec.Dim(), - _RandGauss); - v_shift.AddSpVec(perturb, sqrt_H_sm, rand_vec, 0.0); - v_[j].Row(n_substates_j).CopyFromVec(v_[j].Row(split_substate)); - v_[j].Row(n_substates_j).AddVec(1.0, v_shift); - v_[j].Row(split_substate).AddVec((-1.0), v_shift); - } - } - KALDI_LOG << "Getting rid of normalizers as they will no longer be valid"; - - n_.clear(); - KALDI_LOG << "Split " << (tot_n_substates_old) << " substates to " - << (target_nsubstates); -} - -void AmSgmm::IncreasePhoneSpaceDim(int32 target_dim, - const Matrix &norm_xform) { - KALDI_ASSERT(!M_.empty()); - int32 initial_dim = PhoneSpaceDim(), - feat_dim = FeatureDim(); - KALDI_ASSERT(norm_xform.NumRows() == feat_dim); - - if (target_dim < initial_dim) - KALDI_ERR << "You asked to increase phn dim to a value lower than the " - << " current dimension, " << target_dim << " < " << initial_dim; - - if (target_dim > initial_dim + feat_dim) { - KALDI_WARN << "Cannot increase phone subspace dimensionality from " - << initial_dim << " to " << target_dim << ", increasing to " - << initial_dim + feat_dim; - target_dim = initial_dim + feat_dim; - } - - if (initial_dim < target_dim) { - Matrix tmp_M(feat_dim, initial_dim); - for (int32 i = 0; i < NumGauss(); i++) { - tmp_M.CopyFromMat(M_[i]); - M_[i].Resize(feat_dim, target_dim); - M_[i].Range(0, feat_dim, 0, tmp_M.NumCols()).CopyFromMat(tmp_M); - M_[i].Range(0, feat_dim, tmp_M.NumCols(), - target_dim - tmp_M.NumCols()).CopyFromMat(norm_xform.Range(0, - feat_dim, 0, target_dim-tmp_M.NumCols())); - } - Matrix tmp_w = w_; - w_.Resize(tmp_w.NumRows(), target_dim); - w_.Range(0, tmp_w.NumRows(), 0, tmp_w.NumCols()).CopyFromMat(tmp_w); - - for (int32 j = 0; j < NumPdfs(); j++) { - // Resize v[j] - Matrix tmp_v_j = v_[j]; - v_[j].Resize(tmp_v_j.NumRows(), target_dim); - v_[j].Range(0, tmp_v_j.NumRows(), 0, tmp_v_j.NumCols()).CopyFromMat( - tmp_v_j); - } - KALDI_LOG << "Phone subspace dimensionality increased from " << - initial_dim << " to " << target_dim; - } else { - KALDI_LOG << "Phone subspace dimensionality unchanged, since target " << - "dimension (" << target_dim << ") <= initial dimansion (" << - initial_dim << ")"; - } -} - -void AmSgmm::IncreaseSpkSpaceDim(int32 target_dim, - const Matrix &norm_xform) { - int32 initial_dim = SpkSpaceDim(), - feat_dim = FeatureDim(); - KALDI_ASSERT(norm_xform.NumRows() == feat_dim); - - if (N_.size() == 0) - N_.resize(NumGauss()); - - if (target_dim < initial_dim) - KALDI_ERR << "You asked to increase spk dim to a value lower than the " - << " current dimension, " << target_dim << " < " << initial_dim; - - if (target_dim > initial_dim + feat_dim) { - KALDI_WARN << "Cannot increase speaker subspace dimensionality from " - << initial_dim << " to " << target_dim << ", increasing to " - << initial_dim + feat_dim; - target_dim = initial_dim + feat_dim; - } - - if (initial_dim < target_dim) { - int32 dim_change = target_dim - initial_dim; - Matrix tmp_N((initial_dim != 0) ? feat_dim : 0, - initial_dim); - for (int32 i = 0; i < NumGauss(); i++) { - if (initial_dim != 0) tmp_N.CopyFromMat(N_[i]); - N_[i].Resize(feat_dim, target_dim); - if (initial_dim != 0) { - N_[i].Range(0, feat_dim, 0, tmp_N.NumCols()).CopyFromMat(tmp_N); - } - N_[i].Range(0, feat_dim, tmp_N.NumCols(), dim_change).CopyFromMat( - norm_xform.Range(0, feat_dim, 0, dim_change)); - } - KALDI_LOG << "Speaker subspace dimensionality increased from " << - initial_dim << " to " << target_dim; - } else { - KALDI_LOG << "Speaker subspace dimensionality unchanged, since target " << - "dimension (" << target_dim << ") <= initial dimansion (" << - initial_dim << ")"; - } -} - -void AmSgmm::ComputeDerivedVars() { - if (n_.empty()) { - ComputeNormalizers(); - } - if (diag_ubm_.NumGauss() != full_ubm_.NumGauss() - || diag_ubm_.Dim() != full_ubm_.Dim()) { - diag_ubm_.CopyFromFullGmm(full_ubm_); - } -} - -class ComputeNormalizersClass: public MultiThreadable { // For multi-threaded. - public: - ComputeNormalizersClass(AmSgmm *am_sgmm, - int32 *entropy_count_ptr, - double *entropy_sum_ptr): - am_sgmm_(am_sgmm), entropy_count_ptr_(entropy_count_ptr), - entropy_sum_ptr_(entropy_sum_ptr), entropy_count_(0), - entropy_sum_(0.0) { } - - ~ComputeNormalizersClass() { - *entropy_count_ptr_ += entropy_count_; - *entropy_sum_ptr_ += entropy_sum_; - } - - inline void operator() () { - // Note: give them local copy of the sums we're computing, - // which will be propagated to original pointer in the destructor. - am_sgmm_->ComputeNormalizersInternal(num_threads_, thread_id_, - &entropy_count_, - &entropy_sum_); - } - private: - ComputeNormalizersClass() { } // Disallow empty constructor. - AmSgmm *am_sgmm_; - int32 *entropy_count_ptr_; - double *entropy_sum_ptr_; - int32 entropy_count_; - double entropy_sum_; - -}; - -void AmSgmm::ComputeNormalizers() { - KALDI_LOG << "Computing normalizers"; - n_.resize(NumPdfs()); - int32 entropy_count = 0; - double entropy_sum = 0.0; - ComputeNormalizersClass c(this, &entropy_count, &entropy_sum); - RunMultiThreaded(c); - - KALDI_LOG << "Entropy of weights in substates is " - << (entropy_sum / entropy_count) << " over " << entropy_count - << " substates, equivalent to perplexity of " - << (Exp(entropy_sum /entropy_count)); - KALDI_LOG << "Done computing normalizers"; -} - - -void AmSgmm::ComputeNormalizersInternal(int32 num_threads, int32 thread, - int32 *entropy_count, - double *entropy_sum) { - - BaseFloat DLog2pi = FeatureDim() * Log(2 * M_PI); - Vector log_det_Sigma(NumGauss()); - - for (int32 i = 0; i < NumGauss(); i++) { - try { - log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet(); - } catch(...) { - if (thread == 0) // just for one thread, print errors [else, duplicates] - KALDI_WARN << "Covariance is not positive definite, setting to unit"; - SigmaInv_[i].SetUnit(); - log_det_Sigma(i) = 0.0; - } - } - - - int block_size = (NumPdfs() + num_threads-1) / num_threads; - int j_start = thread * block_size, j_end = std::min(NumPdfs(), j_start + block_size); - - for (int32 j = j_start; j < j_end; j++) { - Matrix log_w_jm(NumSubstates(j), NumGauss()); - n_[j].Resize(NumGauss(), NumSubstates(j)); - Matrix mu_jmi(NumSubstates(j), FeatureDim()); - Matrix SigmaInv_mu(NumSubstates(j), FeatureDim()); - - // (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - log_w_jm.AddMatMat(1.0, v_[j], kNoTrans, w_, kTrans, 0.0); - for (int32 m = 0; m < NumSubstates(j); m++) { - log_w_jm.Row(m).Add(-1.0 * log_w_jm.Row(m).LogSumExp()); - { // DIAGNOSTIC CODE - (*entropy_count)++; - for (int32 i = 0; i < NumGauss(); i++) { - (*entropy_sum) -= log_w_jm(m, i) * Exp(log_w_jm(m, i)); - } - } - } - - for (int32 i = 0; i < NumGauss(); i++) { - // mu_jmi = M_{i} * v_{jm} - mu_jmi.AddMatMat(1.0, v_[j], kNoTrans, M_[i], kTrans, 0.0); - SigmaInv_mu.AddMatSp(1.0, mu_jmi, kNoTrans, SigmaInv_[i], 0.0); - - for (int32 m = 0; m < NumSubstates(j); m++) { - // mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi} - BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi.Row(m), SigmaInv_mu.Row(m)); - BaseFloat logc = Log(c_[j](m)); - - // Suggestion: Both mu_jmi and SigmaInv_mu could - // have been computed at once for i, - // if M[i] was concatenated to single matrix over i indices - - // eq.(31) - n_[j](i, m) = logc + log_w_jm(m, i) - 0.5 * (log_det_Sigma(i) + DLog2pi - + mu_SigmaInv_mu); - { // Mainly diagnostic code. Not necessary. - BaseFloat tmp = n_[j](i, m); - if (!KALDI_ISFINITE(tmp)) { // NaN or inf - KALDI_LOG << "Warning: normalizer for j = " << j << ", m = " << m - << ", i = " << i << " is infinite or NaN " << tmp << "= " - << (logc) << "+" << (log_w_jm(m, i)) << "+" << (-0.5 * - log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi) - << "+" << (mu_SigmaInv_mu) << ", setting to finite."; - n_[j](i, m) = -1.0e+40; // future work(arnab): get rid of magic number - } - } - } - } - } -} - - -void AmSgmm::ComputeNormalizersNormalized( - const std::vector< std::vector > &normalize_sets) { - { // Check sets in normalize_sets are disjoint and cover all Gaussians. - std::set all; - for (int32 i = 0; i < normalize_sets.size(); i++) - for (int32 j = 0; static_cast(j) < normalize_sets[i].size(); j++) { - int32 n = normalize_sets[i][j]; - KALDI_ASSERT(all.count(n) == 0 && n >= 0 && n < NumGauss()); - all.insert(n); - } - KALDI_ASSERT(all.size() == NumGauss()); - } - - KALDI_LOG << "Computing normalizers [normalized]"; - BaseFloat DLog2pi = FeatureDim() * Log(2 * M_PI); - Vector mu_jmi(FeatureDim()); - Vector SigmaInv_mu(FeatureDim()); - Vector log_det_Sigma(NumGauss()); - - for (int32 i = 0; i < NumGauss(); i++) { - try { - log_det_Sigma(i) = - SigmaInv_[i].LogPosDefDet(); - } catch(...) { - KALDI_WARN << "Covariance is not positive definite, setting to unit"; - SigmaInv_[i].SetUnit(); - log_det_Sigma(i) = 0.0; - } - } - - n_.resize(NumPdfs()); - for (int32 j = 0; j < NumPdfs(); j++) { - Vector log_w_jm(NumGauss()); - - n_[j].Resize(NumGauss(), NumSubstates(j)); - for (int32 m = 0; m < NumSubstates(j); m++) { - BaseFloat logc = Log(c_[j](m)); - - // (in logs): w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - log_w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0); - log_w_jm.Add((-1.0) * log_w_jm.LogSumExp()); - - for (int32 n = 0; n < normalize_sets.size(); n++) { - const std::vector &this_set(normalize_sets[n]); - double sum = 0.0; - for (int32 p = 0; p < this_set.size(); p++) - sum += Exp(log_w_jm(this_set[p])); - double offset = -Log(sum); // add "offset", to normalize weights. - for (int32 p = 0; p < this_set.size(); p++) - log_w_jm(this_set[p]) += offset; - } - - for (int32 i = 0; i < NumGauss(); i++) { - // mu_jmi = M_{i} * v_{jm} - mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0); - - // mu_{jmi} * \Sigma_{i}^{-1} * mu_{jmi} - SigmaInv_mu.AddSpVec(1.0, SigmaInv_[i], mu_jmi, 0.0); - BaseFloat mu_SigmaInv_mu = VecVec(mu_jmi, SigmaInv_mu); - - // Suggestion: Both mu_jmi and SigmaInv_mu could - // have been computed at once for i , - // if M[i] was concatenated to single matrix over i indeces - - // eq.(31) - n_[j](i, m) = logc + log_w_jm(i) - 0.5 * (log_det_Sigma(i) + DLog2pi - + mu_SigmaInv_mu); - { // Mainly diagnostic code. Not necessary. - BaseFloat tmp = n_[j](i, m); - if (!KALDI_ISFINITE(tmp)) { // NaN or inf - KALDI_LOG << "Warning: normalizer for j = " << j << ", m = " << m - << ", i = " << i << " is infinite or NaN " << tmp << "= " - << (logc) << "+" << (log_w_jm(i)) << "+" << (-0.5 * - log_det_Sigma(i)) << "+" << (-0.5 * DLog2pi) - << "+" << (mu_SigmaInv_mu) << ", setting to finite."; - n_[j](i, m) = -1.0e+40; // future work(arnab): get rid of magic number - } - } - } - } - } - - KALDI_LOG << "Done computing normalizers (normalized over subsets)"; -} - - -void AmSgmm::ComputeFmllrPreXform(const Vector &state_occs, - Matrix *xform, Matrix *inv_xform, - Vector *diag_mean_scatter) const { - int32 num_states = NumPdfs(), - num_gauss = NumGauss(), - dim = FeatureDim(); - KALDI_ASSERT(state_occs.Dim() == num_states); - - BaseFloat total_occ = state_occs.Sum(); - - // Degenerate case: unlikely to ever happen. - if (total_occ == 0) { - KALDI_WARN << "Zero probability (computing transform). Using unit " - << "pre-transform"; - xform->Resize(dim, dim + 1, kUndefined); - xform->SetUnit(); - inv_xform->Resize(dim, dim + 1, kUndefined); - inv_xform->SetUnit(); - diag_mean_scatter->Resize(dim, kSetZero); - return; - } - - // Convert state occupancies to posteriors; Eq. (B.1) - Vector state_posteriors(state_occs); - state_posteriors.Scale(1/total_occ); - - Vector mu_jmi(dim), global_mean(dim); - SpMatrix within_class_covar(dim), between_class_covar(dim); - Vector gauss_weight(num_gauss); // weights for within-class vars. - Vector w_jm(num_gauss); - BaseFloat substate_weight; - for (int32 j = 0; j < num_states; j++) { - for (int32 m = 0; m < NumSubstates(j); m++) { - // Eq. (7): w_jm = softmax([w_{1}^T ... w_{D}^T] * v_{jm}) - w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0); - w_jm.ApplySoftMax(); - - for (int32 i = 0; i < num_gauss; i++) { - substate_weight = state_posteriors(j) * c_[j](m) * w_jm(i); - mu_jmi.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0); // Eq. (6) - // Eq. (B.3): \mu_avg = \sum_{jmi} p(j) c_{jm} w_{jmi} \mu_{jmi} - global_mean.AddVec(substate_weight, mu_jmi); - // \Sigma_B = \sum_{jmi} p(j) c_{jm} w_{jmi} \mu_{jmi} \mu_{jmi}^T - between_class_covar.AddVec2(substate_weight, mu_jmi); // Eq. (B.4) - gauss_weight(i) += substate_weight; - } - } - } - between_class_covar.AddVec2(-1.0, global_mean); // Eq. (B.4) - - for (int32 i = 0; i < num_gauss; i++) { - SpMatrix Sigma(SigmaInv_[i]); - Sigma.InvertDouble(); - // Eq. (B.2): \Sigma_W = \sum_{jmi} p(j) c_{jm} w_{jmi} \Sigma_i - within_class_covar.AddSp(gauss_weight(i), Sigma); - } - - TpMatrix tmpL(dim); - Matrix tmpLInvFull(dim, dim); - tmpL.Cholesky(within_class_covar); // \Sigma_W = L L^T - tmpL.InvertDouble(); // L^{-1} - tmpLInvFull.CopyFromTp(tmpL); // get as full matrix. - - // B := L^{-1} * \Sigma_B * L^{-T} - SpMatrix tmpB(dim); - tmpB.AddMat2Sp(1.0, tmpLInvFull, kNoTrans, between_class_covar, 0.0); - - Matrix U(dim, dim); - diag_mean_scatter->Resize(dim); - xform->Resize(dim, dim + 1); - inv_xform->Resize(dim, dim + 1); - - tmpB.Eig(diag_mean_scatter, &U); // Eq. (B.5): B = U D V^T - int32 n; - if ((n = diag_mean_scatter->ApplyFloor(1.0e-04)) != 0) - KALDI_WARN << "Floored " << n << " elements of the mean-scatter matrix."; - - // Eq. (B.6): A_{pre} = U^T * L^{-1} - SubMatrix Apre(*xform, 0, dim, 0, dim); - Apre.AddMatMat(1.0, U, kTrans, tmpLInvFull, kNoTrans, 0.0); - -#ifdef KALDI_PARANOID - { - SpMatrix tmp(dim); - tmp.AddMat2Sp(1.0, Apre, kNoTrans, within_class_covar, 0.0); - KALDI_ASSERT(tmp.IsUnit(0.01)); - } - { - SpMatrix tmp(dim); - tmp.AddMat2Sp(1.0, Apre, kNoTrans, between_class_covar, 0.0); - KALDI_ASSERT(tmp.IsDiagonal(0.01)); - } -#endif - - // Eq. (B.7): b_{pre} = - A_{pre} \mu_{avg} - Vector b_pre(dim); - b_pre.AddMatVec(-1.0, Apre, kNoTrans, global_mean, 0.0); - for (int32 r = 0; r < dim; r++) { - xform->Row(r)(dim) = b_pre(r); // W_{pre} = [ A_{pre}, b_{pre} ] - } - - // Eq. (B.8) & (B.9): W_{inv} = [ A_{pre}^{-1}, \mu_{avg} ] - inv_xform->CopyFromMat(*xform); - inv_xform->Range(0, dim, 0, dim).InvertDouble(); - for (int32 r = 0; r < dim; r++) - inv_xform->Row(r)(dim) = global_mean(r); -} // End of ComputePreXform() - -template -void AmSgmm::GetNtransSigmaInv(vector< Matrix > *out) const { - KALDI_ASSERT(SpkSpaceDim() > 0 && - "Cannot compute N^{T} \\Sigma_{i}^{-1} without speaker projections."); - out->resize(NumGauss()); - Matrix tmpcov(FeatureDim(), FeatureDim()); - Matrix tmp_n(FeatureDim(), SpkSpaceDim()); - for (int32 i = 0; i < NumGauss(); i++) { - tmpcov.CopyFromSp(SigmaInv_[i]); - tmp_n.CopyFromMat(N_[i]); - (*out)[i].Resize(SpkSpaceDim(), FeatureDim()); - (*out)[i].AddMatMat(1.0, tmp_n, kTrans, tmpcov, kNoTrans, 0.0); - } -} - -// Instantiate the above template. -template -void AmSgmm::GetNtransSigmaInv(vector< Matrix > *out) const; -template -void AmSgmm::GetNtransSigmaInv(vector< Matrix > *out) const; - -/////////////////////////////////////////////////////////////////////////////// - -template -void AmSgmm::ComputeH(std::vector< SpMatrix > *H_i) const { - KALDI_ASSERT(NumGauss() != 0); - (*H_i).resize(NumGauss()); - SpMatrix H_i_tmp(PhoneSpaceDim()); - for (int32 i = 0; i < NumGauss(); i++) { - (*H_i)[i].Resize(PhoneSpaceDim()); - H_i_tmp.AddMat2Sp(1.0, M_[i], kTrans, SigmaInv_[i], 0.0); - (*H_i)[i].CopyFromSp(H_i_tmp); - } -} - -// Instantiate the template. -template -void AmSgmm::ComputeH(std::vector< SpMatrix > *H_i) const; -template -void AmSgmm::ComputeH(std::vector< SpMatrix > *H_i) const; - - -// Initializes the matrices M_{i} and w_i -void AmSgmm::InitializeMw(int32 phn_subspace_dim, - const Matrix &norm_xform) { - int32 ddim = full_ubm_.Dim(); - KALDI_ASSERT(phn_subspace_dim <= ddim + 1); - KALDI_ASSERT(phn_subspace_dim <= norm_xform.NumCols() + 1); - KALDI_ASSERT(ddim <= norm_xform.NumRows()); - - Vector mean(ddim); - int32 num_gauss = full_ubm_.NumGauss(); - w_.Resize(num_gauss, phn_subspace_dim); - M_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - full_ubm_.GetComponentMean(i, &mean); - Matrix &thisM(M_[i]); - thisM.Resize(ddim, phn_subspace_dim); - // Eq. (27): M_{i} = [ \bar{\mu}_{i} (J)_{1:D, 1:(S-1)}] - thisM.CopyColFromVec(mean, 0); - thisM.Range(0, ddim, 1, phn_subspace_dim-1).CopyFromMat( - norm_xform.Range(0, ddim, 0, phn_subspace_dim-1), kNoTrans); - } -} - -// Initializes the matrices N_{i} -void AmSgmm::InitializeN(int32 spk_subspace_dim, - const Matrix &norm_xform) { - int32 ddim = full_ubm_.Dim(); - KALDI_ASSERT(spk_subspace_dim <= ddim); - KALDI_ASSERT(spk_subspace_dim <= norm_xform.NumCols()); - KALDI_ASSERT(ddim <= norm_xform.NumRows()); - - int32 num_gauss = full_ubm_.NumGauss(); - N_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - N_[i].Resize(ddim, spk_subspace_dim); - // Eq. (28): N_{i} = [ (J)_{1:D, 1:T)}] - N_[i].CopyFromMat(norm_xform.Range(0, ddim, 0, spk_subspace_dim), kNoTrans); - } -} - -// Initializes the vectors v_{jm} -void AmSgmm::InitializeVecs(int32 num_states) { - KALDI_ASSERT(num_states >= 0); - int32 phn_subspace_dim = PhoneSpaceDim(); - KALDI_ASSERT(phn_subspace_dim > 0 && "Initialize M and w first."); - - v_.resize(num_states); - c_.resize(num_states); - for (int32 j = 0; j < num_states; j++) { - v_[j].Resize(1, phn_subspace_dim); - c_[j].Resize(1); - v_[j](0, 0) = 1.0; // Eq. (26): v_{j1} = [1 0 0 ... 0] - c_[j](0) = 1.0; // Eq. (25): c_{j1} = 1.0 - } -} - -// Initializes the within-class vars Sigma_{ki} -void AmSgmm::InitializeCovars() { - std::vector< SpMatrix > &inv_covars(full_ubm_.inv_covars()); - int32 num_gauss = full_ubm_.NumGauss(); - int32 dim = full_ubm_.Dim(); - SigmaInv_.resize(num_gauss); - for (int32 i = 0; i < num_gauss; i++) { - SigmaInv_[i].Resize(dim); - SigmaInv_[i].CopyFromSp(inv_covars[i]); - } -} - -// Compute the "smoothing" matrices from expected counts given the model. -void AmSgmm::ComputeSmoothingTermsFromModel( - const std::vector< SpMatrix > &H, - const Vector &state_occupancies, SpMatrix *H_sm, - BaseFloat max_cond) const { - int32 num_gauss = NumGauss(); - BaseFloat tot_sum = 0.0; - KALDI_ASSERT(state_occupancies.Dim() == NumPdfs()); - Vector w_jm(num_gauss); - H_sm->Resize(PhoneSpaceDim()); - H_sm->SetZero(); - Vector gamma_i(num_gauss); - gamma_i.SetZero(); - for (int32 j = 0; j < NumPdfs(); j++) { - int32 M_j = NumSubstates(j); - KALDI_ASSERT(M_j > 0); - for (int32 m = 0; m < M_j; m++) { - w_jm.AddMatVec(1.0, w_, kNoTrans, v_[j].Row(m), 0.0); - w_jm.ApplySoftMax(); - gamma_i.AddVec(state_occupancies(j) * c_[j](m), w_jm); - } - } - BaseFloat sum = 0.0; - for (int32 i = 0; i < num_gauss; i++) { - if (gamma_i(i) > 0) { - H_sm->AddSp(gamma_i(i), H[i]); - sum += gamma_i(i); - } - } - if (sum == 0.0) { - KALDI_WARN << "Sum of counts is zero. "; - // set to unit matrix--arbitrary non-singular matrix.. won't ever matter. - H_sm->SetUnit(); - } else { - H_sm->Scale(1.0 / sum); - int32 tmp = H_sm->LimitCondDouble(max_cond); - if (tmp > 0) { - KALDI_WARN << "Limited " << (tmp) << " eigenvalues of H_sm"; - } - } - tot_sum += sum; - - KALDI_LOG << "ComputeSmoothingTermsFromModel: total count is " << tot_sum; -} - -void ComputeFeatureNormalizer(const FullGmm &gmm, Matrix *xform) { - int32 dim = gmm.Dim(); - int32 num_gauss = gmm.NumGauss(); - SpMatrix within_class_covar(dim); - SpMatrix between_class_covar(dim); - Vector global_mean(dim); - - // Accumulate LDA statistics from the GMM parameters. - { - BaseFloat total_weight = 0.0; - Vector tmp_weight(num_gauss); - Matrix tmp_means; - std::vector< SpMatrix > tmp_covars; - tmp_weight.CopyFromVec(gmm.weights()); - gmm.GetCovarsAndMeans(&tmp_covars, &tmp_means); - for (int32 i = 0; i < num_gauss; i++) { - BaseFloat w_i = tmp_weight(i); - total_weight += w_i; - within_class_covar.AddSp(w_i, tmp_covars[i]); - between_class_covar.AddVec2(w_i, tmp_means.Row(i)); - global_mean.AddVec(w_i, tmp_means.Row(i)); - } - KALDI_ASSERT(total_weight > 0); - if (fabs(total_weight - 1.0) > 0.001) { - KALDI_WARN << "Total weight across the GMMs is " << (total_weight) - << ", renormalizing."; - global_mean.Scale(1.0 / total_weight); - within_class_covar.Scale(1.0 / total_weight); - between_class_covar.Scale(1.0 / total_weight); - } - between_class_covar.AddVec2(-1.0, global_mean); - } - - TpMatrix chol(dim); - chol.Cholesky(within_class_covar); // Sigma_W = L L^T - TpMatrix chol_inv(chol); - chol_inv.InvertDouble(); - Matrix chol_full(dim, dim); - chol_full.CopyFromTp(chol_inv); - SpMatrix LBL(dim); - // LBL = L^{-1} \Sigma_B L^{-T} - LBL.AddMat2Sp(1.0, chol_full, kNoTrans, between_class_covar, 0.0); - Vector Dvec(dim); - Matrix U(dim, dim); - LBL.Eig(&Dvec, &U); - SortSvd(&Dvec, &U); - - xform->Resize(dim, dim); - chol_full.CopyFromTp(chol); - // T := L U, eq (23) - xform->AddMatMat(1.0, chol_full, kNoTrans, U, kNoTrans, 0.0); - -#ifdef KALDI_PARANOID - Matrix inv_xform(*xform); - inv_xform.InvertDouble(); - { // Check that T*within_class_covar*T' = I. - Matrix wc_covar_full(dim, dim), tmp(dim, dim); - wc_covar_full.CopyFromSp(within_class_covar); - tmp.AddMatMat(1.0, inv_xform, kNoTrans, wc_covar_full, kNoTrans, 0.0); - wc_covar_full.AddMatMat(1.0, tmp, kNoTrans, inv_xform, kTrans, 0.0); - KALDI_ASSERT(wc_covar_full.IsUnit(0.01)); - } - { // Check that T*between_class_covar*T' = diagonal. - Matrix bc_covar_full(dim, dim), tmp(dim, dim); - bc_covar_full.CopyFromSp(between_class_covar); - tmp.AddMatMat(1.0, inv_xform, kNoTrans, bc_covar_full, kNoTrans, 0.0); - bc_covar_full.AddMatMat(1.0, tmp, kNoTrans, inv_xform, kTrans, 0.0); - KALDI_ASSERT(bc_covar_full.IsDiagonal(0.01)); - } -#endif -} - -void AmSgmm::ComputePerSpkDerivedVars(SgmmPerSpkDerivedVars *vars) const { - KALDI_ASSERT(vars != NULL); - if (vars->v_s.Dim() != 0) { - KALDI_ASSERT(vars->v_s.Dim() == SpkSpaceDim()); - vars->o_s.Resize(NumGauss(), FeatureDim()); - int32 num_gauss = NumGauss(); - for (int32 i = 0; i < num_gauss; i++) { - // Eqn. (32): o_i^{(s)} = N_i v^{(s)} - vars->o_s.Row(i).AddMatVec(1.0, N_[i], kNoTrans, vars->v_s, 0.0); - } - } else { - vars->o_s.Resize(0, 0); - } -} - -BaseFloat AmSgmm::GaussianSelection(const SgmmGselectConfig &config, - const VectorBase &data, - std::vector *gselect) const { - KALDI_ASSERT(diag_ubm_.NumGauss() != 0 && - diag_ubm_.NumGauss() == full_ubm_.NumGauss() && - diag_ubm_.Dim() == data.Dim()); - KALDI_ASSERT(config.diag_gmm_nbest > 0 && config.full_gmm_nbest > 0 && - config.full_gmm_nbest < config.diag_gmm_nbest); - int32 num_gauss = diag_ubm_.NumGauss(); - - std::vector< std::pair > pruned_pairs; - if (config.diag_gmm_nbest < num_gauss) { - Vector loglikes(num_gauss); - diag_ubm_.LogLikelihoods(data, &loglikes); - Vector loglikes_copy(loglikes); - BaseFloat *ptr = loglikes_copy.Data(); - std::nth_element(ptr, ptr+num_gauss-config.diag_gmm_nbest, ptr+num_gauss); - BaseFloat thresh = ptr[num_gauss-config.diag_gmm_nbest]; - for (int32 g = 0; g < num_gauss; g++) - if (loglikes(g) >= thresh) // met threshold for diagonal phase. - pruned_pairs.push_back( - std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g)); - } else { - Vector loglikes(num_gauss); - full_ubm_.LogLikelihoods(data, &loglikes); - for (int32 g = 0; g < num_gauss; g++) - pruned_pairs.push_back(std::make_pair(loglikes(g), g)); - } - KALDI_ASSERT(!pruned_pairs.empty()); - if (pruned_pairs.size() > static_cast(config.full_gmm_nbest)) { - std::nth_element(pruned_pairs.begin(), - pruned_pairs.end() - config.full_gmm_nbest, - pruned_pairs.end()); - pruned_pairs.erase(pruned_pairs.begin(), - pruned_pairs.end() - config.full_gmm_nbest); - } - Vector loglikes_tmp(pruned_pairs.size()); // for return value. - KALDI_ASSERT(gselect != NULL); - gselect->resize(pruned_pairs.size()); - // Make sure pruned Gaussians appear from best to worst. - std::sort(pruned_pairs.begin(), pruned_pairs.end(), - std::greater< std::pair >()); - for (size_t i = 0; i < pruned_pairs.size(); i++) { - loglikes_tmp(i) = pruned_pairs[i].first; - (*gselect)[i] = pruned_pairs[i].second; - } - return loglikes_tmp.LogSumExp(); -} - -BaseFloat AmSgmm::GaussianSelectionPreselect(const SgmmGselectConfig &config, - const VectorBase &data, - const std::vector &preselect, - std::vector *gselect) const { - KALDI_ASSERT(IsSortedAndUniq(preselect) && !preselect.empty()); - KALDI_ASSERT(diag_ubm_.NumGauss() != 0 && - diag_ubm_.NumGauss() == full_ubm_.NumGauss() && - diag_ubm_.Dim() == data.Dim()); - - int32 num_preselect = preselect.size(); - - KALDI_ASSERT(config.diag_gmm_nbest > 0 && config.full_gmm_nbest > 0 && - config.full_gmm_nbest < num_preselect); - - std::vector > pruned_pairs; - if (config.diag_gmm_nbest < num_preselect) { - Vector loglikes(num_preselect); - diag_ubm_.LogLikelihoodsPreselect(data, preselect, &loglikes); - Vector loglikes_copy(loglikes); - BaseFloat *ptr = loglikes_copy.Data(); - std::nth_element(ptr, ptr+num_preselect-config.diag_gmm_nbest, - ptr+num_preselect); - BaseFloat thresh = ptr[num_preselect-config.diag_gmm_nbest]; - for (int32 p = 0; p < num_preselect; p++) { - if (loglikes(p) >= thresh) { // met threshold for diagonal phase. - int32 g = preselect[p]; - pruned_pairs.push_back( - std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g)); - } - } - } else { - for (int32 p = 0; p < num_preselect; p++) { - int32 g = preselect[p]; - pruned_pairs.push_back( - std::make_pair(full_ubm_.ComponentLogLikelihood(data, g), g)); - } - } - KALDI_ASSERT(!pruned_pairs.empty()); - if (pruned_pairs.size() > static_cast(config.full_gmm_nbest)) { - std::nth_element(pruned_pairs.begin(), - pruned_pairs.end() - config.full_gmm_nbest, - pruned_pairs.end()); - pruned_pairs.erase(pruned_pairs.begin(), - pruned_pairs.end() - config.full_gmm_nbest); - } - // Make sure pruned Gaussians appear from best to worst. - std::sort(pruned_pairs.begin(), pruned_pairs.end(), - std::greater >()); - Vector loglikes_tmp(pruned_pairs.size()); // for return value. - KALDI_ASSERT(gselect != NULL); - gselect->resize(pruned_pairs.size()); - for (size_t i = 0; i < pruned_pairs.size(); i++) { - loglikes_tmp(i) = pruned_pairs[i].first; - (*gselect)[i] = pruned_pairs[i].second; - } - return loglikes_tmp.LogSumExp(); -} - - - -void SgmmGauPost::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - int32 T = this->size(); - WriteBasicType(os, binary, T); - for (int32 t = 0; t < T; t++) { - WriteToken(os, binary, ""); - WriteIntegerVector(os, binary, (*this)[t].gselect); - WriteToken(os, binary, ""); - WriteIntegerVector(os, binary, (*this)[t].tids); - KALDI_ASSERT((*this)[t].tids.size() == (*this)[t].posteriors.size()); - for (size_t i = 0; i < (*this)[t].posteriors.size(); i++) { - (*this)[t].posteriors[i].Write(os, binary); - } - } - WriteToken(os, binary, ""); -} - -void SgmmGauPost::Read(std::istream &is, bool binary) { - ExpectToken(is, binary, ""); - int32 T; - ReadBasicType(is, binary, &T); - KALDI_ASSERT(T >= 0); - this->resize(T); - for (int32 t = 0; t < T; t++) { - ExpectToken(is, binary, ""); - ReadIntegerVector(is, binary, &((*this)[t].gselect)); - ExpectToken(is, binary, ""); - ReadIntegerVector(is, binary, &((*this)[t].tids)); - size_t sz = (*this)[t].tids.size(); - (*this)[t].posteriors.resize(sz); - for (size_t i = 0; i < sz; i++) - (*this)[t].posteriors[i].Read(is, binary); - } - ExpectToken(is, binary, ""); -} - - -void AmSgmmFunctions::ComputeDistances(const AmSgmm &model, - const Vector &state_occs, - MatrixBase *dists) { - int32 num_states = model.NumPdfs(), - phn_space_dim = model.PhoneSpaceDim(), - num_gauss = model.NumGauss(); - KALDI_ASSERT(dists != NULL && dists->NumRows() == num_states - && dists->NumCols() == num_states); - Vector prior(state_occs); - KALDI_ASSERT(prior.Sum() != 0.0); - prior.Scale(1.0 / prior.Sum()); // Normalize. - SpMatrix H(phn_space_dim); // The same as H_sm in some other code. - for (int32 i = 0; i < num_gauss; i++) { - SpMatrix Hi(phn_space_dim); - Hi.AddMat2Sp(1.0, model.M_[i], kTrans, model.SigmaInv_[i], 0.0); - H.AddSp(prior(i), Hi); - } - bool warned = false; - for (int32 j1 = 0; j1 < num_states; ++j1) { - if (model.NumSubstates(j1) != 1 && !warned) { - KALDI_WARN << "ComputeDistances() can only give meaningful output if you " - << "have one substate per state."; - warned = true; - } - for (int32 j2 = 0; j2 <= j1; ++j2) { - Vector v_diff(model.v_[j1].Row(0)); - v_diff.AddVec(-1.0, model.v_[j2].Row(0)); - (*dists)(j1, j2) = (*dists)(j2, j1) = VecSpVec(v_diff, H, v_diff); - } - } -} - -} // namespace kaldi diff --git a/src/sgmm/am-sgmm.h b/src/sgmm/am-sgmm.h deleted file mode 100644 index 229b1b4811f..00000000000 --- a/src/sgmm/am-sgmm.h +++ /dev/null @@ -1,420 +0,0 @@ -// sgmm/am-sgmm.h - -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; -// Saarland University (Author: Arnab Ghoshal); -// Ondrej Glembek; Yanmin Qian; -// Copyright 2012-2013 Johns Hopkins University (author: Daniel Povey) -// Liang Lu; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_SGMM_AM_SGMM_H_ -#define KALDI_SGMM_AM_SGMM_H_ - -#include - -#include "base/kaldi-common.h" -#include "matrix/matrix-lib.h" -#include "gmm/model-common.h" -#include "gmm/diag-gmm.h" -#include "gmm/full-gmm.h" -#include "itf/options-itf.h" -#include "util/table-types.h" - -namespace kaldi { - -struct SgmmGselectConfig { - /// Number of highest-scoring full-covariance Gaussians per frame. - int32 full_gmm_nbest; - /// Number of highest-scoring diagonal-covariance Gaussians per frame. - int32 diag_gmm_nbest; - - SgmmGselectConfig() { - full_gmm_nbest = 15; - diag_gmm_nbest = 50; - } - - void Register(OptionsItf *opts) { - opts->Register("full-gmm-nbest", &full_gmm_nbest, "Number of highest-scoring" - " full-covariance Gaussians selected per frame."); - opts->Register("diag-gmm-nbest", &diag_gmm_nbest, "Number of highest-scoring" - " diagonal-covariance Gaussians selected per frame."); - } -}; - -/** \struct SgmmPerFrameDerivedVars - * Holds the per-frame precomputed quantities x(t), x_{i}(t), z_{i}(t), and - * n_{i}(t) (cf. Eq. (33)-(36)) for the SGMM, as well as the cached Gaussian - * selection records. - */ -struct SgmmPerFrameDerivedVars { - std::vector gselect; - Vector xt; ///< x'(t), FMLLR-adapted, dim = [D], eq.(33) - Matrix xti; ///< x_{i}(t) = x'(t) - o_i(s): dim = [I][D], eq.(34) - Matrix zti; ///< z_{i}(t), dim = [I][S], eq.(35) - Vector nti; ///< n_{i}(t), dim = [I], eq.(36) - - SgmmPerFrameDerivedVars() : xt(0), xti(0, 0), zti(0, 0), nti(0) {} - void Resize(int32 ngauss, int32 feat_dim, int32 phn_dim) { - xt.Resize(feat_dim); - xti.Resize(ngauss, feat_dim); - zti.Resize(ngauss, phn_dim); - nti.Resize(ngauss); - } - bool IsEmpty() const { - return (xt.Dim() == 0 || xti.NumRows() == 0 || zti.NumRows() == 0 - || nti.Dim() == 0); - } - bool NeedsResizing(int32 ngauss, int32 feat_dim, int32 phn_dim) const { - /* if (xt.Dim() != feat_dim) - KALDI_LOG << "xt dim = " << xt.Dim() << ", feat dim = " << feat_dim; - if (xti.NumRows() != ngauss || xti.NumCols() != feat_dim) - KALDI_LOG << "xti size = " << xti.NumRows() << ", " << xti.NumCols() - << "; ngauss = " << ngauss << ", feat dim = " << feat_dim; - if (zti.NumRows() != ngauss || zti.NumCols() != phn_dim) - KALDI_LOG << "zti size = " << zti.NumRows() << ", " << zti.NumCols() - << "; ngauss = " << ngauss << "; phn dim = " << phn_dim; - if (nti.Dim() != ngauss) - KALDI_LOG << "nti dim = " << nti.Dim() << ", ngauss = " << ngauss; - */ - return (xt.Dim() != feat_dim || xti.NumRows() != ngauss - || xti.NumCols() != feat_dim || zti.NumRows() != ngauss - || zti.NumCols() != phn_dim || nti.Dim() != ngauss); - } -}; - - -struct SgmmPerSpkDerivedVars { - // To set this up, call ComputePerSpkDerivedVars from the sgmm object. - void Clear() { - v_s.Resize(0); - o_s.Resize(0, 0); - } - Vector v_s; ///< Speaker adaptation vector v_^{(s)}. Dim is [T] - Matrix o_s; ///< Per-speaker offsets o_{i}. Dimension is [I][D] -}; - - -/** \class AmSgmm - * Class for definition of the subspace Gmm acoustic model - */ -class AmSgmm { - public: - AmSgmm() {} - void Read(std::istream &rIn, bool binary); - void Write(std::ostream &out, bool binary, - SgmmWriteFlagsType write_params) const; - - /// Checks the various components for correct sizes. With wrong sizes, - /// assertion failure occurs. When the argument is set to true, dimensions of - /// the various components are printed. - void Check(bool show_properties = true); - - /// Initializes the SGMM parameters from a full-covariance UBM. - void InitializeFromFullGmm(const FullGmm &gmm, int32 num_states, - int32 phn_subspace_dim, int32 spk_subspace_dim); - - /// Used to copy models (useful in update) - void CopyFromSgmm(const AmSgmm &other, bool copy_normalizers); - - /// Copies the global parameters from the supplied model, but sets - /// the state vectors to zero. Supports reducing the phonetic - /// and speaker subspace dimensions. - void CopyGlobalsInitVecs(const AmSgmm &other, int32 phn_subspace_dim, - int32 spk_subspace_dim, int32 num_pdfs); - - /// Computes the top-scoring Gaussian indices (used for pruning of later - /// stages of computation). Returns frame log-likelihood given selected - /// Gaussians from full UBM. - BaseFloat GaussianSelection(const SgmmGselectConfig &config, - const VectorBase &data, - std::vector *gselect) const; - - /// As GaussianSelection, but limiting it to a provided list of - /// preselected Gaussians (e.g. for gender dependency). - /// The list "preselect" must be sorted and uniq. - BaseFloat GaussianSelectionPreselect(const SgmmGselectConfig &config, - const VectorBase &data, - const std::vector &preselect, - std::vector *gselect) const; - - /// This needs to be called with each new frame of data, prior to accumulation - /// or likelihood evaluation: it computes various pre-computed quantities. The - /// 'logdet_s' term is the log determinant of the FMLLR transform, or 0.0 if - /// no FMLLR is used or it's single-class fMLLR applied in the feature - /// extraction, and we're not keeping track of it here. - void ComputePerFrameVars(const VectorBase &data, - const std::vector &gselect, - const SgmmPerSpkDerivedVars &spk_vars, - BaseFloat logdet_s, - SgmmPerFrameDerivedVars *per_frame_vars) const; - - /// Computes the per-speaker derived vars; assumes vars->v_s is already - /// set up. - void ComputePerSpkDerivedVars(SgmmPerSpkDerivedVars *vars) const; - - /// This does a likelihood computation for a given state using the - /// top-scoring Gaussian components (in per_frame_vars). If the - /// log_prune parameter is nonzero (e.g. 5.0), the LogSumExp() stage is - /// pruned, which is a significant speedup... smaller values are faster. - BaseFloat LogLikelihood(const SgmmPerFrameDerivedVars &per_frame_vars, - int32 state_index, BaseFloat log_prune = 0.0) const; - - /// Similar to LogLikelihood() function above, but also computes the posterior - /// probabilities for the top-scoring Gaussian components and all substates. - BaseFloat ComponentPosteriors(const SgmmPerFrameDerivedVars &per_frame_vars, - int32 state, Matrix *post) const; - - /// Increases the total number of substates based on the state occupancies. - void SplitSubstates(const Vector &state_occupancies, - int32 target_nsubstates, - BaseFloat perturb, - BaseFloat power, - BaseFloat cond); - - /// Functions for increasing the phonetic and speaker space dimensions. - /// The argument norm_xform is a LDA-like feature normalizing transform, - /// computed by the ComputeFeatureNormalizer function. - void IncreasePhoneSpaceDim(int32 target_dim, - const Matrix &norm_xform); - void IncreaseSpkSpaceDim(int32 target_dim, - const Matrix &norm_xform); - - /// Computes (and initializes if necessary) derived vars... - /// for now this is just the normalizers "n" and the diagonal UBM. - void ComputeDerivedVars(); - - /// Computes the data-independent terms in the log-likelihood computation - /// for each Gaussian component and all substates. Eq. (31) - void ComputeNormalizers(); - - /// Computes the normalizers, while normalizing the weights to one - /// among each of the sets in "normalize_sets": these sets should - /// be disjoint and their union should be all the indices 0 ... I-1. - void ComputeNormalizersNormalized( - const std::vector< std::vector > &normalize_sets); - - /// Computes the LDA-like pre-transform and its inverse as well as the - /// eigenvalues of the scatter of the means used in FMLLR estimation. - void ComputeFmllrPreXform(const Vector &state_occs, - Matrix *xform, - Matrix *inv_xform, - Vector *diag_mean_scatter) const; - - /// Various model dimensions. - int32 NumPdfs() const { return c_.size(); } - int32 NumSubstates(int32 j) const { return c_[j].Dim(); } - int32 NumGauss() const { return M_.size(); } - int32 PhoneSpaceDim() const { return w_.NumCols(); } - int32 SpkSpaceDim() const { return (N_.size() > 0) ? N_[0].NumCols() : 0; } - int32 FeatureDim() const { return M_[0].NumRows(); } - - void RemoveSpeakerSpace() { N_.clear(); } - - /// Accessors - const FullGmm & full_ubm() const { return full_ubm_; } - const DiagGmm & diag_ubm() const { return diag_ubm_; } - - const Matrix& StateVectors(int32 state_index) const { - return v_[state_index]; - } - const SpMatrix& GetInvCovars(int32 gauss_index) const { - return SigmaInv_[gauss_index]; - } - const Matrix& GetPhoneProjection(int32 gauss_index) const { - return M_[gauss_index]; - } - - /// Templated accessors (used to accumulate in different precision) - template - void GetInvCovars(int32 gauss_index, SpMatrix *out) const; - - template - void GetSubstateMean(int32 j, int32 m, int32 i, - VectorBase *mean_out) const; - - template - void GetSubstateSpeakerMean(int32 state, int32 substate, int32 gauss, - const SgmmPerSpkDerivedVars &spk, - VectorBase *mean_out) const; - - template - void GetVarScaledSubstateSpeakerMean(int32 state, int32 substate, - int32 gauss, - const SgmmPerSpkDerivedVars &spk, - VectorBase *mean_out) const; - - template - void GetNtransSigmaInv(std::vector< Matrix > *out) const; - - /// Computes quantities H = M_i Sigma_i^{-1} M_i^T. - template - void ComputeH(std::vector< SpMatrix > *H_i) const; - - protected: - friend class ComputeNormalizersClass; - private: - /// Compute a subset of normalizers; used in multi-threaded implementation. - void ComputeNormalizersInternal(int32 num_threads, int32 thread, - int32 *entropy_count, double *entropy_sum); - - - /// Initializes the matrices M_ and w_ - void InitializeMw(int32 phn_subspace_dim, - const Matrix &norm_xform); - /// Initializes the matrices N_ - void InitializeN(int32 spk_subspace_dim, const Matrix &norm_xform); - void InitializeVecs(int32 num_states); ///< Initializes the state-vectors. - void InitializeCovars(); ///< initializes the within-class covariances. - - void ComputeSmoothingTermsFromModel( - const std::vector< SpMatrix > &H, - const Vector &state_occupancies, SpMatrix *H_sm, - BaseFloat max_cond) const; - - private: - /// These contain the "background" model associated with the subspace GMM. - DiagGmm diag_ubm_; - FullGmm full_ubm_; - - /// Globally shared parameters of the subspace GMM. - /// The various quantities are: I = number of Gaussians, D = data dimension, - /// S = phonetic subspace dimension, T = speaker subspace dimension, - /// J = number of states, M_{j} = number of substates of state j. - - /// Inverse within-class (full) covariances; dim is [I][D][D]. - std::vector< SpMatrix > SigmaInv_; - /// Phonetic-subspace projections. Dimension is [I][D][S] - std::vector< Matrix > M_; - /// Speaker-subspace projections. Dimension is [I][D][T] - std::vector< Matrix > N_; - /// Weight projection vectors. Dimension is [I][S] - Matrix w_; - - /// The parameters in a particular SGMM state. - - /// v_{jm}, per-state phonetic-subspace vectors. Dimension is [J][M_{j}][S]. - std::vector< Matrix > v_; - /// c_{jm}, mixture weights. Dimension is [J][M_{j}] - std::vector< Vector > c_; - /// n_{jim}, per-Gaussian normalizer. Dimension is [J][I][M_{j}] - std::vector< Matrix > n_; - - // Priors for MAP adaptation of M -- keeping them here for now but they may - // be moved somewhere else eventually - // These are parameters of a matrix-variate normal distribution. The means are - // the unadapted M_i, and we have 2 separate covaraince matrices for the rows - // and columns of M. - std::vector< Matrix > M_prior_; // Matrix-variate Gaussian mean - SpMatrix row_cov_inv_; - SpMatrix col_cov_inv_; - - KALDI_DISALLOW_COPY_AND_ASSIGN(AmSgmm); - friend class EbwAmSgmmUpdater; - friend class MleAmSgmmUpdater; - friend class MleSgmmSpeakerAccs; - friend class AmSgmmFunctions; // misc functions that need access. - friend class MleAmSgmmUpdaterMulti; -}; - -template -inline void AmSgmm::GetInvCovars(int32 gauss_index, - SpMatrix *out) const { - out->Resize(SigmaInv_[gauss_index].NumRows(), kUndefined); - out->CopyFromSp(SigmaInv_[gauss_index]); -} - -template -inline void AmSgmm::GetSubstateMean(int32 j, int32 m, int32 i, - VectorBase *mean_out) const { - KALDI_ASSERT(mean_out != NULL); - KALDI_ASSERT(j < NumPdfs() && m < NumSubstates(j) && i < NumGauss()); - KALDI_ASSERT(mean_out->Dim() == FeatureDim()); - Vector mean_tmp(FeatureDim()); - mean_tmp.AddMatVec(1.0, M_[i], kNoTrans, v_[j].Row(m), 0.0); - mean_out->CopyFromVec(mean_tmp); -} - - -template -inline void AmSgmm::GetSubstateSpeakerMean(int32 j, int32 m, int32 i, - const SgmmPerSpkDerivedVars &spk, - VectorBase *mean_out) const { - GetSubstateMean(j, m, i, mean_out); - if (spk.v_s.Dim() != 0) // have speaker adaptation... - mean_out->AddVec(1.0, spk.o_s.Row(i)); -} - -template -void AmSgmm::GetVarScaledSubstateSpeakerMean(int32 j, int32 m, int32 i, - const SgmmPerSpkDerivedVars &spk, - VectorBase *mean_out) const { - Vector tmp_mean(mean_out->Dim()), tmp_mean2(mean_out->Dim()); - GetSubstateSpeakerMean(j, m, i, spk, &tmp_mean); - tmp_mean2.AddSpVec(1.0, SigmaInv_[i], tmp_mean, 0.0); - mean_out->CopyFromVec(tmp_mean2); -} - - -/// Computes the inverse of an LDA transform (without dimensionality reduction) -/// The computed transform is used in initializing the phonetic and speaker -/// subspaces, as well as while increasing the dimensions of those spaces. -void ComputeFeatureNormalizer(const FullGmm &gmm, Matrix *xform); - - -/// This is the entry for a single time. -struct SgmmGauPostElement { - // Need gselect info here, since "posteriors" is relative to this set of - // selected Gaussians. - std::vector gselect; - std::vector tids; // transition-ids for each entry in "posteriors" - std::vector > posteriors; -}; - - -/// indexed by time. -class SgmmGauPost: public std::vector { - public: - // Add the standard Kaldi Read and Write routines so - // we can use KaldiObjectHolder with this type. - explicit SgmmGauPost(size_t i) : std::vector(i) {} - SgmmGauPost() {} - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); -}; - -typedef KaldiObjectHolder SgmmGauPostHolder; -typedef RandomAccessTableReader RandomAccessSgmmGauPostReader; -typedef SequentialTableReader SequentialSgmmGauPostReader; -typedef TableWriter SgmmGauPostWriter; - -/// Class for misc functions that need access to SGMM private variables. -class AmSgmmFunctions { - public: - /// Computes matrix of approximated K-L divergences, - /// of size [#states x #states], as described in - /// "State-Level Data Borrowing for Low-Resource Speech Recognition based on - /// Subspace GMMs", by Yanmin Qian et. al, Interspeech 2011. - /// Model must have one substate per state. - static void ComputeDistances(const AmSgmm &model, - const Vector &state_occs, - MatrixBase *dists); -}; - -} // namespace kaldi - - -#endif // KALDI_SGMM_AM_SGMM_H_ diff --git a/src/sgmm/decodable-am-sgmm.cc b/src/sgmm/decodable-am-sgmm.cc deleted file mode 100644 index a654d557781..00000000000 --- a/src/sgmm/decodable-am-sgmm.cc +++ /dev/null @@ -1,72 +0,0 @@ -// sgmm/decodable-am-sgmm.cc - -// Copyright 2009-2011 Saarland University; Lukas Burget - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::vector; - -#include "sgmm/decodable-am-sgmm.h" - -namespace kaldi { - -BaseFloat DecodableAmSgmm::LogLikelihoodZeroBased(int32 frame, int32 pdf_id) { - KALDI_ASSERT(frame >= 0 && frame < NumFramesReady()); - KALDI_ASSERT(pdf_id >= 0 && pdf_id < NumIndices()); - - if (log_like_cache_[pdf_id].hit_time == frame) { - return log_like_cache_[pdf_id].log_like; // return cached value, if found - } - - const VectorBase &data = feature_matrix_.Row(frame); - // check if everything is in order - if (acoustic_model_.FeatureDim() != data.Dim()) { - KALDI_ERR << "Dim mismatch: data dim = " << data.Dim() - << "vs. model dim = " << acoustic_model_.FeatureDim(); - } - - if (frame != previous_frame_) { // Per-frame precomputation for SGMM. - if (gselect_all_.empty()) - acoustic_model_.GaussianSelection(sgmm_config_, data, &gselect_); - else { - KALDI_ASSERT(frame < gselect_all_.size()); - gselect_ = gselect_all_[frame]; - } - acoustic_model_.ComputePerFrameVars(data, gselect_, spk_, - 0.0 /*FMLLR logdet*/, &per_frame_vars_); - previous_frame_ = frame; - } - - BaseFloat loglike = acoustic_model_.LogLikelihood(per_frame_vars_, pdf_id, - log_prune_); - if (KALDI_ISNAN(loglike) || KALDI_ISINF(loglike)) - KALDI_ERR << "Invalid answer (overflow or invalid variances/features?)"; - log_like_cache_[pdf_id].log_like = loglike; - log_like_cache_[pdf_id].hit_time = frame; - return loglike; -} - -void DecodableAmSgmm::ResetLogLikeCache() { - if (log_like_cache_.size() != acoustic_model_.NumPdfs()) { - log_like_cache_.resize(acoustic_model_.NumPdfs()); - } - vector::iterator it = log_like_cache_.begin(), - end = log_like_cache_.end(); - for (; it != end; ++it) { it->hit_time = -1; } -} - -} // namespace kaldi diff --git a/src/sgmm/decodable-am-sgmm.h b/src/sgmm/decodable-am-sgmm.h deleted file mode 100644 index f5f21732a3a..00000000000 --- a/src/sgmm/decodable-am-sgmm.h +++ /dev/null @@ -1,119 +0,0 @@ -// sgmm/decodable-am-sgmm.h - -// Copyright 2009-2011 Saarland University; Microsoft Corporation; -// Lukas Burget - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_SGMM_DECODABLE_AM_SGMM_H_ -#define KALDI_SGMM_DECODABLE_AM_SGMM_H_ - -#include - -#include "base/kaldi-common.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "itf/decodable-itf.h" - -namespace kaldi { - -class DecodableAmSgmm : public DecodableInterface { - public: - DecodableAmSgmm(const SgmmGselectConfig &opts, - const AmSgmm &am, - const SgmmPerSpkDerivedVars &spk, // may be empty - const TransitionModel &tm, - const Matrix &feats, - const std::vector > &gselect_all, - BaseFloat log_prune): // gselect_all may be empty - acoustic_model_(am), sgmm_config_(opts), spk_(spk), - trans_model_(tm), feature_matrix_(feats), - gselect_all_(gselect_all), previous_frame_(-1), - log_prune_(log_prune) { - ResetLogLikeCache(); - } - - // Note, frames are numbered from zero, but transition indices are 1-based! - // This is for compatibility with OpenFST. - virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - return LogLikelihoodZeroBased(frame, trans_model_.TransitionIdToPdf(tid)); - } - int32 NumFramesReady() const { return feature_matrix_.NumRows(); } - virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - - virtual bool IsLastFrame(int32 frame) const { - KALDI_ASSERT(frame < NumFramesReady()); - return (frame == NumFramesReady() - 1); - } - - protected: - void ResetLogLikeCache(); - virtual BaseFloat LogLikelihoodZeroBased(int32 frame, int32 pdf_id); - - const AmSgmm &acoustic_model_; - const SgmmGselectConfig &sgmm_config_; - const SgmmPerSpkDerivedVars &spk_; - const TransitionModel &trans_model_; ///< for tid to pdf mapping - const Matrix &feature_matrix_; - const std::vector > gselect_all_; ///< if nonempty, - ///< precomputed gaussian indices. - int32 previous_frame_; - BaseFloat log_prune_; - - /// Defines a cache record for a state - struct LikelihoodCacheRecord { - BaseFloat log_like; ///< Cache value - int32 hit_time; ///< Frame for which this value is relevant - }; - - /// Cached per-frame quantities used in SGMM likelihood computation. - std::vector log_like_cache_; - std::vector gselect_; - SgmmPerFrameDerivedVars per_frame_vars_; - - private: - KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmm); -}; - -class DecodableAmSgmmScaled : public DecodableAmSgmm { - public: - DecodableAmSgmmScaled(const SgmmGselectConfig &opts, - const AmSgmm &am, - const SgmmPerSpkDerivedVars &spk, // may be empty - const TransitionModel &tm, - const Matrix &feats, - const std::vector > &gselect_all, - // gselect_all may be empty - BaseFloat log_prune, - BaseFloat scale) - : DecodableAmSgmm(opts, am, spk, tm, feats, gselect_all, log_prune), - scale_(scale) {} - - // Note, frames are numbered from zero but transition-ids from one. - virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - return LogLikelihoodZeroBased(frame, trans_model_.TransitionIdToPdf(tid)) - * scale_; - } - - private: - BaseFloat scale_; - KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmmScaled); -}; - - -} // namespace kaldi - -#endif // KALDI_SGMM_DECODABLE_AM_SGMM_H_ diff --git a/src/sgmm/estimate-am-sgmm-ebw.cc b/src/sgmm/estimate-am-sgmm-ebw.cc deleted file mode 100644 index 74b79694ec8..00000000000 --- a/src/sgmm/estimate-am-sgmm-ebw.cc +++ /dev/null @@ -1,654 +0,0 @@ -// sgmm/estimate-am-sgmm-ebw.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "sgmm/estimate-am-sgmm-ebw.h" -#include "thread/kaldi-thread.h" -using std::vector; - -namespace kaldi { - -void EbwAmSgmmUpdater::Update(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - SgmmUpdateFlagsType flags, - BaseFloat *auxf_change_out, - BaseFloat *count_out) { - - KALDI_ASSERT((flags & (kSgmmPhoneVectors | kSgmmPhoneProjections | - kSgmmPhoneWeightProjections | kSgmmCovarianceMatrix | - kSgmmSubstateWeights | kSgmmSpeakerProjections)) != 0); - - // Various quantities need to be computed at the start, before we - // change any of the model parameters. - std::vector< SpMatrix > Q_num, Q_den, H, S_means; - - if (flags & kSgmmPhoneProjections) { - MleAmSgmmUpdater::ComputeQ(num_accs, *model, &Q_num); - MleAmSgmmUpdater::ComputeQ(den_accs, *model, &Q_den); - } - if (flags & kSgmmCovarianceMatrix) { // compute the difference between - // the num and den S_means matrices... this is what we will need. - MleAmSgmmUpdater::ComputeSMeans(num_accs, *model, &S_means); - std::vector< SpMatrix > S_means_tmp; - MleAmSgmmUpdater::ComputeSMeans(den_accs, *model, &S_means_tmp); - for (size_t i = 0; i < S_means.size(); i++) - S_means[i].AddSp(-1.0, S_means_tmp[i]); - } - if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections)) - model->ComputeH(&H); - - BaseFloat tot_impr = 0.0; - - if (flags & kSgmmPhoneVectors) - tot_impr += UpdatePhoneVectors(num_accs, den_accs, model, H); - - if (flags & kSgmmPhoneProjections) - tot_impr += UpdateM(num_accs, den_accs, Q_num, Q_den, model); - - if (flags & kSgmmPhoneWeightProjections) - tot_impr += UpdateWParallel(num_accs, den_accs, model); - - if (flags & kSgmmCovarianceMatrix) - tot_impr += UpdateVars(num_accs, den_accs, S_means, model); - - if (flags & kSgmmSubstateWeights) - tot_impr += UpdateSubstateWeights(num_accs, den_accs, model); - - if (flags & kSgmmSpeakerProjections) - tot_impr += UpdateN(num_accs, den_accs, model); - - - if (auxf_change_out) *auxf_change_out = tot_impr * num_accs.total_frames_; - if (count_out) *count_out = num_accs.total_frames_; - - if (fabs(num_accs.total_frames_ - den_accs.total_frames_) > - 0.01*(num_accs.total_frames_ + den_accs.total_frames_)) - KALDI_WARN << "Num and den frame counts differ, " - << num_accs.total_frames_ << " vs. " << den_accs.total_frames_; - - BaseFloat like_diff = num_accs.total_like_ - den_accs.total_like_; - - KALDI_LOG << "***Averaged differenced likelihood per frame is " - << (like_diff/num_accs.total_frames_) - << " over " << (num_accs.total_frames_) << " frames."; - KALDI_LOG << "***Note: for this to be at all meaningful, if you use " - << "\"canceled\" stats you will have to renormalize this over " - << "the \"real\" frame count."; - - model->ComputeNormalizers(); -} - - -class EbwUpdatePhoneVectorsClass: public MultiThreadable { // For multi-threaded. - public: - EbwUpdatePhoneVectorsClass(const EbwAmSgmmUpdater *updater, - const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - const std::vector > &H, - double *auxf_impr): - updater_(updater), num_accs_(num_accs), den_accs_(den_accs), - model_(model), H_(H), auxf_impr_ptr_(auxf_impr), auxf_impr_(0.0) { } - - ~EbwUpdatePhoneVectorsClass() { - *auxf_impr_ptr_ += auxf_impr_; - } - - inline void operator() () { - // Note: give them local copy of the sums we're computing, - // which will be propagated to the total sums in the destructor. - updater_->UpdatePhoneVectorsInternal(num_accs_, den_accs_, model_, H_, - &auxf_impr_, num_threads_, thread_id_); - } - private: - const EbwAmSgmmUpdater *updater_; - const MleAmSgmmAccs &num_accs_; - const MleAmSgmmAccs &den_accs_; - AmSgmm *model_; - const std::vector > &H_; - double *auxf_impr_ptr_; - double auxf_impr_; -}; - - -void EbwAmSgmmUpdater::ComputePhoneVecStats( - const MleAmSgmmAccs &accs, - const AmSgmm &model, - const std::vector > &H, - int32 j, - int32 m, - const Vector &w_jm, - double gamma_jm, - Vector *g_jm, - SpMatrix *H_jm) { - g_jm->CopyFromVec(accs.y_[j].Row(m)); - for (int32 i = 0; i < accs.num_gaussians_; i++) { - double gamma_jmi = accs.gamma_[j](m, i); - double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i)); - double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term - * VecVec(model.w_.Row(i), model.v_[j].Row(m)); - g_jm->AddVec(scalar, model.w_.Row(i)); - if (gamma_jmi != 0.0) - H_jm->AddSp(gamma_jmi, H[i]); // The most important term.. - if (quadratic_term > 1.0e-10) - H_jm->AddVec2(static_cast(quadratic_term), model.w_.Row(i)); - } -} - - -// Runs the phone vectors update for a subset of states (called -// multi-threaded). -void EbwAmSgmmUpdater::UpdatePhoneVectorsInternal( - const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - const std::vector > &H, - double *auxf_impr, - int32 num_threads, - int32 thread_id) const { - - int32 block_size = (num_accs.num_states_ + (num_threads-1)) / num_threads, - j_start = block_size * thread_id, - j_end = std::min(num_accs.num_states_, j_start + block_size); - - int32 S = num_accs.phn_space_dim_, I = num_accs.num_gaussians_; - - for (int32 j = j_start; j < j_end; j++) { - double num_state_count = 0.0, - state_auxf_impr = 0.0; - Vector w_jm(I); - for (int32 m = 0; m < model->NumSubstates(j); m++) { - double gamma_jm_num = num_accs.gamma_[j].Row(m).Sum(); - double gamma_jm_den = den_accs.gamma_[j].Row(m).Sum(); - num_state_count += gamma_jm_num; - Vector g_jm_num(S); // computed using eq. 58 of SGMM paper [for numerator stats] - SpMatrix H_jm_num(S); // computed using eq. 59 of SGMM paper [for numerator stats] - Vector g_jm_den(S); // same, but for denominator stats. - SpMatrix H_jm_den(S); - - // Compute the weights for this sub-state. - // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - w_jm.AddMatVec(1.0, Matrix(model->w_), kNoTrans, - Vector(model->v_[j].Row(m)), 0.0); - w_jm.ApplySoftMax(); - - ComputePhoneVecStats(num_accs, *model, H, j, m, w_jm, gamma_jm_num, - &g_jm_num, &H_jm_num); - ComputePhoneVecStats(den_accs, *model, H, j, m, w_jm, gamma_jm_den, - &g_jm_den, &H_jm_den); - - Vector v_jm(model->v_[j].Row(m)); - Vector local_derivative(S); // difference of derivative of numerator - // and denominator objetive function. - local_derivative.AddVec(1.0, g_jm_num); - local_derivative.AddSpVec(-1.0, H_jm_num, v_jm, 1.0); - local_derivative.AddVec(-1.0, g_jm_den); - local_derivative.AddSpVec(-1.0 * -1.0, H_jm_den, v_jm, 1.0); - - SpMatrix quadratic_term(H_jm_num); - quadratic_term.AddSp(1.0, H_jm_den); - double substate_count = 1.0e-10 + gamma_jm_num + gamma_jm_den; - quadratic_term.Scale( (substate_count + options_.tau_v) / substate_count); - quadratic_term.Scale(1.0 / (options_.lrate_v + 1.0e-10) ); - - Vector delta_v_jm(S); - - SolverOptions opts; - opts.name = "v"; - opts.K = options_.max_cond; - opts.eps = options_.epsilon; - - double auxf_impr = - ((gamma_jm_num + gamma_jm_den == 0) ? 0.0 : - SolveQuadraticProblem(quadratic_term, local_derivative, - opts, &delta_v_jm)); - - v_jm.AddVec(1.0, delta_v_jm); - model->v_[j].Row(m).CopyFromVec(v_jm); - state_auxf_impr += auxf_impr; - } - - *auxf_impr += state_auxf_impr; - if (j < 10 && thread_id == 0) { - KALDI_LOG << "Objf impr for state j = " << j << " is " - << (state_auxf_impr / (num_state_count + 1.0e-10)) - << " over " << num_state_count << " frames"; - } - } -} - -double EbwAmSgmmUpdater::UpdatePhoneVectors(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - const vector< SpMatrix > &H) const { - KALDI_LOG << "Updating phone vectors."; - - double count = 0.0, auxf_impr = 0.0; - - int32 J = num_accs.num_states_; - for (int32 j = 0; j < J; j++) count += num_accs.gamma_[j].Sum(); - - EbwUpdatePhoneVectorsClass c(this, num_accs, den_accs, model, H, &auxf_impr); - RunMultiThreaded(c); - - auxf_impr /= count; - - KALDI_LOG << "**Overall auxf improvement for v is " << auxf_impr - << " over " << count << " frames"; - return auxf_impr; -} - - -double EbwAmSgmmUpdater::UpdateM(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - const std::vector< SpMatrix > &Q_num, - const std::vector< SpMatrix > &Q_den, - AmSgmm *model) const { - int32 S = model->PhoneSpaceDim(), - D = model->FeatureDim(), - I = model->NumGauss(); - - Vector num_count_vec(I), den_count_vec(I), impr_vec(I); - for (int32 j = 0; j < num_accs.num_states_; j++) { - num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]); - den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]); - } - - for (int32 i = 0; i < I; i++) { - double gamma_i_num = num_count_vec(i), gamma_i_den = den_count_vec(i); - - if (gamma_i_num + gamma_i_den == 0.0) { - KALDI_WARN << "Not updating phonetic basis for i = " << i - << " because count is zero. "; - continue; - } - - Matrix Mi(model->M_[i]); - Matrix L(D, S); // this is something like the Y quantity, which - // represents the linear term in the objf on M-- except that we make it the local - // derivative about the current value, instead of the derivative around zero. - // But it's not exactly the derivative w.r.t. M, due to the factor of Sigma_i. - // The auxiliary function is Q(x) = tr(M^T P Y) - 0.5 tr(P M Q M^T), - // where P is Y^{-1}. The quantity L we define here will be Y - M Q, - // and you can think of this as like the local derivative, except there is - // a term P in there. - L.AddMat(1.0, num_accs.Y_[i]); - L.AddMatSp(-1.0, Mi, kNoTrans, Q_num[i], 1.0); - L.AddMat(-1.0, den_accs.Y_[i]); - L.AddMatSp(-1.0*-1.0, Mi, kNoTrans, Q_den[i], 1.0); - - SpMatrix Q(S); // This is a combination of the Q's for the numerator and denominator. - Q.AddSp(1.0, Q_num[i]); - Q.AddSp(1.0, Q_den[i]); - - double state_count = 1.0e-10 + gamma_i_num + gamma_i_den; // the count - // represented by the quadratic part of the stats. - Q.Scale( (state_count + options_.tau_M) / state_count ); - Q.Scale( 1.0 / (options_.lrate_M + 1.0e-10) ); - - SolverOptions opts; - opts.name = "M"; - opts.K = options_.max_cond; - opts.eps = options_.epsilon; - - Matrix deltaM(D, S); - double impr = - SolveQuadraticMatrixProblem(Q, L, - SpMatrix(model->SigmaInv_[i]), - opts, &deltaM); - - impr_vec(i) = impr; - Mi.AddMat(1.0, deltaM); - model->M_[i].CopyFromMat(Mi); - if (i < 10 || impr / state_count > 3.0) { - KALDI_LOG << "Objf impr for projection M for i = " << i << ", is " - << (impr/(gamma_i_num + 1.0e-20)) << " over " << gamma_i_num - << " frames"; - } - } - BaseFloat tot_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum(); - - tot_impr /= (tot_count + 1.0e-20); - KALDI_LOG << "Overall auxiliary function improvement for model projections " - << "M is " << tot_impr << " over " << tot_count << " frames"; - - KALDI_VLOG(1) << "Updating M: num-count is " << num_count_vec; - KALDI_VLOG(1) << "Updating M: den-count is " << den_count_vec; - KALDI_VLOG(1) << "Updating M: objf-impr is " << impr_vec; - - return tot_impr; -} - - -// Note: we do just one iteration of the weight-projection update here. The -// weak-sense auxiliary functions used don't really make sense if we do it for -// multiple iterations. It would be possible to use a similar auxiliary -// function to the one on my (D. Povey)'s thesis for the Gaussian mixture -// weights, which would make sense for multiple iterations, but this would be a -// bit more complex to implement and probably would not give much improvement -// over this approach. -double EbwAmSgmmUpdater::UpdateWParallel(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model) { - KALDI_LOG << "Updating weight projections"; - - int32 I = num_accs.num_gaussians_, S = num_accs.phn_space_dim_; - - Matrix g_i_num(I, S), g_i_den(I, S); - - // View F_i_{num,den} as vectors of SpMatrix [i.e. symmetric matrices, - // linearized into vectors] - Matrix F_i_num(I, (S*(S+1))/2), F_i_den(I, (S*(S+1))/2); - - Vector num_count_vec(I), den_count_vec(I), impr_vec(I); - for (int32 j = 0; j < num_accs.num_states_; j++) { - num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]); - den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]); - } - - // Get the F_i and g_i quantities-- this is done in parallel (multi-core), - // using the same code we use in the ML update [except we get it for - // numerator and denominator separately.] - Matrix w(model->w_); - { - double garbage; - UpdateWParallelClass c_num(num_accs, *model, w, &F_i_num, &g_i_num, &garbage); - RunMultiThreaded(c_num); - } - { - double garbage; - UpdateWParallelClass c_den(den_accs, *model, w, &F_i_den, &g_i_den, &garbage); - RunMultiThreaded(c_den); - } - - for (int32 i = 0; i < I; i++) { - - // auxf was originally formulated in terms of the change in w (i.e. the - // g quantities are the local derivatives), so there is less hassle than - // with some of the other updates, in changing it to be discriminative. - // we essentially just difference the linear terms and add the quadratic - // terms. - - Vector derivative(g_i_num.Row(i)); - derivative.AddVec(-1.0, g_i_den.Row(i)); - // F_i_num quadratic_term is a bit like the negated 2nd derivative - // of the numerator stats-- actually it's not the actual 2nd deriv, - // but an upper bound on it. - SpMatrix quadratic_term(S), tmp_F(S); - quadratic_term.CopyFromVec(F_i_num.Row(i)); - tmp_F.CopyFromVec(F_i_den.Row(i)); // tmp_F is used for Vector->SpMatrix conversion. - quadratic_term.AddSp(1.0, tmp_F); - - double state_count = num_count_vec(i) + den_count_vec(i); - - quadratic_term.Scale((state_count + options_.tau_w) / (state_count + 1.0e-10)); - quadratic_term.Scale(1.0 / (options_.lrate_w + 1.0e-10) ); - - Vector delta_w(S); - - SolverOptions opts; - opts.name = "w"; - opts.K = options_.max_cond; - opts.eps = options_.epsilon; - - double objf_impr = - SolveQuadraticProblem(quadratic_term, derivative, opts, &delta_w); - - impr_vec(i) = objf_impr; - if (i < 10 || objf_impr / (num_count_vec(i) + 1.0e-10) > 2.0) { - KALDI_LOG << "Predicted objf impr for w per frame is " - << (objf_impr / (num_count_vec(i) + 1.0e-10)) - << " over " << num_count_vec(i) << " frames."; - } - model->w_.Row(i).AddVec(1.0, delta_w); - } - KALDI_VLOG(1) << "Updating w: numerator count is " << num_count_vec; - KALDI_VLOG(1) << "Updating w: denominator count is " << den_count_vec; - KALDI_VLOG(1) << "Updating w: objf-impr is " << impr_vec; - - double tot_num_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum(); - tot_impr /= tot_num_count; - - KALDI_LOG << "**Overall objf impr for w per frame is " - << tot_impr << " over " << tot_num_count - << " frames."; - return tot_impr; -} - - -double EbwAmSgmmUpdater::UpdateN(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model) const { - if (num_accs.spk_space_dim_ == 0 || num_accs.R_.size() == 0 || - num_accs.Z_.size() == 0) { - KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated"; - } - - int32 I = num_accs.num_gaussians_, D = num_accs.feature_dim_, - T = num_accs.spk_space_dim_; - - Vector num_count_vec(I), den_count_vec(I), impr_vec(I); - for (int32 j = 0; j < num_accs.num_states_; j++) { - num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]); - den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]); - } - - for (int32 i = 0; i < I; i++) { - double gamma_i_num = num_count_vec(i), gamma_i_den = den_count_vec(i); - if (gamma_i_num + gamma_i_den == 0.0) { - KALDI_WARN << "Not updating speaker basis for i = " << i - << " because count is zero. "; - continue; - } - Matrix Ni(model->N_[i]); - // See comment near declaration of L in UpdateM(). This update is the - // same, but change M->N, Y->Z and Q->R. - - Matrix L(D, T); - L.AddMat(1.0, num_accs.Z_[i]); - L.AddMatSp(-1.0, Ni, kNoTrans, num_accs.R_[i], 1.0); - L.AddMat(-1.0, den_accs.Z_[i]); - L.AddMatSp(-1.0*-1.0, Ni, kNoTrans, den_accs.R_[i], 1.0); - - SpMatrix R(T); // combination of the numerator and denominator R's. - R.AddSp(1.0, num_accs.R_[i]); - R.AddSp(1.0, den_accs.R_[i]); - - double state_count = 1.0e-10 + gamma_i_num + gamma_i_den; // the count - // represented by the quadratic part of the stats. - R.Scale( (state_count + options_.tau_N) / state_count ); - R.Scale( 1.0 / (options_.lrate_N + 1.0e-10) ); - - Matrix deltaN(D, T); - - SolverOptions opts; - opts.name = "M"; - opts.K = options_.max_cond; - opts.eps = options_.epsilon; - - double impr = - SolveQuadraticMatrixProblem(R, L, - SpMatrix(model->SigmaInv_[i]), - opts, &deltaN); - - impr_vec(i) = impr; - Ni.AddMat(1.0, deltaN); - model->N_[i].CopyFromMat(Ni); - if (i < 10 || impr / (state_count+1.0e-20) > 3.0) { - KALDI_LOG << "Objf impr for spk projection N for i = " << (i) - << ", is " << (impr / (gamma_i_num + 1.0e-20)) << " over " - << gamma_i_num << " frames"; - } - } - - KALDI_VLOG(1) << "Updating N: numerator count is " << num_count_vec; - KALDI_VLOG(1) << "Updating N: denominator count is " << den_count_vec; - KALDI_VLOG(1) << "Updating N: objf-impr is " << impr_vec; - - double tot_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum(); - tot_impr /= (tot_count + 1.0e-20); - KALDI_LOG << "**Overall auxf impr for N is " << tot_impr - << " over " << tot_count << " frames"; - return tot_impr; -} - -double EbwAmSgmmUpdater::UpdateVars(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - const std::vector< SpMatrix > &S_means, - AmSgmm *model) const { - // Note: S_means contains not only the quantity S_means in the paper, - // but also has a term - (Y_i M_i^T + M_i Y_i^T). Plus, it is differenced - // between numerator and denominator. We don't calculate it here, - // because it had to be computed with the original model, before we - // changed the M quantities. - int32 I = num_accs.num_gaussians_; - KALDI_ASSERT(S_means.size() == I); - - Vector num_count_vec(I), den_count_vec(I), impr_vec(I); - for (int32 j = 0; j < num_accs.num_states_; j++) { - num_count_vec.AddRowSumMat(1.0, num_accs.gamma_[j]); - den_count_vec.AddRowSumMat(1.0, den_accs.gamma_[j]); - } - - for (int32 i = 0; i < I; i++) { - double num_count = num_count_vec(i), den_count = den_count_vec(i); - - SpMatrix SigmaStats(S_means[i]); - SigmaStats.AddSp(1.0, num_accs.S_[i]); - SigmaStats.AddSp(-1.0, den_accs.S_[i]); - // SigmaStats now contain the stats for estimating Sigma (as in the main SGMM paper), - // differenced between num and den. - SpMatrix SigmaInvOld(model->SigmaInv_[i]), SigmaOld(model->SigmaInv_[i]); - SigmaOld.Invert(); - double count = num_count - den_count; - KALDI_ASSERT(options_.lrate_Sigma <= 1.0); - double inv_lrate = 1.0 / options_.lrate_Sigma; - // These formulas assure that the objective function behaves in - // a roughly symmetric way w.r.t. num and den counts. - double E_den = 1.0 + inv_lrate, E_num = inv_lrate - 1.0; - - double smoothing_count = - (options_.tau_Sigma * inv_lrate) + // multiply tau_Sigma by inverse-lrate - (E_den * den_count) + // for compatibility with other updates. - (E_num * num_count) + - 1.0e-10; - SigmaStats.AddSp(smoothing_count, SigmaOld); - count += smoothing_count; - SigmaStats.Scale(1.0 / count); - SpMatrix SigmaInv(SigmaStats); // before floor and ceiling. Currently sigma, - // not its inverse. - bool verbose = false; - int n_floor = SigmaInv.ApplyFloor(SigmaOld, options_.cov_min_value, verbose); - SigmaInv.Invert(); // make it inverse variance. - int n_ceiling = SigmaInv.ApplyFloor(SigmaInvOld, options_.cov_min_value, verbose); - - // this auxf_change. - double auxf_change = -0.5 * count *(TraceSpSp(SigmaInv, SigmaStats) - - TraceSpSp(SigmaInvOld, SigmaStats) - - SigmaInv.LogDet() - + SigmaInvOld.LogDet()); - - model->SigmaInv_[i].CopyFromSp(SigmaInv); - impr_vec(i) = auxf_change; - if (i < 10 || auxf_change / (num_count+den_count+1.0e-10) > 2.0 - || n_floor+n_ceiling > 0) { - KALDI_LOG << "Updating variance: Auxf change per frame for Gaussian " - << i << " is " << (auxf_change / num_count) << " over " - << num_count << " frames " << "(den count was " << den_count - << "), #floor,ceil was " << n_floor << ", " << n_ceiling; - } - } - KALDI_VLOG(1) << "Updating Sigma: numerator count is " << num_count_vec; - KALDI_VLOG(1) << "Updating Sigma: denominator count is " << den_count_vec; - KALDI_VLOG(1) << "Updating Sigma: objf-impr is " << impr_vec; - - double tot_count = num_count_vec.Sum(), tot_impr = impr_vec.Sum(); - tot_impr /= tot_count+1.0e-20; - KALDI_LOG << "**Overall auxf impr for Sigma is " << tot_impr - << " over " << tot_count << " frames"; - return tot_impr; -} - - -double EbwAmSgmmUpdater::UpdateSubstateWeights( - const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model) { - KALDI_LOG << "Updating substate mixture weights"; - - double tot_count = 0.0, tot_impr = 0.0; - for (int32 j = 0; j < num_accs.num_states_; j++) { - int32 M = model->NumSubstates(j); - Vector num_occs(M), den_occs(M), - orig_weights(model->c_[j]), weights(model->c_[j]); - - for (int32 m = 0; m < M; m++) { - num_occs(m) = num_accs.gamma_[j].Row(m).Sum() - + options_.tau_c * weights(m); - den_occs(m) = den_accs.gamma_[j].Row(m).Sum(); - } - - if (weights.Dim() > 1) { - double begin_auxf = 0.0, end_auxf = 0.0; - for (int32 m = 0; m < M; m++) { // see eq. 4.32, Dan Povey's PhD thesis. - begin_auxf += num_occs(m) * log (weights(m)) - - den_occs(m) * weights(m) / orig_weights(m); - } - for (int32 iter = 0; iter < 50; iter++) { - Vector k_jm(M); - double max_m = 0.0; - for (int32 m = 0; m < M; m++) - max_m = std::max(max_m, den_occs(m)/orig_weights(m)); - for (int32 m = 0; m < M; m++) - k_jm(m) = max_m - den_occs(m)/orig_weights(m); - for (int32 m = 0; m < M; m++) - weights(m) = num_occs(m) + k_jm(m)*weights(m); - weights.Scale(1.0 / weights.Sum()); - } - for (int32 m = 0; m < M; m++) - weights(m) = std::max(weights(m), - static_cast(options_.min_substate_weight)); - weights.Scale(1.0 / weights.Sum()); // renormalize. - - for (int32 m = 0; m < M; m++) { - end_auxf += num_occs(m) * log (weights(m)) - - den_occs(m) * weights(m) / orig_weights(m); - } - tot_impr += end_auxf - begin_auxf; - double this_impr = ((end_auxf - begin_auxf) / num_occs.Sum()); - if (j < 10 || this_impr > 0.5) { - KALDI_LOG << "Updating substate weights: auxf impr for state " << j - << " is " << this_impr << " per frame over " << num_occs.Sum() - << " frames (den count is " << den_occs.Sum() << ")"; - } - } - model->c_[j].CopyFromVec(weights); - tot_count += den_occs.Sum(); // Note: num and den occs should be the - // same, except num occs are smoothed, so this is what we want. - } - - tot_impr /= (tot_count + 1.0e-20); - - KALDI_LOG << "**Overall auxf impr for c is " << tot_impr - << " over " << tot_count << " frames"; - return tot_impr; -} - -} // namespace kaldi diff --git a/src/sgmm/estimate-am-sgmm-ebw.h b/src/sgmm/estimate-am-sgmm-ebw.h deleted file mode 100644 index d437dbe06a0..00000000000 --- a/src/sgmm/estimate-am-sgmm-ebw.h +++ /dev/null @@ -1,217 +0,0 @@ -// sgmm/estimate-am-sgmm-ebw.h - -// Copyright 2012 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_SGMM_ESTIMATE_AM_SGMM_EBW_H_ -#define KALDI_SGMM_ESTIMATE_AM_SGMM_EBW_H_ 1 - -#include -#include - -#include "gmm/model-common.h" -#include "itf/options-itf.h" -#include "sgmm/estimate-am-sgmm.h" - -namespace kaldi { - -/** - This header implements a form of Extended Baum-Welch training for SGMMs. - If you are confused by this comment, see Dan Povey's thesis for an explanation of - Extended Baum-Welch. - A note on the EBW (Extended Baum-Welch) updates for the SGMMs... In general there is - a parameter-specific value D that is similar to the D in EBW for GMMs. The value of - D is generally set to: - E * (denominator-count for that parameter) + tau-value for that parameter - where the tau-values are user-specified parameters that are specific to the type of - the parameter (e.g. phonetic vector, subspace projection, etc.). Things are a bit - more complex for this update than for GMMs, because it's not just a question of picking - a tau-value for smoothing: there is sometimes a scatter-matrix of some kind (e.g. - an outer product of vectors, or something) that defines a quadratic objective function - that we'll add as smoothing. We have to pick where to get this scatter-matrix from. - We feel that it's appropriate for the "E" part of the D to get its scatter-matrix from - denominator stats, and the tau part of the D to get half its scatter-matrix from the - both the numerator and denominator stats, assigned a weight proportional to how much - stats there were. When you see the auxiliary function written out, it's clear why this - makes sense. - - */ - -struct EbwAmSgmmOptions { - BaseFloat tau_v; ///< Smoothing constant for updates of sub-state vectors v_{jm} - BaseFloat lrate_v; ///< Learning rate used in updating v-- default 0.5 - BaseFloat tau_M; ///< Smoothing constant for the M quantities (phone-subspace projections) - BaseFloat lrate_M; ///< Learning rate used in updating M-- default 0.5 - BaseFloat tau_N; ///< Smoothing constant for the N quantities (speaker-subspace projections) - BaseFloat lrate_N; ///< Learning rate used in updating N-- default 0.5 - BaseFloat tau_c; ///< Tau value for smoothing substate weights (c) - BaseFloat tau_w; ///< Tau value for smoothing update of weight projectsions (w) - BaseFloat lrate_w; ///< Learning rate used in updating w-- default 0.5 - BaseFloat tau_Sigma; ///< Tau value for smoothing covariance-matrices Sigma. - BaseFloat lrate_Sigma; ///< Learning rate used in updating Sigma-- default 0.5 - BaseFloat min_substate_weight; ///< Minimum allowed weight in a sub-state. - - BaseFloat cov_min_value; ///< E.g. 0.5-- the maximum any eigenvalue of a covariance - /// is allowed to change. [this is the minimum; the maximum is the inverse of this, - /// i.e. 2.0 in this case. For example, 0.9 would constrain the covariance quite tightly, - /// 0.1 would be a loose setting. - - BaseFloat max_cond; ///< large value used in SolveQuadraticProblem. - BaseFloat epsilon; ///< very small value used in SolveQuadraticProblem; workaround - /// for an issue in some implementations of SVD. - - EbwAmSgmmOptions() { - tau_v = 50.0; - lrate_v = 0.5; - tau_M = 500.0; - lrate_M = 0.5; - tau_N = 500.0; - lrate_N = 0.5; - tau_c = 10.0; - tau_w = 50.0; - lrate_w = 1.0; - tau_Sigma = 500.0; - lrate_Sigma = 0.5; - - min_substate_weight = 1.0e-05; - cov_min_value = 0.5; - - max_cond = 1.0e+05; - epsilon = 1.0e-40; - } - - void Register(OptionsItf *opts) { - std::string module = "EbwAmSgmmOptions: "; - opts->Register("tau-v", &tau_v, module+ - "Smoothing constant for phone vector estimation."); - opts->Register("lrate-v", &lrate_v, module+ - "Learning rate constant for phone vector estimation."); - opts->Register("tau-m", &tau_M, module+ - "Smoothing constant for estimation of phonetic-subspace projections (M)."); - opts->Register("lrate-m", &lrate_M, module+ - "Learning rate constant for phonetic-subspace projections."); - opts->Register("tau-n", &tau_N, module+ - "Smoothing constant for estimation of speaker-subspace projections (N)."); - opts->Register("lrate-n", &lrate_N, module+ - "Learning rate constant for speaker-subspace projections."); - opts->Register("tau-c", &tau_c, module+ - "Smoothing constant for estimation of substate weights (c)"); - opts->Register("tau-w", &tau_w, module+ - "Smoothing constant for estimation of weight projections (w)"); - opts->Register("lrate-w", &lrate_w, module+ - "Learning rate constant for weight-projections"); - opts->Register("tau-sigma", &tau_Sigma, module+ - "Smoothing constant for estimation of within-class covariances (Sigma)"); - opts->Register("lrate-sigma", &lrate_Sigma, module+ - "Constant that controls speed of learning for variances (larger->slower)"); - opts->Register("cov-min-value", &cov_min_value, module+ - "Minimum value that an eigenvalue of the updated covariance matrix can take, " - "relative to its old value (maximum is inverse of this.)"); - opts->Register("min-substate-weight", &min_substate_weight, module+ - "Floor for weights of sub-states."); - opts->Register("max-cond", &max_cond, module+ - "Value used in handling singular matrices during update."); - opts->Register("epsilon", &max_cond, module+ - "Value used in handling singular matrices during update."); - } -}; - - -/** \class EbwAmSgmmUpdater - * Contains the functions needed to update the SGMM parameters. - */ -class EbwAmSgmmUpdater { - public: - explicit EbwAmSgmmUpdater(const EbwAmSgmmOptions &options): - options_(options) {} - - void Update(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - SgmmUpdateFlagsType flags, - BaseFloat *auxf_change_out, - BaseFloat *count_out); - - protected: - // The following two classes relate to multi-core parallelization of some - // phases of the update. - friend class EbwUpdateWParallelClass; - friend class EbwUpdatePhoneVectorsClass; - private: - EbwAmSgmmOptions options_; - - Vector gamma_j_; ///< State occupancies - - double UpdatePhoneVectors(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - const std::vector< SpMatrix > &H) const; - - // Called from UpdatePhoneVectors; updates a subset of states - // (relates to multi-threading). - void UpdatePhoneVectorsInternal(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model, - const std::vector > &H, - double *auxf_impr, - int32 num_threads, - int32 thread_id) const; - // Called from UpdatePhoneVectorsInternal - static void ComputePhoneVecStats(const MleAmSgmmAccs &accs, - const AmSgmm &model, - const std::vector > &H, - int32 j, - int32 m, - const Vector &w_jm, - double gamma_jm, - Vector *g_jm, - SpMatrix *H_jm); - - double UpdateM(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - const std::vector< SpMatrix > &Q_num, - const std::vector< SpMatrix > &Q_den, - AmSgmm *model) const; - - double UpdateN(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model) const; - - double UpdateVars(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - const std::vector< SpMatrix > &S_means, - AmSgmm *model) const; - - /// Note: in the discriminative case we do just one iteration of - /// updating the w quantities. - double UpdateWParallel(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model); - - double UpdateSubstateWeights(const MleAmSgmmAccs &num_accs, - const MleAmSgmmAccs &den_accs, - AmSgmm *model); - - KALDI_DISALLOW_COPY_AND_ASSIGN(EbwAmSgmmUpdater); - EbwAmSgmmUpdater() {} // Prevent unconfigured updater. -}; - - -} // namespace kaldi - - -#endif // KALDI_SGMM_ESTIMATE_AM_SGMM_EBW_H_ diff --git a/src/sgmm/estimate-am-sgmm-multi-test.cc b/src/sgmm/estimate-am-sgmm-multi-test.cc deleted file mode 100644 index 883934a3ce0..00000000000 --- a/src/sgmm/estimate-am-sgmm-multi-test.cc +++ /dev/null @@ -1,154 +0,0 @@ -// sgmm/estimate-am-sgmm-multi-test.cc - -// Copyright 2009-2012 Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "gmm/model-test-common.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm.h" -#include "sgmm/estimate-am-sgmm-multi.h" -#include "util/kaldi-io.h" -#include "base/kaldi-math.h" - -using kaldi::AmSgmm; -using kaldi::MleAmSgmmAccs; -using kaldi::BaseFloat; -using kaldi::Exp; - -namespace ut = kaldi::unittest; - -// Tests the MleAmSgmmUpdaterMulti (and MleAmSgmmGlobalAccs) classes. -void TestMultiSgmmEst(const std::vector &models, - const std::vector< kaldi::Matrix > &feats, - kaldi::SgmmUpdateFlagsType flags) { - using namespace kaldi; - typedef kaldi::int32 int32; - - int32 num_gauss = models[0]->NumGauss(), - feat_dim = models[0]->FeatureDim(), - phn_dim = models[0]->PhoneSpaceDim(), - spk_dim = models[0]->SpkSpaceDim(), - num_models = models.size(); - SgmmPerFrameDerivedVars frame_vars; - SgmmPerSpkDerivedVars spk_vars; - spk_vars.v_s.Resize(spk_dim); - spk_vars.v_s.SetRandn(); - SgmmGselectConfig sgmm_config; - frame_vars.Resize(num_gauss, feat_dim, phn_dim); - sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest, num_gauss); - - std::vector accs(num_models); - BaseFloat loglike = 0.0; - for (int32 i = 0; i < num_models; ++i) { - MleAmSgmmAccs* acc = new MleAmSgmmAccs(*models[i], flags); - models[i]->ComputePerSpkDerivedVars(&spk_vars); - for (int32 f = 0; f < feats[i].NumRows(); ++f) { - std::vector gselect; - models[i]->GaussianSelection(sgmm_config, feats[i].Row(f), &gselect); - models[i]->ComputePerFrameVars(feats[i].Row(f), gselect, spk_vars, 0.0, - &frame_vars); - loglike += acc->Accumulate(*models[i], frame_vars, spk_vars.v_s, 0, 1.0, - flags); - } - acc->CommitStatsForSpk(*models[i], spk_vars.v_s); - accs[i] = acc; - } - - std::vector new_models(num_models); - kaldi::MleAmSgmmOptions update_opts; - for (int32 i = 0; i < num_models; ++i) { - AmSgmm *sgmm1 = new AmSgmm(); - sgmm1->CopyFromSgmm(*models[i], false); - new_models[i] = sgmm1; - } - - // Updater class stores globals parameters; OK to initialize with any model - // since it is assumed that they have the same global parameters. - kaldi::MleAmSgmmUpdaterMulti updater(*models[0], update_opts); - updater.Update(accs, new_models, flags); - - BaseFloat loglike1 = 0.0; - for (int32 i = 0; i < num_models; ++i) { - new_models[i]->ComputePerSpkDerivedVars(&spk_vars); - for (int32 f = 0; f < feats[i].NumRows(); ++f) { - std::vector gselect; - new_models[i]->GaussianSelection(sgmm_config, feats[i].Row(f), &gselect); - new_models[i]->ComputePerFrameVars(feats[i].Row(f), gselect, spk_vars, 0.0, - &frame_vars); - loglike1 += new_models[i]->LogLikelihood(frame_vars, 0); - } - } - KALDI_LOG << "LL = " << loglike << "; LL1 = " << loglike1; - - KALDI_ASSERT(loglike1 >= loglike - (std::abs(loglike1)+std::abs(loglike))*1.0e-06); - - DeletePointers(&accs); - DeletePointers(&new_models); -} - -void UnitTestEstimateSgmm() { - int32 dim = 2 + kaldi::RandInt(0, 9); // random dimension of the gmm - int32 num_comp = 2 + kaldi::RandInt(0, 9); // random mixture size - kaldi::FullGmm full_gmm; - ut::InitRandFullGmm(dim, num_comp, &full_gmm); - - int32 num_states = 1; - int32 num_models = kaldi::RandInt(2, 9); - std::vector models(num_models); - for (int32 i =0; i < num_models; ++i) { - AmSgmm* sgmm = new AmSgmm(); - sgmm->InitializeFromFullGmm(full_gmm, num_states, dim+1, dim); - sgmm->ComputeNormalizers(); - models[i] = sgmm; - } - - std::vector< kaldi::Matrix > feats(num_models); - for (int32 i = 0; i < num_models; ++i) { - // First, generate random means and variances - int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2); - kaldi::Matrix means(num_feat_comp, dim), - vars(num_feat_comp, dim); - for (int32 m = 0; m < num_feat_comp; ++m) { - for (int32 d= 0; d < dim; d++) { - means(m, d) = kaldi::RandGauss(); - vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2; - } - } - // Now generate random features with those means and variances. - feats[i].Resize(num_feat_comp * 200, dim); - for (int32 m = 0; m < num_feat_comp; ++m) { - kaldi::SubMatrix tmp(feats[i], m*200, 200, 0, dim); - ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp); - } - } - kaldi::SgmmUpdateFlagsType flags = kaldi::kSgmmAll; - TestMultiSgmmEst(models, feats, flags); - flags = (kaldi::kSgmmPhoneProjections | kaldi::kSgmmPhoneWeightProjections | - kaldi::kSgmmCovarianceMatrix); - TestMultiSgmmEst(models, feats, flags); - flags = (kaldi::kSgmmSpeakerProjections | kaldi::kSgmmCovarianceMatrix | - kaldi::kSgmmPhoneVectors); - TestMultiSgmmEst(models, feats, flags); - kaldi::DeletePointers(&models); -} - -int main() { - for (int i = 0; i < 10; ++i) - UnitTestEstimateSgmm(); - std::cout << "Test OK.\n"; - return 0; -} diff --git a/src/sgmm/estimate-am-sgmm-multi.cc b/src/sgmm/estimate-am-sgmm-multi.cc deleted file mode 100644 index 38d517b55ff..00000000000 --- a/src/sgmm/estimate-am-sgmm-multi.cc +++ /dev/null @@ -1,746 +0,0 @@ -// sgmm/estimate-am-sgmm-multi.cc - -// Copyright 2012 Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -using std::string; -#include -using std::vector; - -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm-multi.h" -#include "thread/kaldi-thread.h" - -namespace kaldi { - -void MleAmSgmmGlobalAccs::ResizeAccumulators(const AmSgmm &model, - SgmmUpdateFlagsType flags) { - num_gaussians_ = model.NumGauss(); - feature_dim_ = model.FeatureDim(); - phn_space_dim_ = model.PhoneSpaceDim(); - spk_space_dim_ = model.SpkSpaceDim(); - - if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) { - Y_.resize(num_gaussians_); - Q_.resize(num_gaussians_); - for (int32 i = 0; i < num_gaussians_; ++i) { - Y_[i].Resize(feature_dim_, phn_space_dim_, kSetZero); - Q_[i].Resize(phn_space_dim_, kSetZero); - } - } else { - Y_.clear(); - Q_.clear(); - } - - if (flags & kSgmmCovarianceMatrix) { - S_.resize(num_gaussians_); - S_means_.resize(num_gaussians_); - for (int32 i = 0; i < num_gaussians_; i++) { - S_[i].Resize(feature_dim_, kSetZero); - S_means_[i].Resize(feature_dim_, kSetZero); - } - } else { - S_.clear(); - } - - if (flags & kSgmmSpeakerProjections) { - if (spk_space_dim_ == 0) { - KALDI_ERR << "Cannot set up accumulators for speaker projections " - << "because speaker subspace has not been set up"; - } - Z_.resize(num_gaussians_); - R_.resize(num_gaussians_); - for (int32 i = 0; i < num_gaussians_; ++i) { - Z_[i].Resize(feature_dim_, spk_space_dim_, kSetZero); - R_[i].Resize(spk_space_dim_, kSetZero); - } - } else { - Z_.clear(); - R_.clear(); - } - - gamma_i_.Resize(num_gaussians_, kSetZero); -} - -void MleAmSgmmGlobalAccs::ZeroAccumulators(SgmmUpdateFlagsType flags) { - if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) { - for (int32 i = 0, end = Y_.size(); i < end; ++i) - Y_[i].SetZero(); - } - if (flags & kSgmmCovarianceMatrix) { - for (int32 i = 0, end = S_.size(); i < end; ++i) { - S_[i].SetZero(); - S_means_[i].SetZero(); - } - } - - if (flags & kSgmmSpeakerProjections) { - for (int32 i = 0, end = Z_.size(); i < end; ++i) { - Z_[i].SetZero(); - R_[i].SetZero(); - } - } - gamma_i_.SetZero(); -} - -void MleAmSgmmGlobalAccs::AddAccumulators(const AmSgmm &model, - const MleAmSgmmAccs &accs, - SgmmUpdateFlagsType flags) { - total_frames_ += accs.total_frames_; - total_like_ += accs.total_like_; - for (int32 i = 0; i < num_gaussians_; ++i) { - if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) { - Y_[i].AddMat(1.0, accs.Y_[i], kNoTrans); - } - if (flags & kSgmmSpeakerProjections) { - Z_[i].AddMat(1.0, accs.Z_[i], kNoTrans); - R_[i].AddSp(1.0, accs.R_[i]); - } - if (flags & kSgmmCovarianceMatrix) - S_[i].AddSp(1.0, accs.S_[i]); - } - - // gamma_i - for (int32 j = 0; j < model.NumPdfs(); ++j) { - for (int32 m = 0; m < model.NumSubstates(j); ++m) { - gamma_i_.AddVec(1.0, accs.gamma_[j].Row(m)); - } - } - - // Compute the Q_i quantities (Eq. 64). - if (flags & kSgmmPhoneProjections) { - for (int32 i = 0; i < num_gaussians_; ++i) { - for (int32 j = 0; j < accs.num_states_; ++j) { - const Matrix &state_vec(model.StateVectors(j)); - for (int32 m = 0; m < model.NumSubstates(j); ++m) { - if (accs.gamma_[j](m, i) > 0.0) { - Q_[i].AddVec2(static_cast(accs.gamma_[j](m, i)), - state_vec.Row(m)); - } - } - } - } - } - - // Compute the S_i^{(means)} quantities (Eq. 74). - if (flags & kSgmmCovarianceMatrix) { - Matrix YM_MY(feature_dim_, feature_dim_); - SpMatrix tmp_S_means(feature_dim_); - Vector mu_jmi(feature_dim_); - for (int32 i = 0; i < num_gaussians_; ++i) { - // YM_MY = - (Y_{i} M_{i}^T) - Matrix M(model.GetPhoneProjection(i)); - YM_MY.AddMatMat(-1.0, accs.Y_[i], kNoTrans, M, kTrans, 0.0); - // Add its own transpose: YM_MY = - (Y_{i} M_{i}^T + M_{i} Y_{i}^T) - { - Matrix M(YM_MY, kTrans); - YM_MY.AddMat(1.0, M); - } - tmp_S_means.CopyFromMat(YM_MY); // Sigma_{i} = -(YM' + MY') - - for (int32 j = 0; j < accs.num_states_; ++j) { - for (int32 m = 0; m < model.NumSubstates(j); ++m) { - // Sigma_{i} += gamma_{jmi} * mu_{jmi}*mu_{jmi}^T - model.GetSubstateMean(j, m, i, &mu_jmi); - tmp_S_means.AddVec2(static_cast(accs.gamma_[j](m, i)), mu_jmi); - } - } - S_means_[i].AddSp(1.0, tmp_S_means); - KALDI_ASSERT(1.0 / S_means_[i](0, 0) != 0.0); - } - } -} - -BaseFloat MleAmSgmmUpdaterMulti::UpdateGlobals(const MleAmSgmmGlobalAccs &accs, - SgmmUpdateFlagsType flags) { - BaseFloat tot_impr = 0.0; - if (flags & kSgmmPhoneProjections) { - tot_impr += UpdateM(accs); - } - if (flags & kSgmmCovarianceMatrix) { - tot_impr += UpdateVars(accs); - } - if (flags & kSgmmSpeakerProjections) { - tot_impr += UpdateN(accs); - if (update_options_.renormalize_N) - KALDI_WARN << "Not renormalizing N"; - } - - KALDI_LOG << "**Total auxf improvement for phone projections & covariances is " - << (tot_impr) << " over " << accs.total_frames_ << " frames."; - return tot_impr; -} - -void MleAmSgmmUpdaterMulti::Update(const std::vector &accs, - const std::vector &models, - SgmmUpdateFlagsType flags) { - KALDI_ASSERT((flags & (kSgmmPhoneVectors | kSgmmPhoneProjections | - kSgmmPhoneWeightProjections | kSgmmCovarianceMatrix | - kSgmmSubstateWeights | kSgmmSpeakerProjections)) != 0); - if (accs.size() != models.size()) { - KALDI_ERR << "Found " << accs.size() << " accs and " << models.size() - << " models. Must have same number of models and accs."; - } - - SgmmUpdateFlagsType global_flags = (flags & (kSgmmPhoneProjections | - kSgmmPhoneWeightProjections | - kSgmmSpeakerProjections | - kSgmmCovarianceMatrix)); - SgmmUpdateFlagsType state_spec_flags = (flags & ~global_flags); - MleAmSgmmGlobalAccs glob_accs; - BaseFloat tot_impr = 0.0; - int32 num_models = models.size(); - - std::vector< SpMatrix > H; - if (update_options_.renormalize_V) - models[0]->ComputeH(&H); - - if (global_flags != 0) { // expected operating case - glob_accs.ResizeAccumulators(*models[0], global_flags); - for (int32 i = 0; i < num_models; ++i) { - glob_accs.AddAccumulators(*models[i], *accs[i], global_flags); - } - UpdateGlobals(glob_accs, global_flags); - - // Weight projection needs access to all models - if (global_flags & kSgmmPhoneWeightProjections) { - if (update_options_.use_sequential_weight_update) - KALDI_ERR << "Sequential weight update not implemented, using parallel"; -// tot_impr += UpdateWSequential(accs, model); -// } else { - tot_impr += UpdateWParallel(accs, models); -// } - } - } else { // Shouldn't be using this class without updating global params - KALDI_WARN << "Using MleAmSgmmUpdaterMulti class without updating global " - << " parameters."; - } - - // Update the state-specific parameters: phone vectors & substate weights - if (state_spec_flags != 0) { - MleAmSgmmOptions state_spec_opts = update_options_; - state_spec_opts.renormalize_V = false; - state_spec_opts.renormalize_N = false; - - MleAmSgmmUpdater sgmm_updater(state_spec_opts); - for (int32 i = 0; i < num_models; ++i) - tot_impr += sgmm_updater.Update(*accs[i], models[i], state_spec_flags); - } - - - if (update_options_.renormalize_V && (global_flags != 0)) { - SpMatrix H_sm; - this->ComputeSmoothingTerms(glob_accs, H, &H_sm); - RenormalizeV(H_sm, models); - } - - KALDI_LOG << "**Total auxf improvement, combining all parameters, over " - << "all model is " << tot_impr << " per frame."; - - // The following is just for diagnostics - double total_frames = 0, total_like = 0; - for (int32 i = 0; i < num_models; ++i) { - total_frames += accs[i]->TotalFrames(); - total_like += accs[i]->TotalLike(); - } - KALDI_LOG << "***Total data likelihood, over all models, is " - << (total_like/total_frames) << " over " << total_frames - << " frames."; - - // Now, copy the global parameters to the models - for (int32 i = 0; i < num_models; ++i) { - if ((flags & kSgmmPhoneProjections) || update_options_.renormalize_V) - models[i]->M_ = global_M_; - if (flags & kSgmmCovarianceMatrix) - models[i]->SigmaInv_ = global_SigmaInv_; - if ((flags & kSgmmSpeakerProjections) || update_options_.renormalize_N) - models[i]->N_ = global_N_; - if ((flags & kSgmmPhoneWeightProjections) || update_options_.renormalize_V) - models[i]->w_ = global_w_; - models[i]->ComputeNormalizers(); // So that the models are ready to use. - } -} - -// Compute H^{(sm)}, the "smoothing" matrices. -void MleAmSgmmUpdaterMulti::ComputeSmoothingTerms( - const MleAmSgmmGlobalAccs &accs, - const std::vector< SpMatrix > &H, - SpMatrix *H_sm) const { - KALDI_ASSERT(H_sm != NULL); - H_sm->Resize(PhoneSpaceDim()); - - double sum = 0.0; - for (int32 i = 0; i < NumGauss(); ++i) { - if (accs.gamma_i_(i) > 0) { - H_sm->AddSp(accs.gamma_i_(i), H[i]); - sum += accs.gamma_i_(i); - } - } - - if (sum == 0.0) { - KALDI_WARN << "Sum of counts is zero. Smoothing matrix set to unit"; - H_sm->SetUnit(); // arbitrary non-singular matrix - } else { - H_sm->Scale(1.0 / sum); - int32 tmp = H_sm->LimitCondDouble(update_options_.max_cond_H_sm); - if (tmp > 0) { - KALDI_WARN << "Limited " << tmp << " eigenvalues of H_sm."; - } - } -} - -double MleAmSgmmUpdaterMulti::UpdateM(const MleAmSgmmGlobalAccs &accs) { - double totcount = 0.0, tot_like_impr = 0.0; - for (int32 i = 0; i < accs.num_gaussians_; ++i) { - if (accs.gamma_i_(i) < accs.feature_dim_) { - KALDI_WARN << "For component " << i << ": not updating M due to very " - << "small count (=" << accs.gamma_i_(i) << ")."; - continue; - } - - - SolverOptions opts; - opts.name = "M"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - Matrix Mi(global_M_[i]); - double impr = - SolveQuadraticMatrixProblem(accs.Q_[i], accs.Y_[i], - SpMatrix(global_SigmaInv_[i]), - opts, &Mi); - global_M_[i].CopyFromMat(Mi); - - if (i % 50 == 0) { - KALDI_VLOG(2) << "Objf impr for projection M for i = " << i << ", is " - << (impr/(accs.gamma_i_(i) + 1.0e-20)) << " over " - << accs.gamma_i_(i) << " frames"; - } - totcount += accs.gamma_i_(i); - tot_like_impr += impr; - } - tot_like_impr /= (totcount + 1.0e-20); - KALDI_LOG << "Overall objective function improvement for model projections " - << "M is " << tot_like_impr << " over " << totcount << " frames"; - return tot_like_impr; -} - -double MleAmSgmmUpdaterMulti::UpdateN(const MleAmSgmmGlobalAccs &accs) { - double totcount = 0.0, tot_like_impr = 0.0; - if (accs.spk_space_dim_ == 0 || accs.R_.size() == 0 || accs.Z_.size() == 0) { - KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated"; - } - - for (int32 i = 0; i < accs.num_gaussians_; ++i) { - if (accs.gamma_i_(i) < 2 * accs.spk_space_dim_) { - KALDI_WARN << "Not updating speaker basis for i = " << (i) - << " because count is too small " << (accs.gamma_i_(i)); - continue; - } - - SolverOptions opts; - opts.name = "N"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - Matrix Ni(global_N_[i]); - double impr = - SolveQuadraticMatrixProblem(accs.R_[i], accs.Z_[i], - SpMatrix(global_SigmaInv_[i]), - opts, &Ni); - global_N_[i].CopyFromMat(Ni); - if (i < 10) { - KALDI_LOG << "Objf impr for spk projection N for i = " << (i) - << ", is " << (impr / (accs.gamma_i_(i) + 1.0e-20)) << " over " - << (accs.gamma_i_(i)) << " frames"; - } - totcount += accs.gamma_i_(i); - tot_like_impr += impr; - } - - tot_like_impr /= (totcount+1.0e-20); - KALDI_LOG << "**Overall objf impr for N is " << tot_like_impr << " over " - << totcount << " frames"; - return tot_like_impr; -} - - -double MleAmSgmmUpdaterMulti::UpdateVars(const MleAmSgmmGlobalAccs &accs) { - SpMatrix Sigma_i(accs.feature_dim_), Sigma_i_ml(accs.feature_dim_); - double tot_objf_impr = 0.0, tot_t = 0.0; - SpMatrix covfloor(accs.feature_dim_); - Vector objf_improv(accs.num_gaussians_); - - // First pass over all (shared) Gaussian components to calculate the - // ML estimate of the covariances, and the total covariance for flooring. - for (int32 i = 0; i < accs.num_gaussians_; ++i) { - // Eq. (75): Sigma_{i}^{ml} = 1/gamma_{i} [S_{i} + S_{i}^{(means)} - ... - // Y_{i} M_{i}^T - M_{i} Y_{i}^T] - // Note the S_means_ already contains the Y_{i} M_{i}^T terms. - Sigma_i_ml.CopyFromSp(accs.S_means_[i]); - Sigma_i_ml.AddSp(1.0, accs.S_[i]); - covfloor.AddSp(1.0, Sigma_i_ml); - // inverting small values e.g. 4.41745328e-40 seems to generate inf, - // although would be fixed up later. - if (accs.gamma_i_(i) > 1.0e-20) { - Sigma_i_ml.Scale(1 / (accs.gamma_i_(i) + 1.0e-20)); - } else { - Sigma_i_ml.SetUnit(); - } - KALDI_ASSERT(1.0 / Sigma_i_ml(0, 0) != 0.0); - // Eq. (76): Compute the objective function with the old parameter values - objf_improv(i) = global_SigmaInv_[i].LogPosDefDet() - - TraceSpSp(SpMatrix(global_SigmaInv_[i]), Sigma_i_ml); - - global_SigmaInv_[i].CopyFromSp(Sigma_i_ml); // inverted in the next loop. - } - - // Compute the covariance floor. - if (accs.gamma_i_.Sum() == 0) { // If no count, use identity. - KALDI_WARN << "Updating variances: zero counts. Setting floor to unit."; - covfloor.SetUnit(); - } else { // else, use the global average covariance. - covfloor.Scale(update_options_.cov_floor / accs.gamma_i_.Sum()); - int32 tmp; - if ((tmp = covfloor.LimitCondDouble(update_options_.max_cond)) != 0) { - KALDI_WARN << "Covariance flooring matrix is poorly conditioned. Fixed " - << "up " << (tmp) << " eigenvalues."; - } - } - - if (update_options_.cov_diag_ratio > 1000) { - KALDI_LOG << "Assuming you want to build a diagonal system since " - << "cov_diag_ratio is large: making diagonal covFloor."; - for (int32 i = 0; i < covfloor.NumRows(); i++) - for (int32 j = 0; j < i; j++) - covfloor(i, j) = 0.0; - } - - // Second pass over all (shared) Gaussian components to calculate the - // floored estimate of the covariances, and update the model. - for (int32 i = 0; i < accs.num_gaussians_; ++i) { - Sigma_i.CopyFromSp(global_SigmaInv_[i]); - Sigma_i_ml.CopyFromSp(Sigma_i); - // In case of insufficient counts, make the covariance matrix diagonal. - // cov_diag_ratio is 2 by default, set to very large to always get diag-cov - if (accs.gamma_i_(i) < update_options_.cov_diag_ratio * accs.feature_dim_) { - KALDI_WARN << "For Gaussian component " << i << ": Too low count " - << accs.gamma_i_(i) << " for covariance matrix estimation. " - << "Setting to diagonal"; - for (int32 d = 0; d < accs.feature_dim_; d++) - for (int32 e = 0; e < d; e++) - Sigma_i(d, e) = 0.0; // SpMatrix, can only set lower traingular part - - int floored = Sigma_i.ApplyFloor(covfloor); - if (floored > 0) { - KALDI_WARN << "For Gaussian component " << i << ": Floored " << floored - << " covariance eigenvalues."; - } - global_SigmaInv_[i].CopyFromSp(Sigma_i); - global_SigmaInv_[i].InvertDouble(); - } else { // Updating the full covariance matrix. - try { - int floored = Sigma_i.ApplyFloor(covfloor); - if (floored > 0) { - KALDI_WARN << "For Gaussian component " << i << ": Floored " - << floored << " covariance eigenvalues."; - } - global_SigmaInv_[i].CopyFromSp(Sigma_i); - global_SigmaInv_[i].InvertDouble(); - - objf_improv(i) += Sigma_i.LogPosDefDet() + - TraceSpSp(SpMatrix(global_SigmaInv_[i]), Sigma_i_ml); - objf_improv(i) *= (-0.5 * accs.gamma_i_(i)); // Eq. (76) - - tot_objf_impr += objf_improv(i); - tot_t += accs.gamma_i_(i); - if (i < 5) { - KALDI_VLOG(2) << "objf impr from variance update =" << objf_improv(i) - / (accs.gamma_i_(i) + 1.0e-20) << " over " << (accs.gamma_i_(i)) - << " frames for i = " << (i); - } - } catch(...) { - KALDI_WARN << "Updating within-class covariance matrix i = " << (i) - << ", numerical problem"; - // This is a catch-all thing in case of unanticipated errors, but - // flooring should prevent this occurring for the most part. - global_SigmaInv_[i].SetUnit(); // Set to unit. - } - } - } - KALDI_LOG << "**Overall objf impr for variance update = " - << (tot_objf_impr / (tot_t+ 1.0e-20)) - << " over " << (tot_t) << " frames"; - return tot_objf_impr / (tot_t + 1.0e-20); -} - - -// The parallel weight update, in the paper. -double MleAmSgmmUpdaterMulti::UpdateWParallel( - const std::vector &accs, - const std::vector &models) { - KALDI_LOG << "Updating weight projections"; - - int32 phn_dim = models[0]->PhoneSpaceDim(), - num_gauss = models[0]->NumGauss(), - num_models = models.size(); - SpMatrix v_vT(phn_dim); - // tot_like_{after, before} are totals over multiple iterations, - // not valid likelihoods. but difference is valid (when divided by tot_count). - double tot_predicted_like_impr = 0.0, tot_like_before = 0.0, - tot_like_after = 0.0, tot_count = 0.0; - - Vector w_jm(num_gauss); - Matrix g_i(num_gauss, phn_dim); - std::vector< SpMatrix > F_i(num_gauss); - - Matrix w(global_w_); - for (int iter = 0; iter < update_options_.weight_projections_iters; iter++) { - for (int32 i = 0; i < num_gauss; ++i) { - F_i[i].Resize(phn_dim, kSetZero); - } - double k_like_before = 0.0, k_count = 0.0; - g_i.SetZero(); - - // Unlike in the report the inner most loop is over Gaussians, where - // per-gaussian statistics are accumulated. This is more memory demanding - // but more computationally efficient, as outer product v_{jvm} v_{jvm}^T - // is computed only once for all gaussians. - - for (int32 mdl_idx = 0; mdl_idx < num_models; ++mdl_idx) { - std::vector< Matrix > gamma(accs[mdl_idx]->GetOccs()); - for (int32 j = 0; j < models[mdl_idx]->NumPdfs(); j++) { - for (int32 m = 0; m < models[mdl_idx]->NumSubstates(j); m++) { - double gamma_jm = gamma[j].Row(m).Sum(); - k_count += gamma_jm; - - // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - w_jm.AddMatVec(1.0, w, kNoTrans, - Vector(models[mdl_idx]->v_[j].Row(m)), 0.0); - w_jm.Add((-1.0) * w_jm.LogSumExp()); - k_like_before += VecVec(w_jm, gamma[j].Row(m)); - w_jm.ApplyExp(); - v_vT.SetZero(); - // v_vT := v_{jkm} v_{jkm}^T - v_vT.AddVec2(1.0, models[mdl_idx]->v_[j].Row(m)); - - for (int32 i = 0; i < num_gauss; i++) { - // Suggestion: g_jkm can be computed more efficiently - // using the Vector/Matrix routines for all i at once - // linear term around cur value. - double linear_term = gamma[j](m, i) - gamma_jm * w_jm(i); - double quadratic_term = std::max(gamma[j](m, i), gamma_jm * w_jm(i)); - g_i.Row(i).AddVec(linear_term, models[mdl_idx]->v_[j].Row(m)); - // Now I am calling this F_i in the document. [dan] - F_i[i].AddSp(quadratic_term, v_vT); - } - } // loop over substates - } // loop over states - } // loop over model/acc pairs - - Matrix w_orig(w); - double k_predicted_like_impr = 0.0, k_like_after = 0.0; - double min_step = 0.001, step_size; - - SolverOptions opts; - opts.name = "w"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - for (step_size = 1.0; step_size >= min_step; step_size /= 2) { - k_predicted_like_impr = 0.0; - k_like_after = 0.0; - - for (int32 i = 0; i < num_gauss; i++) { - // auxf is formulated in terms of change in w. - Vector delta_w(phn_dim); - // returns objf impr with step_size = 1, - // but it may not be 1 so we recalculate it. - SolveQuadraticProblem(F_i[i], g_i.Row(i), opts, &delta_w); - - delta_w.Scale(step_size); - double predicted_impr = VecVec(delta_w, g_i.Row(i)) - - 0.5 * VecSpVec(delta_w, F_i[i], delta_w); - - // should never be negative because - // we checked inside SolveQuadraticProblem. - KALDI_ASSERT(predicted_impr >= -1.0e-05); - - if (i < 10) { - KALDI_LOG << "Predicted objf impr for w (not per frame), iter = " << - (iter) << ", i = " << (i) << " is " << (predicted_impr); - } - k_predicted_like_impr += predicted_impr; - w.Row(i).AddVec(1.0, delta_w); - } - - for (int32 mdl_idx = 0; mdl_idx < num_models; ++mdl_idx) { - std::vector< Matrix > gamma(accs[mdl_idx]->GetOccs()); - for (int32 j = 0; j < models[mdl_idx]->NumPdfs(); j++) { - for (int32 m = 0; m < models[mdl_idx]->NumSubstates(j); m++) { - w_jm.AddMatVec(1.0, w, kNoTrans, - Vector(models[mdl_idx]->v_[j].Row(m)), 0.0); - w_jm.Add((-1.0) * w_jm.LogSumExp()); - k_like_after += VecVec(w_jm, gamma[j].Row(m)); - } - } - } - KALDI_VLOG(2) << "For iteration " << (iter) << ", updating w gives " - << "predicted per-frame like impr " - << (k_predicted_like_impr / k_count) << ", actual " - << ((k_like_after - k_like_before) / k_count) << ", over " - << (k_count) << " frames"; - if (k_like_after < k_like_before) { - w.CopyFromMat(w_orig); // Undo what we computed. - if (fabs(k_like_after - k_like_before) / k_count < 1.0e-05) { - k_like_after = k_like_before; - KALDI_WARN << "Not updating weights as not increasing auxf and " - << "probably due to numerical issues (since small change)."; - break; - } else { - KALDI_WARN << "Halving step size for weights as likelihood did " - << "not increase"; - } - } else { - break; - } - } - if (step_size < min_step) { - // Undo any step as we have no confidence that this is right. - w.CopyFromMat(w_orig); - } else { - if (iter == 0) { - tot_count += k_count; - } - tot_predicted_like_impr += k_predicted_like_impr; - tot_like_after += k_like_after; - tot_like_before += k_like_before; - } - } - - global_w_.CopyFromMat(w); - - tot_predicted_like_impr /= tot_count; - tot_like_after = (tot_like_after - tot_like_before) / tot_count; - KALDI_LOG << "**Overall objf impr for w is " << tot_predicted_like_impr - << ", actual " << tot_like_after << ", over " - << tot_count << " frames"; - return tot_like_after; -} - -void MleAmSgmmUpdaterMulti::RenormalizeV(const SpMatrix &H_sm, - const vector &models) { - int32 phn_dim = PhoneSpaceDim(), - feat_dim = FeatureDim(), - num_models = models.size(); - SpMatrix Sigma(phn_dim); - int32 count = 0; - for (int32 mdl = 0; mdl < num_models; ++mdl) { - for (int32 j = 0; j < models[mdl]->NumPdfs(); ++j) { - for (int32 m = 0; m < models[mdl]->NumSubstates(j); ++m) { - count++; - Sigma.AddVec2(static_cast(1.0), models[mdl]->v_[j].Row(m)); - } - } - } - Sigma.Scale(1.0 / count); - int32 fixed_eigs = Sigma.LimitCondDouble(update_options_.max_cond); - if (fixed_eigs != 0) { - KALDI_WARN << "Scatter of vectors v is poorly conditioned. Fixed up " - << fixed_eigs << " eigenvalues."; - } - KALDI_LOG << "Eigenvalues of scatter of vectors v is : "; - Sigma.PrintEigs("Sigma"); - if (!Sigma.IsPosDef()) { - KALDI_LOG << "Not renormalizing v because scatter is not positive definite" - << " -- maybe first iter?"; - return; - } - - // Want to make variance of v unit and H_sm (like precision matrix) diagonal. - TpMatrix L(phn_dim); - L.Cholesky(Sigma); - TpMatrix LInv(L); - LInv.Invert(); - - Matrix tmpL(phn_dim, phn_dim); - tmpL.CopyFromTp(L); - - SpMatrix H_sm_proj(phn_dim); - H_sm_proj.AddMat2Sp(1.0, tmpL, kTrans, H_sm, 0.0); - // H_sm_proj := L^{T} * H_sm * L. - // This is right because we would transform the vectors themselves - // by L^{-1}, and H_sm is like the inverse of the vectors, - // so it's {L^{-1}}^{-T} = L^T. - - Matrix U(phn_dim, phn_dim); - Vector eigs(phn_dim); - H_sm_proj.SymPosSemiDefEig(&eigs, &U, 1.0); // 1.0 means no checking +ve def -> faster - KALDI_LOG << "Note on the next diagnostic: the first number is generally not " - << "that meaningful as it relates to the static offset"; - H_sm_proj.PrintEigs("H_sm_proj (Significance of dims in vector space.. note)"); - - // Transform on vectors is U^T L^{-1}. - // Why? Because transform on H_sm is T =U^T L^T - // and we want T^{-T} by normal rules of vector/covector and we - // have (U^T L^T)^{-T} = (L U)^{-1} = U^T L^{-1}. - Matrix Trans(phn_dim, phn_dim); // T^{-T} - Matrix tmpLInv(phn_dim, phn_dim); - tmpLInv.CopyFromTp(LInv); - Trans.AddMatMat(1.0, U, kTrans, tmpLInv, kNoTrans, 0.0); - Matrix TransInv(Trans); - TransInv.Invert(); // T in above... - -#ifdef KALDI_PARANOID - { - SpMatrix H_sm_tmp(phn_dim); - H_sm_tmp.AddMat2Sp(1.0, TransInv, kTrans, H_sm, 0.0); - KALDI_ASSERT(H_sm_tmp.IsDiagonal(0.1)); - } - { - SpMatrix Sigma_tmp(phn_dim); - Sigma_tmp.AddMat2Sp(1.0, Trans, kNoTrans, Sigma, 0.0); - KALDI_ASSERT(Sigma_tmp.IsUnit(0.1)); - } -#endif - - for (int32 mdl = 0; mdl < num_models; ++mdl) { - for (int32 j = 0; j < models[mdl]->NumPdfs(); ++j) { - for (int32 m = 0; m < models[mdl]->NumSubstates(j); ++m) { - Vector tmp(phn_dim); - tmp.AddMatVec(1.0, Trans, kNoTrans, Vector(models[mdl]->v_[j].Row(m)), 0.0); - models[mdl]->v_[j].Row(m).CopyFromVec(tmp); - } - } - } - for (int32 i = 0; i < NumGauss(); ++i) { - Vector tmp(phn_dim); - tmp.AddMatVec(1.0, TransInv, kTrans, Vector(global_w_.Row(i)), 0.0); - global_w_.Row(i).CopyFromVec(tmp); - - Matrix tmpM(feat_dim, phn_dim); - // Multiplying on right not left so must not transpose TransInv. - tmpM.AddMatMat(1.0, Matrix(global_M_[i]), kNoTrans, - TransInv, kNoTrans, 0.0); - global_M_[i].CopyFromMat(tmpM); - } - KALDI_LOG << "Renormalized subspace."; -} - -} // namespace kaldi diff --git a/src/sgmm/estimate-am-sgmm-multi.h b/src/sgmm/estimate-am-sgmm-multi.h deleted file mode 100644 index 50eb28650b7..00000000000 --- a/src/sgmm/estimate-am-sgmm-multi.h +++ /dev/null @@ -1,146 +0,0 @@ -// sgmm/estimate-am-sgmm-multi.h - -// Copyright 2012 Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_SGMM_ESTIMATE_AM_SGMM_MULTI_H_ -#define KALDI_SGMM_ESTIMATE_AM_SGMM_MULTI_H_ 1 - -#include -#include - -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm.h" -#include "gmm/model-common.h" - -namespace kaldi { - -/** \class MleAmSgmmGlobalAccs - * Class for the accumulators associated with SGMM global parameters (e.g. - * phonetic-, weight- and speaker-projections; and covariances). This is - * used when the global parameters are updated using stats from multiple - * models. - */ -class MleAmSgmmGlobalAccs { - public: - explicit MleAmSgmmGlobalAccs() - : feature_dim_(0), phn_space_dim_(0), spk_space_dim_(0), - num_gaussians_(0), total_frames_(0.0), total_like_(0.0) {} - - /// Resizes the accumulators to the correct sizes given the model. The flags - /// argument control which accumulators to resize. - void ResizeAccumulators(const AmSgmm &model, SgmmUpdateFlagsType flags); - - /// Set the accumulators specified by the flags argument to zero. - void ZeroAccumulators(SgmmUpdateFlagsType flags); - - /// Add another accumulator object - void AddAccumulators(const AmSgmm &model, const MleAmSgmmAccs &acc, - SgmmUpdateFlagsType flags); - - int32 FeatureDim() const { return feature_dim_; } - int32 PhoneSpaceDim() const { return phn_space_dim_; } - int32 NumGauss() const { return num_gaussians_; } - - private: - /// The stats which are not tied to any state. - /// Stats Y_{i} for phonetic-subspace projections M; Dim is [I][D][S]. - std::vector< Matrix > Y_; - /// Stats Z_{i} for speaker-subspace projections N. Dim is [I][D][T]. - std::vector< Matrix > Z_; - /// R_{i}, quadratic term for speaker subspace estimation. Dim is [I][T][T] - std::vector< SpMatrix > R_; - /// S_{i}^{-}, scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D]. - std::vector< SpMatrix > S_; - /// Total occupancies gamma_i for each Gaussian. Dim is [I] - Vector gamma_i_; - - /// Q_{i}, quadratic term for phonetic subspace estimation. Dim is [I][S][S] - std::vector< SpMatrix > Q_; - /// Eq (74): S_{i}^{(means)}, scatter of substate mean vectors for estimating - /// the shared covariance matrices. Dimension is [I][D][D]. - std::vector< SpMatrix > S_means_; - - /// Dimensionality of various subspaces - int32 feature_dim_, phn_space_dim_, spk_space_dim_; - int32 num_gaussians_; ///< Other model specifications - - double total_frames_, total_like_; - - KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmGlobalAccs); - friend class MleAmSgmmUpdaterMulti; -}; - - -/** \class MleAmSgmmUpdaterMulti - * Contains the functions needed to update the parameters for multiple SGMMs - * whose global parameters are tied. - */ -class MleAmSgmmUpdaterMulti { - public: - explicit MleAmSgmmUpdaterMulti(const AmSgmm &model, - const MleAmSgmmOptions &options) - : update_options_(options), global_SigmaInv_(model.SigmaInv_), - global_M_(model.M_), global_N_(model.N_), global_w_(model.w_) {} - - void Update(const std::vector &accs, - const std::vector &models, - SgmmUpdateFlagsType flags); - - /// Various model dimensions. - int32 NumGauss() const { return global_M_.size(); } - int32 PhoneSpaceDim() const { return global_w_.NumCols(); } - int32 SpkSpaceDim() const { - return (global_N_.size() > 0) ? global_N_[0].NumCols() : 0; - } - int32 FeatureDim() const { return global_M_[0].NumRows(); } - - private: - MleAmSgmmOptions update_options_; - - /// SGMM global parameters that will be updated together and copied to the - /// different models: - std::vector< SpMatrix > global_SigmaInv_; - std::vector< Matrix > global_M_; - std::vector< Matrix > global_N_; - Matrix global_w_; - - BaseFloat UpdateGlobals(const MleAmSgmmGlobalAccs &glob_accs, - SgmmUpdateFlagsType flags); - - double UpdateM(const MleAmSgmmGlobalAccs &accs); - double UpdateN(const MleAmSgmmGlobalAccs &accs); - double UpdateVars(const MleAmSgmmGlobalAccs &accs); - double UpdateWParallel(const std::vector &accs, - const std::vector &models); -// double UpdateWSequential(const std::vector &accs, -// const std::vector &models); - - void ComputeSmoothingTerms(const MleAmSgmmGlobalAccs &accs, - const std::vector > &H, - SpMatrix *H_sm) const; - void RenormalizeV(const SpMatrix &H_sm, - const std::vector &models); - - KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmUpdaterMulti); - MleAmSgmmUpdaterMulti() {} // Prevent unconfigured updater. -}; - -} // namespace kaldi - - -#endif // KALDI_SGMM_ESTIMATE_AM_SGMM_MULTI_H_ diff --git a/src/sgmm/estimate-am-sgmm-test.cc b/src/sgmm/estimate-am-sgmm-test.cc deleted file mode 100644 index a671b7fcb74..00000000000 --- a/src/sgmm/estimate-am-sgmm-test.cc +++ /dev/null @@ -1,161 +0,0 @@ -// sgmm/estimate-am-sgmm-test.cc - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "gmm/model-test-common.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm.h" -#include "util/kaldi-io.h" -#include "base/kaldi-math.h" - -using kaldi::AmSgmm; -using kaldi::MleAmSgmmAccs; -using kaldi::BaseFloat; -using kaldi::Exp; - -namespace ut = kaldi::unittest; - -// Tests the Read() and Write() methods for the accumulators, in both binary -// and ASCII mode, as well as Check(). -void TestUpdateAndAccsIO(const AmSgmm &sgmm, - const kaldi::Matrix &feats) { - using namespace kaldi; - typedef kaldi::int32 int32; - - kaldi::SgmmUpdateFlagsType flags = kaldi::kSgmmAll; - kaldi::SgmmPerFrameDerivedVars frame_vars; - kaldi::SgmmPerSpkDerivedVars empty; - frame_vars.Resize(sgmm.NumGauss(), sgmm.FeatureDim(), - sgmm.PhoneSpaceDim()); - kaldi::SgmmGselectConfig sgmm_config; - sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest, - sgmm.NumGauss()); - MleAmSgmmAccs accs(sgmm, flags); - BaseFloat loglike = 0.0; - Vector empty_spk; - for (int32 i = 0; i < feats.NumRows(); i++) { - std::vector gselect; - sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect); - sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, 0.0, &frame_vars); - loglike += accs.Accumulate(sgmm, frame_vars, empty_spk, 0, 1.0, flags); - } - accs.CommitStatsForSpk(sgmm, empty_spk); - - kaldi::MleAmSgmmOptions update_opts; - update_opts.check_v = (Rand()%2 == 0); - AmSgmm *sgmm1 = new AmSgmm(); - sgmm1->CopyFromSgmm(sgmm, false); - kaldi::MleAmSgmmUpdater updater(update_opts); - updater.Update(accs, sgmm1, flags); - std::vector gselect; - - sgmm1->GaussianSelection(sgmm_config, feats.Row(0), &gselect); - sgmm1->ComputePerFrameVars(feats.Row(0), gselect, empty, 0.0, &frame_vars); - BaseFloat loglike1 = sgmm1->LogLikelihood(frame_vars, 0); - delete sgmm1; - - // First, non-binary write - accs.Write(kaldi::Output("tmpf", false).Stream(), false); - bool binary_in; - MleAmSgmmAccs *accs1 = new MleAmSgmmAccs(); - // Non-binary read - kaldi::Input ki1("tmpf", &binary_in); - accs1->Read(ki1.Stream(), binary_in, false); - accs1->Check(sgmm, true); - AmSgmm *sgmm2 = new AmSgmm(); - sgmm2->CopyFromSgmm(sgmm, false); - updater.Update(*accs1, sgmm2, flags); - - sgmm2->GaussianSelection(sgmm_config, feats.Row(0), &gselect); - sgmm2->ComputePerFrameVars(feats.Row(0), gselect, empty, 0.0, &frame_vars); - BaseFloat loglike2 = sgmm2->LogLikelihood(frame_vars, 0); - kaldi::AssertEqual(loglike1, loglike2, 1e-4); - delete accs1; - - // Next, binary write - accs.Write(kaldi::Output("tmpfb", true).Stream(), true); - MleAmSgmmAccs *accs2 = new MleAmSgmmAccs(); - // Binary read - kaldi::Input ki2("tmpfb", &binary_in); - accs2->Read(ki2.Stream(), binary_in, false); - accs2->Check(sgmm, true); - AmSgmm *sgmm3 = new AmSgmm(); - sgmm3->CopyFromSgmm(sgmm, false); - updater.Update(*accs2, sgmm3, flags); - sgmm3->GaussianSelection(sgmm_config, feats.Row(0), &gselect); - sgmm3->ComputePerFrameVars(feats.Row(0), gselect, empty, 0.0, &frame_vars); - BaseFloat loglike3 = sgmm3->LogLikelihood(frame_vars, 0); - kaldi::AssertEqual(loglike1, loglike3, 1e-6); - - // Testing the MAP update of M - update_opts.tau_map_M = 100; - update_opts.full_col_cov = (RandUniform() > 0.5)? true : false; - update_opts.full_row_cov = (RandUniform() > 0.5)? true : false; - kaldi::MleAmSgmmUpdater updater_map(update_opts); - BaseFloat impr = updater_map.Update(*accs2, sgmm3, flags); - KALDI_ASSERT(impr >= 0); - - delete accs2; - delete sgmm2; - delete sgmm3; - - unlink("tmpf"); - unlink("tmpfb"); -} - -void UnitTestEstimateSgmm() { - int32 dim = 1 + kaldi::RandInt(0, 9); // random dimension of the gmm - int32 num_comp = 2 + kaldi::RandInt(0, 9); // random mixture size - kaldi::FullGmm full_gmm; - ut::InitRandFullGmm(dim, num_comp, &full_gmm); - - int32 num_states = 1; - AmSgmm sgmm; - kaldi::SgmmGselectConfig config; - sgmm.InitializeFromFullGmm(full_gmm, num_states, dim+1, dim); - sgmm.ComputeNormalizers(); - - kaldi::Matrix feats; - - { // First, generate random means and variances - int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2); - kaldi::Matrix means(num_feat_comp, dim), - vars(num_feat_comp, dim); - for (int32 m = 0; m < num_feat_comp; m++) { - for (int32 d= 0; d < dim; d++) { - means(m, d) = kaldi::RandGauss(); - vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2; - } - } - // Now generate random features with those means and variances. - feats.Resize(num_feat_comp * 200, dim); - for (int32 m = 0; m < num_feat_comp; m++) { - kaldi::SubMatrix tmp(feats, m*200, 200, 0, dim); - ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp); - } - } - TestUpdateAndAccsIO(sgmm, feats); -} - -int main() { - for (int i = 0; i < 10; i++) - UnitTestEstimateSgmm(); - std::cout << "Test OK.\n"; - return 0; -} diff --git a/src/sgmm/estimate-am-sgmm.cc b/src/sgmm/estimate-am-sgmm.cc deleted file mode 100644 index 1e95e6b281c..00000000000 --- a/src/sgmm/estimate-am-sgmm.cc +++ /dev/null @@ -1,2135 +0,0 @@ -// sgmm/estimate-am-sgmm.cc - -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; -// Saarland University (Author: Arnab Ghoshal); -// Ondrej Glembek; Yanmin Qian; -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) -// Liang Lu; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm.h" -#include "thread/kaldi-thread.h" - -namespace kaldi { -using std::string; -using std::vector; - -void MleAmSgmmAccs::Write(std::ostream &out_stream, bool binary) const { - uint32 tmp_uint32; - - WriteToken(out_stream, binary, ""); - - WriteToken(out_stream, binary, ""); - tmp_uint32 = static_cast(num_states_); - WriteBasicType(out_stream, binary, tmp_uint32); - WriteToken(out_stream, binary, ""); - tmp_uint32 = static_cast(num_gaussians_); - WriteBasicType(out_stream, binary, tmp_uint32); - WriteToken(out_stream, binary, ""); - tmp_uint32 = static_cast(feature_dim_); - WriteBasicType(out_stream, binary, tmp_uint32); - WriteToken(out_stream, binary, ""); - tmp_uint32 = static_cast(phn_space_dim_); - WriteBasicType(out_stream, binary, tmp_uint32); - WriteToken(out_stream, binary, ""); - tmp_uint32 = static_cast(spk_space_dim_); - WriteBasicType(out_stream, binary, tmp_uint32); - if (!binary) out_stream << "\n"; - - if (Y_.size() != 0) { - KALDI_ASSERT(gamma_.size() != 0); - WriteToken(out_stream, binary, ""); - for (int32 i = 0; i < num_gaussians_; i++) { - Y_[i].Write(out_stream, binary); - } - } - if (Z_.size() != 0) { - KALDI_ASSERT(R_.size() != 0); - WriteToken(out_stream, binary, ""); - for (int32 i = 0; i < num_gaussians_; i++) { - Z_[i].Write(out_stream, binary); - } - WriteToken(out_stream, binary, ""); - for (int32 i = 0; i < num_gaussians_; i++) { - R_[i].Write(out_stream, binary); - } - } - if (S_.size() != 0) { - KALDI_ASSERT(gamma_.size() != 0); - WriteToken(out_stream, binary, ""); - for (int32 i = 0; i < num_gaussians_; i++) { - S_[i].Write(out_stream, binary); - } - } - if (y_.size() != 0) { - KALDI_ASSERT(gamma_.size() != 0); - WriteToken(out_stream, binary, ""); - for (int32 j = 0; j < num_states_; j++) { - y_[j].Write(out_stream, binary); - } - } - if (gamma_.size() != 0) { - WriteToken(out_stream, binary, ""); - for (int32 j = 0; j < num_states_; j++) { - gamma_[j].Write(out_stream, binary); - } - } - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, total_like_); - - WriteToken(out_stream, binary, ""); - WriteBasicType(out_stream, binary, total_frames_); - - WriteToken(out_stream, binary, ""); -} - -void MleAmSgmmAccs::Read(std::istream &in_stream, bool binary, - bool add) { - uint32 tmp_uint32; - string token; - - ExpectToken(in_stream, binary, ""); - - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &tmp_uint32); - num_states_ = static_cast(tmp_uint32); - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &tmp_uint32); - num_gaussians_ = static_cast(tmp_uint32); - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &tmp_uint32); - feature_dim_ = static_cast(tmp_uint32); - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &tmp_uint32); - phn_space_dim_ = static_cast(tmp_uint32); - ExpectToken(in_stream, binary, ""); - ReadBasicType(in_stream, binary, &tmp_uint32); - spk_space_dim_ = static_cast(tmp_uint32); - - ReadToken(in_stream, binary, &token); - - while (token != "") { - if (token == "") { - Y_.resize(num_gaussians_); - for (size_t i = 0; i < Y_.size(); i++) { - Y_[i].Read(in_stream, binary, add); - } - } else if (token == "") { - Z_.resize(num_gaussians_); - for (size_t i = 0; i < Z_.size(); i++) { - Z_[i].Read(in_stream, binary, add); - } - } else if (token == "") { - R_.resize(num_gaussians_); - if (gamma_s_.Dim() == 0) gamma_s_.Resize(num_gaussians_); - for (size_t i = 0; i < R_.size(); i++) { - R_[i].Read(in_stream, binary, add); - } - } else if (token == "") { - S_.resize(num_gaussians_); - for (size_t i = 0; i < S_.size(); i++) { - S_[i].Read(in_stream, binary, add); - } - } else if (token == "") { - y_.resize(num_states_); - for (int32 j = 0; j < num_states_; j++) { - y_[j].Read(in_stream, binary, add); - } - } else if (token == "") { - gamma_.resize(num_states_); - for (int32 j = 0; j < num_states_; j++) { - gamma_[j].Read(in_stream, binary, add); - } - // Don't read gamma_s, it's just a temporary variable and - // not part of the permanent (non-speaker-specific) accs. - } else if (token == "") { - double total_like; - ReadBasicType(in_stream, binary, &total_like); - if (add) - total_like_ += total_like; - else - total_like_ = total_like; - } else if (token == "") { - double total_frames; - ReadBasicType(in_stream, binary, &total_frames); - if (add) - total_frames_ += total_frames; - else - total_frames_ = total_frames; - } else { - KALDI_ERR << "Unexpected token '" << token << "' in model file "; - } - ReadToken(in_stream, binary, &token); - } -} - -void MleAmSgmmAccs::Check(const AmSgmm &model, - bool show_properties) const { - if (show_properties) { - KALDI_LOG << "SgmmPdfModel: J = " << num_states_ << ", D = " << - feature_dim_ << ", S = " << phn_space_dim_ << ", T = " << - spk_space_dim_ << ", I = " << num_gaussians_; - } - KALDI_ASSERT(num_states_ == model.NumPdfs() && num_states_ > 0); - KALDI_ASSERT(num_gaussians_ == model.NumGauss() && num_gaussians_ > 0); - KALDI_ASSERT(feature_dim_ == model.FeatureDim() && feature_dim_ > 0); - KALDI_ASSERT(phn_space_dim_ == model.PhoneSpaceDim() && phn_space_dim_ > 0); - KALDI_ASSERT(spk_space_dim_ == model.SpkSpaceDim()); - - std::ostringstream debug_str; - - if (Y_.size() == 0) { - debug_str << "Y: no. "; - } else { - KALDI_ASSERT(gamma_.size() != 0); - KALDI_ASSERT(Y_.size() == static_cast(num_gaussians_)); - bool nz = false; - for (int32 i = 0; i < num_gaussians_; i++) { - KALDI_ASSERT(Y_[i].NumRows() == feature_dim_ && - Y_[i].NumCols() == phn_space_dim_); - if (!nz && Y_[i](0, 0) != 0) { nz = true; } - } - debug_str << "Y: yes, " << string(nz ? "nonzero. " : "zero. "); - } - - if (Z_.size() == 0) { - KALDI_ASSERT(R_.size() == 0); - debug_str << "Z, R: no. "; - } else { - KALDI_ASSERT(gamma_s_.Dim() == num_gaussians_); - KALDI_ASSERT(Z_.size() == static_cast(num_gaussians_)); - KALDI_ASSERT(R_.size() == static_cast(num_gaussians_)); - bool Z_nz = false, R_nz = false; - for (int32 i = 0; i < num_gaussians_; i++) { - KALDI_ASSERT(Z_[i].NumRows() == feature_dim_ && - Z_[i].NumCols() == spk_space_dim_); - KALDI_ASSERT(R_[i].NumRows() == spk_space_dim_); - if (!Z_nz && Z_[i](0, 0) != 0) { Z_nz = true; } - if (!R_nz && R_[i](0, 0) != 0) { R_nz = true; } - } - bool gamma_s_nz = !gamma_s_.IsZero(); - debug_str << "Z: yes, " << string(Z_nz ? "nonzero. " : "zero. "); - debug_str << "R: yes, " << string(R_nz ? "nonzero. " : "zero. "); - debug_str << "gamma_s: yes, " << string(gamma_s_nz ? "nonzero. " : "zero. "); - } - - if (S_.size() == 0) { - debug_str << "S: no. "; - } else { - KALDI_ASSERT(gamma_.size() != 0); - bool S_nz = false; - KALDI_ASSERT(S_.size() == static_cast(num_gaussians_)); - for (int32 i = 0; i < num_gaussians_; i++) { - KALDI_ASSERT(S_[i].NumRows() == feature_dim_); - if (!S_nz && S_[i](0, 0) != 0) { S_nz = true; } - } - debug_str << "S: yes, " << string(S_nz ? "nonzero. " : "zero. "); - } - - if (y_.size() == 0) { - debug_str << "y: no. "; - } else { - KALDI_ASSERT(gamma_.size() != 0); - bool nz = false; - KALDI_ASSERT(y_.size() == static_cast(num_states_)); - for (int32 j = 0; j < num_states_; j++) { - KALDI_ASSERT(y_[j].NumRows() == model.NumSubstates(j)); - KALDI_ASSERT(y_[j].NumCols() == phn_space_dim_); - if (!nz && y_[j](0, 0) != 0) { nz = true; } - } - debug_str << "y: yes, " << string(nz ? "nonzero. " : "zero. "); - } - - if (gamma_.size() == 0) { - debug_str << "gamma: no. "; - } else { - debug_str << "gamma: yes. "; - bool nz = false; - KALDI_ASSERT(gamma_.size() == static_cast(num_states_)); - for (int32 j = 0; j < num_states_; j++) { - KALDI_ASSERT(gamma_[j].NumRows() == model.NumSubstates(j) && - gamma_[j].NumCols() == num_gaussians_); - // Just test the first substate for nonzero, else it would take too long. - if (!nz && gamma_[j].Row(0).Norm(1.0) != 0) { nz = true; } - } - debug_str << "gamma: yes, " << string(nz ? "nonzero. " : "zero. "); - } - - if (show_properties) - KALDI_LOG << "Subspace GMM model properties: " << debug_str.str(); -} - -void MleAmSgmmAccs::ResizeAccumulators(const AmSgmm &model, - SgmmUpdateFlagsType flags) { - num_states_ = model.NumPdfs(); - num_gaussians_ = model.NumGauss(); - feature_dim_ = model.FeatureDim(); - phn_space_dim_ = model.PhoneSpaceDim(); - spk_space_dim_ = model.SpkSpaceDim(); - - if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) { - Y_.resize(num_gaussians_); - for (int32 i = 0; i < num_gaussians_; i++) { - Y_[i].Resize(feature_dim_, phn_space_dim_); - } - } else { - Y_.clear(); - } - - if (flags & kSgmmSpeakerProjections) { - if (spk_space_dim_ == 0) { - KALDI_ERR << "Cannot set up accumulators for speaker projections " - << "because speaker subspace has not been set up"; - } - gamma_s_.Resize(num_gaussians_); - Z_.resize(num_gaussians_); - R_.resize(num_gaussians_); - for (int32 i = 0; i < num_gaussians_; i++) { - Z_[i].Resize(feature_dim_, spk_space_dim_); - R_[i].Resize(spk_space_dim_); - } - } else { - gamma_s_.Resize(0); - Z_.clear(); - R_.clear(); - } - - if (flags & kSgmmCovarianceMatrix) { - S_.resize(num_gaussians_); - for (int32 i = 0; i < num_gaussians_; i++) { - S_[i].Resize(feature_dim_); - } - } else { - S_.clear(); - } - - if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections | - kSgmmCovarianceMatrix | kSgmmSubstateWeights | - kSgmmPhoneProjections)) { - gamma_.resize(num_states_); - total_frames_ = total_like_ = 0; - for (int32 j = 0; j < num_states_; j++) { - gamma_[j].Resize(model.NumSubstates(j), num_gaussians_); - } - } else { - gamma_.clear(); - total_frames_ = total_like_ = 0; - } - - if (flags & kSgmmPhoneVectors) { - y_.resize(num_states_); - for (int32 j = 0; j < num_states_; j++) { - y_[j].Resize(model.NumSubstates(j), phn_space_dim_); - } - } else { - y_.clear(); - } -} - -BaseFloat MleAmSgmmAccs::Accumulate(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - const VectorBase &v_s, // may be empty - int32 j, BaseFloat weight, - SgmmUpdateFlagsType flags) { - // Calculate Gaussian posteriors and collect statistics - Matrix posteriors; - BaseFloat log_like = model.ComponentPosteriors(frame_vars, j, &posteriors); - posteriors.Scale(weight); - BaseFloat count = AccumulateFromPosteriors(model, frame_vars, posteriors, - v_s, j, flags); - // Note: total_frames_ is incremented in AccumulateFromPosteriors(). - total_like_ += count * log_like; - return log_like; -} - - -BaseFloat MleAmSgmmAccs::AccumulateFromPosteriors( - const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - const Matrix &posteriors, - const VectorBase &v_s, // may be empty - int32 j, - SgmmUpdateFlagsType flags) { - double tot_count = 0.0; - const vector &gselect = frame_vars.gselect; - // Intermediate variables - Vector gammat(gselect.size()); - Vector xt_jmi(feature_dim_), mu_jmi(feature_dim_), - zt_jmi(spk_space_dim_); - - int32 num_substates = model.NumSubstates(j); - for (int32 ki = 0; ki < static_cast(gselect.size()); ki++) { - int32 i = gselect[ki]; - - for (int32 m = 0; m < num_substates; m++) { - // Eq. (39): gamma_{jmi}(t) = p (j, m, i|t) - BaseFloat gammat_jmi = RandPrune(posteriors(ki, m), rand_prune_); - - // Accumulate statistics for non-zero gaussian posterior - if (gammat_jmi != 0.0) { - tot_count += gammat_jmi; - if (flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections | - kSgmmCovarianceMatrix | kSgmmSubstateWeights | - kSgmmPhoneProjections)) { - // Eq. (40): gamma_{jmi} = \sum_t gamma_{jmi}(t) - gamma_[j](m, i) += gammat_jmi; - } - - if (flags & kSgmmPhoneVectors) { - // Eq. (41): y_{jm} = \sum_{t, i} \gamma_{jmi}(t) z_{i}(t) - // Suggestion: move this out of the loop over m - y_[j].Row(m).AddVec(gammat_jmi, frame_vars.zti.Row(ki)); - } - - if (flags & (kSgmmPhoneProjections | kSgmmCovarianceMatrix)) { - // Eq. (42): Y_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{i}(t) v_{jm}^T - Y_[i].AddVecVec(gammat_jmi, frame_vars.xti.Row(ki), - model.StateVectors(j).Row(m)); - } - - if (flags & kSgmmCovarianceMatrix) - gammat(ki) += gammat_jmi; - - // Accumulate for speaker projections - if (flags & kSgmmSpeakerProjections) { - KALDI_ASSERT(spk_space_dim_ > 0); - // Eq. (43): x_{jmi}(t) = x_k(t) - M{i} v_{jm} - model.GetSubstateMean(j, m, i, &mu_jmi); - xt_jmi.CopyFromVec(frame_vars.xt); - xt_jmi.AddVec(-1.0, mu_jmi); - // Eq. (44): Z_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{jmi}(t) v^{s}' - if (v_s.Dim() != 0) // interpret empty v_s as zero. - Z_[i].AddVecVec(gammat_jmi, xt_jmi, v_s); - // Eq. (49): \gamma_{i}^{(s)} = \sum_{t\in\Tau(s), j, m} gamma_{jmi} - // Will be used when you call CommitStatsForSpk(), to update R_. - gamma_s_(i) += gammat_jmi; - } - } // non-zero posteriors - } // loop over substates - } // loop over selected Gaussians - - if (flags & kSgmmCovarianceMatrix) { - for (int32 ki = 0; ki < static_cast(gselect.size()); ki++) { - int32 i = gselect[ki]; - // Eq. (47): S_{i} = \sum_{t, j, m} \gamma_{jmi}(t) x_{i}(t) x_{i}(t)^T - if (gammat(ki) != 0.0) - S_[i].AddVec2(gammat(ki), frame_vars.xti.Row(ki)); - } - } - total_frames_ += tot_count; - return tot_count; -} - -void MleAmSgmmAccs::CommitStatsForSpk(const AmSgmm &model, - const VectorBase &v_s) { - if (v_s.Dim() != 0 && spk_space_dim_ > 0 && gamma_s_.Dim() != 0) { - if (!v_s.IsZero()) - for (int32 i = 0; i < num_gaussians_; i++) - // Accumulate Statistics R_{ki} - if (gamma_s_(i) != 0.0) - R_[i].AddVec2(static_cast(gamma_s_(i)), v_s); - } - gamma_s_.SetZero(); -} - -void MleAmSgmmAccs::GetStateOccupancies(Vector *occs) const { - occs->Resize(gamma_.size()); - for (int32 j = 0, end = gamma_.size(); j < end; j++) { - (*occs)(j) = gamma_[j].Sum(); - } -} - -BaseFloat MleAmSgmmUpdater::Update(const MleAmSgmmAccs &accs, - AmSgmm *model, - SgmmUpdateFlagsType flags) { - KALDI_ASSERT((flags & (kSgmmPhoneVectors | kSgmmPhoneProjections | - kSgmmPhoneWeightProjections | kSgmmCovarianceMatrix | - kSgmmSubstateWeights | kSgmmSpeakerProjections)) != 0); - - if (flags & kSgmmPhoneProjections) - ComputeQ(accs, *model, &Q_); - if (flags & kSgmmCovarianceMatrix) - ComputeSMeans(accs, *model, &S_means_); - - // quantities used in both vector and weights updates... - vector< SpMatrix > H; - // "smoothing" matrices, weighted sums of above. - SpMatrix H_sm; - Vector y_sm; // "smoothing" vectors - if ((flags & (kSgmmPhoneVectors | kSgmmPhoneWeightProjections)) - || update_options_.renormalize_V) { - model->ComputeH(&H); - ComputeSmoothingTerms(accs, *model, H, &H_sm, - (flags & kSgmmPhoneVectors) ? &y_sm : NULL); - } - - BaseFloat tot_impr = 0.0; - - if (flags & kSgmmPhoneVectors) { - if (update_options_.check_v) { - KALDI_ASSERT(update_options_.tau_vec == 0 && - "You cannot combine the check-v and tau-vec options."); - tot_impr += UpdatePhoneVectorsChecked(accs, model, H); - } else { - tot_impr += UpdatePhoneVectors(accs, model, H, H_sm, y_sm); - } - } - if (flags & kSgmmPhoneProjections) { - if (update_options_.tau_map_M > 0.0) - tot_impr += MapUpdateM(accs, model); // MAP adaptation of M - else - tot_impr += UpdateM(accs, model); - } - - if (flags & kSgmmPhoneWeightProjections) { - if (update_options_.use_sequential_weight_update) { - tot_impr += UpdateWSequential(accs, model); - } else { - tot_impr += UpdateWParallel(accs, model); - } - } - if (flags & kSgmmCovarianceMatrix) - tot_impr += UpdateVars(accs, model); - if (flags & kSgmmSubstateWeights) - tot_impr += UpdateSubstateWeights(accs, model); - if (flags & kSgmmSpeakerProjections) { - tot_impr += UpdateN(accs, model); - if (update_options_.renormalize_N) - RenormalizeN(accs, model); // if you renormalize N you have to - // alter any speaker vectors you're keeping around, as well. - } - - if (update_options_.renormalize_V) - RenormalizeV(accs, model, H_sm); - - KALDI_LOG << "*Overall auxf improvement, combining all parameters, is " - << tot_impr; - - KALDI_LOG << "***Overall data likelihood is " - << (accs.total_like_/accs.total_frames_) - << " over " << (accs.total_frames_) << " frames."; - - model->ComputeNormalizers(); // So that the model is ready to use. - return tot_impr; -} - -// Compute the Q_{i} (Eq. 64) -void MleAmSgmmUpdater::ComputeQ(const MleAmSgmmAccs &accs, - const AmSgmm &model, - std::vector< SpMatrix > *Q) { - Q->resize(accs.num_gaussians_); - for (int32 i = 0; i < accs.num_gaussians_; i++) { - (*Q)[i].Resize(accs.phn_space_dim_); - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model.NumSubstates(j); m++) { - if (accs.gamma_[j](m, i) > 0.0) { - (*Q)[i].AddVec2(static_cast(accs.gamma_[j](m, i)), - model.v_[j].Row(m)); - } - } - } - } -} - -// Compute the S_i^{(means)} quantities (Eq. 74). -// Note: we seem to have also included in this variable -// the term - (Y_i M_I^T + M_i Y_i^T). -void MleAmSgmmUpdater::ComputeSMeans(const MleAmSgmmAccs &accs, - const AmSgmm &model, - std::vector< SpMatrix > *S_means) { - S_means->resize(accs.num_gaussians_); - Matrix YM_MY(accs.feature_dim_, accs.feature_dim_); - Vector mu_jmi(accs.feature_dim_); - for (int32 i = 0; i < accs.num_gaussians_; i++) { - // YM_MY = - (Y_{i} M_{i}^T) - YM_MY.AddMatMat(-1.0, accs.Y_[i], kNoTrans, - Matrix(model.M_[i]), kTrans, 0.0); - // Add its own transpose: YM_MY = - (Y_{i} M_{i}^T + M_{i} Y_{i}^T) - { - Matrix M(YM_MY, kTrans); - YM_MY.AddMat(1.0, M); - } - (*S_means)[i].Resize(accs.feature_dim_, kUndefined); - (*S_means)[i].CopyFromMat(YM_MY); // Sigma_{i} = -(YM' + MY') - - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model.NumSubstates(j); m++) { - if (accs.gamma_[j](m, i) != 0.0) { - // Sigma_{i} += gamma_{jmi} * mu_{jmi}*mu_{jmi}^T - mu_jmi.AddMatVec(1.0, model.M_[i], kNoTrans, model.v_[j].Row(m), 0.0); - (*S_means)[i].AddVec2(static_cast(accs.gamma_[j](m, i)), mu_jmi); - } - } - } - KALDI_ASSERT(1.0 / (*S_means)[i](0, 0) != 0.0); - } -} - -// Compute H^{(sm)}, the "smoothing" matrices. -void MleAmSgmmUpdater::ComputeSmoothingTerms(const MleAmSgmmAccs &accs, - const AmSgmm &model, - const vector > &H, - SpMatrix *H_sm, - Vector *y_sm) const { - KALDI_ASSERT(H_sm != NULL); - H_sm->Resize(accs.phn_space_dim_); - if (y_sm != NULL) y_sm->Resize(accs.phn_space_dim_); - Vector gamma_i(accs.num_gaussians_); - - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0, end = model.NumSubstates(j); m < end; m++) { - gamma_i.AddVec(1.0, accs.gamma_[j].Row(m)); - if (y_sm != NULL) (*y_sm).AddVec(1.0, accs.y_[j].Row(m)); - } - } - - double sum = 0.0; - for (int32 i = 0; i < accs.num_gaussians_; i++) { - if (gamma_i(i) > 0) { - H_sm->AddSp(gamma_i(i), H[i]); - sum += gamma_i(i); - } - } - - if (sum == 0.0) { - KALDI_WARN << "Sum of counts is zero. Smoothing matrix set to unit" - << string((y_sm != NULL)? " & smoothing vector set to 0." : "."); - H_sm->SetUnit(); // arbitrary non-singular matrix - } else { - if (y_sm != NULL) { - (*y_sm).Scale(1.0 / sum); - KALDI_VLOG(3) << "y_sm is " << (*y_sm); - } - H_sm->Scale(1.0 / sum); - Matrix H_sm_old(*H_sm); - int32 tmp = H_sm->LimitCondDouble(update_options_.max_cond_H_sm); - if (tmp > 0) { - KALDI_WARN << "Limited " << tmp << " eigenvalues of H_sm."; - if (update_options_.fixup_H_sm && y_sm != NULL) { - Vector avgVec(accs.phn_space_dim_); - SpMatrix HInv(H_sm_old); - HInv.Invert(); - avgVec.AddSpVec(1.0, HInv, (*y_sm), 0.0); - (*y_sm).AddSpVec(1.0, (*H_sm), avgVec, 0.0); - KALDI_VLOG(3) << "y_sm [fixed up] is " << (*y_sm); - } - } - } -} - - -class UpdatePhoneVectorsClass: public MultiThreadable { // For multi-threaded. - public: - UpdatePhoneVectorsClass(const MleAmSgmmUpdater &updater, - const MleAmSgmmAccs &accs, - AmSgmm *model, - const std::vector > &H, - const SpMatrix &H_sm, - const Vector &y_sm, - double *auxf_impr, - double *like_impr): - updater_(updater), accs_(accs), model_(model), - H_(H), H_sm_(H_sm), y_sm_(y_sm), auxf_impr_ptr_(auxf_impr), - auxf_impr_(0.0), like_impr_ptr_(like_impr), like_impr_(0.0) { } - - ~UpdatePhoneVectorsClass() { - *auxf_impr_ptr_ += auxf_impr_; - *like_impr_ptr_ += like_impr_; - } - - inline void operator() () { - // Note: give them local copy of the sums we're computing, - // which will be propagated to the total sums in the destructor. - updater_.UpdatePhoneVectorsInternal(accs_, model_, H_, H_sm_, y_sm_, - &auxf_impr_, &like_impr_, - num_threads_, thread_id_); - } - private: - const MleAmSgmmUpdater &updater_; - const MleAmSgmmAccs &accs_; - AmSgmm *model_; - const std::vector > &H_; - const SpMatrix &H_sm_; - const Vector &y_sm_; - double *auxf_impr_ptr_; - double auxf_impr_; - double *like_impr_ptr_; - double like_impr_; -}; - - -// Runs the phone vectors update for a subset of states (called -// multi-threaded). -void MleAmSgmmUpdater::UpdatePhoneVectorsInternal( - const MleAmSgmmAccs &accs, - AmSgmm *model, - const std::vector > &H, - const SpMatrix &H_sm, - const Vector &y_sm, - double *auxf_impr, - double *like_impr, - int32 num_threads, - int32 thread_id) const { - - int32 block_size = (accs.num_states_ + (num_threads-1)) / num_threads, - j_start = block_size * thread_id, - j_end = std::min(accs.num_states_, j_start + block_size); - - for (int32 j = j_start; j < j_end; j++) { - double state_count = 0.0, state_auxf_impr = 0.0, state_like_impr = 0.0; - Vector w_jm(accs.num_gaussians_); - for (int32 m = 0; m < model->NumSubstates(j); m++) { - double gamma_jm = accs.gamma_[j].Row(m).Sum(); - state_count += gamma_jm; - Vector g_jm(accs.phn_space_dim_); // computed using eq. 58 - SpMatrix H_jm(accs.phn_space_dim_); // computed using eq. 59 - // First compute normal H_jm. - - // need weights for this ... - // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - w_jm.AddMatVec(1.0, Matrix(model->w_), kNoTrans, - Vector(model->v_[j].Row(m)), 0.0); - w_jm.ApplySoftMax(); - g_jm.CopyFromVec(accs.y_[j].Row(m)); - - for (int32 i = 0; i < accs.num_gaussians_; i++) { - double gamma_jmi = accs.gamma_[j](m, i); - double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i)); - double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term - * VecVec(model->w_.Row(i), model->v_[j].Row(m)); - g_jm.AddVec(scalar, model->w_.Row(i)); - if (gamma_jmi != 0.0) { - H_jm.AddSp(gamma_jmi, H[i]); // The most important term.. - } - if (quadratic_term > 1.0e-10) { - H_jm.AddVec2(static_cast(quadratic_term), model->w_.Row(i)); - } - } - SpMatrix H_jm_dash(H_jm); // with ad-hoc smoothing term. - Vector g_jm_dash(g_jm); // with ad-hoc smoothing term. - - // H_jm_dash = H_jm + (smoothing term) - H_jm_dash.AddSp(update_options_.tau_vec, H_sm); - // g_jm_dash.BlasGemv(update_options_.mTauVec, H_sm, kNoTrans, e_1, 1.0); - // g_jm_dash = g_jm + (smoothing term) - g_jm_dash.AddVec(update_options_.tau_vec, y_sm); - - // if (gamma_jm == 0) continue; - // no, we still want to update even with zero count. -#ifdef KALDI_PARANOID - if (update_options_.tau_vec > 0) - KALDI_ASSERT(H_jm_dash.IsPosDef()); -#endif - Vector vhat_jm(model->v_[j].Row(m)); - SolverOptions opts; - opts.name = "v"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - double objf_impr_with_prior = - SolveQuadraticProblem(H_jm_dash, - g_jm_dash, - opts, - &vhat_jm); - - SpMatrix H_jm_flt(H_jm); - - double objf_impr_noprior = - (VecVec(vhat_jm, g_jm) - - 0.5 * VecSpVec(vhat_jm, H_jm, vhat_jm)) - - (VecVec(model->v_[j].Row(m), g_jm) - - 0.5 * VecSpVec(model->v_[j].Row(m), H_jm_flt, model->v_[j].Row(m))); - model->v_[j].Row(m).CopyFromVec(vhat_jm); - if (j < 3 && m < 2 && thread_id == 0) { - KALDI_LOG << "Objf impr for j = " << (j) << " m = " << (m) << " is " - << (objf_impr_with_prior / (gamma_jm + 1.0e-20)) - << " (with ad-hoc prior) " - << (objf_impr_noprior / (gamma_jm + 1.0e-20)) - << " (no prior) over " << (gamma_jm) << " frames"; - } - state_auxf_impr += objf_impr_with_prior; - state_like_impr += objf_impr_noprior; - } - - *auxf_impr += state_auxf_impr; - *like_impr += state_like_impr; - if (j < 10 && thread_id == 0) { - KALDI_LOG << "Objf impr for state j = " << (j) << " is " - << (state_auxf_impr / (state_count + 1.0e-20)) - << " (with ad-hoc prior) " - << (state_like_impr / (state_count + 1.0e-20)) - << " (no prior) over " << (state_count) << " frames"; - } - } -} - -double MleAmSgmmUpdater::UpdatePhoneVectors(const MleAmSgmmAccs &accs, - AmSgmm *model, - const vector< SpMatrix > &H, - const SpMatrix &H_sm, - const Vector &y_sm) { - KALDI_LOG << "Updating phone vectors"; - - double count = 0.0, auxf_impr = 0.0, like_impr = 0.0; // sum over all states - - for (int32 j = 0; j < accs.num_states_; j++) count += accs.gamma_[j].Sum(); - - UpdatePhoneVectorsClass c(*this, accs, model, H, H_sm, y_sm, - &auxf_impr, &like_impr); - RunMultiThreaded(c); - - auxf_impr /= (count + 1.0e-20); - like_impr /= (count + 1.0e-20); - KALDI_LOG << "**Overall objf impr for v is " << auxf_impr - << "(with ad-hoc prior) " << like_impr << " (no prior) over " - << (count) << " frames"; - // Choosing to return actual likelihood impr here. - return like_impr; -} - - -/** - This is as UpdatePhoneVectors but does not support smoothing terms or - parallelization. However, it does compute the auxiliary function - after doing the update, and backtracks if it did not increase (due - to the weight terms, increase is not mathematically guaranteed). */ - -double MleAmSgmmUpdater::UpdatePhoneVectorsChecked(const MleAmSgmmAccs &accs, - AmSgmm *model, - const vector< SpMatrix > &H) { - KALDI_LOG << "Updating phone vectors (and checking auxiliary function)"; - - double tot_count = 0.0, tot_objf_impr = 0.0, tot_auxf_impr = 0.0; // sum over all states - - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - double gamma_jm = accs.gamma_[j].Row(m).Sum(); - SpMatrix X_jm(accs.phn_space_dim_); // = \sum_i \gamma_{jmi} H_i - - for (int32 i = 0; i < accs.num_gaussians_; i++) { - double gamma_jmi = accs.gamma_[j](m, i); - if (gamma_jmi != 0.0) - X_jm.AddSp(gamma_jmi, H[i]); - } - - Vector v_jm_orig(model->v_[j].Row(m)), - v_jm(v_jm_orig); - - double exact_objf_start = 0.0, exact_objf = 0.0, auxf_impr = 0.0; - int32 backtrack_iter, max_backtrack = 10; - for (backtrack_iter = 0; backtrack_iter < max_backtrack; backtrack_iter++) { - // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - Vector w_jm(accs.num_gaussians_); - w_jm.AddMatVec(1.0, Matrix(model->w_), kNoTrans, - v_jm, 0.0); - w_jm.Add(-w_jm.LogSumExp()); // it is now log w_jm - - exact_objf = VecVec(w_jm, accs.gamma_[j].Row(m)) - + VecVec(v_jm, accs.y_[j].Row(m)) - -0.5 * VecSpVec(v_jm, X_jm, v_jm); - - if (backtrack_iter == 0.0) { - exact_objf_start = exact_objf; - } else { - if (exact_objf >= exact_objf_start) { - break; // terminate backtracking. - } else { - KALDI_LOG << "Backtracking computation of v_jm for j = " << j - << " and m = " << m << " because objf changed by " - << (exact_objf-exact_objf_start) << " [vs. predicted:] " - << auxf_impr; - v_jm.AddVec(1.0, v_jm_orig); - v_jm.Scale(0.5); - } - } - - if (backtrack_iter == 0) { // computing updated value. - w_jm.ApplyExp(); // it is now w_jm - SpMatrix H_jm(X_jm); - Vector g_jm(accs.y_[j].Row(m)); - for (int32 i = 0; i < accs.num_gaussians_; i++) { - double gamma_jmi = accs.gamma_[j](m, i); - double quadratic_term = std::max(gamma_jmi, gamma_jm * w_jm(i)); - double scalar = gamma_jmi - gamma_jm * w_jm(i) + quadratic_term - * VecVec(model->w_.Row(i), model->v_[j].Row(m)); - g_jm.AddVec(scalar, model->w_.Row(i)); - if (quadratic_term > 1.0e-10) { - H_jm.AddVec2(static_cast(quadratic_term), model->w_.Row(i)); - } - } - SolverOptions opts; - opts.name = "v"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - auxf_impr = SolveQuadraticProblem(H_jm, g_jm, opts, &v_jm); - } - } - double objf_impr = exact_objf - exact_objf_start; - tot_count += gamma_jm; - tot_objf_impr += objf_impr; - tot_auxf_impr += auxf_impr; - if (backtrack_iter == max_backtrack) { - KALDI_WARN << "Backtracked " << max_backtrack << " times [not updating]"; - } else { - model->v_[j].Row(m).CopyFromVec(v_jm); - } - - if (j < 3 && m < 2) { - KALDI_LOG << "Objf impr for j = " << (j) << " m = " << (m) << " is " - << objf_impr << " vs. quadratic auxf impr (before backtrack) " - << auxf_impr; - } - } - } - - tot_objf_impr /= (tot_count + 1.0e-20); - tot_auxf_impr /= (tot_count + 1.0e-20); - KALDI_LOG << "**Overall objf impr for v is " << tot_objf_impr - << " (auxf impr before backtracking:) " << tot_auxf_impr - << " over " << tot_count << " frames"; - // Choosing to return actual likelihood impr here. - return tot_objf_impr; -} - - - -class UpdatePhoneVectorsCheckedFromClusterableClass: public MultiThreadable { // For multi-threaded. - public: - UpdatePhoneVectorsCheckedFromClusterableClass( - MleAmSgmmUpdater *updater, - const std::vector &stats, - const std::vector > &H, - AmSgmm *model, - double *count, - double *like_impr): - updater_(updater), stats_(stats), H_(H), model_(model), - count_ptr_(count), count_(0.0), - like_impr_ptr_(like_impr), like_impr_(0.0) - { } - - ~UpdatePhoneVectorsCheckedFromClusterableClass() { - *count_ptr_ += count_; - *like_impr_ptr_ += like_impr_; - } - - inline void operator() () { - // Note: give them local copy of the sums we're computing, - // which will be propagated to the total sums in the destructor. - updater_->UpdatePhoneVectorsCheckedFromClusterableInternal( - stats_, H_, model_, &count_, &like_impr_, num_threads_, thread_id_); - } - private: - MleAmSgmmUpdater *updater_; - const std::vector &stats_; - const std::vector > &H_; - AmSgmm *model_; - double *count_ptr_; - double count_; - double *like_impr_ptr_; - double like_impr_; -}; - - -double MleAmSgmmUpdater::UpdatePhoneVectorsCheckedFromClusterable( - const std::vector &stats, - const vector< SpMatrix > &H, - AmSgmm *model) { - KALDI_LOG << "Updating phone vectors using stats from Clusterable class " - "(and checking auxiliary function)"; - double count = 0.0, like_impr = 0.0; - - UpdatePhoneVectorsCheckedFromClusterableClass c(this, stats, H, model, - &count, &like_impr); - RunMultiThreaded(c); - - KALDI_LOG << "**Overall objf impr for v is " << (like_impr / count) - << " over " << count << " frames."; - - return like_impr / count; -} - - -void MleAmSgmmUpdater::UpdatePhoneVectorsCheckedFromClusterableInternal( - const std::vector &stats, - const vector< SpMatrix > &H, - AmSgmm *model, - double *count_ptr, - double *like_impr_ptr, - int32 num_threads, - int32 thread_id) { - - int32 block_size = (model->NumPdfs() + (num_threads-1)) / num_threads, - j_start = block_size * thread_id, - j_end = std::min(model->NumPdfs(), j_start + block_size); - - double tot_count = 0.0, tot_objf_impr = 0.0, tot_auxf_impr = 0.0; // sum over all states - - KALDI_ASSERT(model->NumPdfs() == static_cast(stats.size())); - int32 num_gauss = model->NumGauss(); - for (int32 j = j_start; j < j_end; j++) { - KALDI_ASSERT(model->NumSubstates(j) == 1 && - "This function only works if there is 1 substate per state."); - int32 m = 0; // sub-state index. - const Vector &gamma = stats[j]->gamma(); - const Vector &y = stats[j]->y(); - - double gamma_jm = gamma.Sum(); - SpMatrix X_jm(model->PhoneSpaceDim()); // = \sum_i \gamma_{jmi} H_i - - for (int32 i = 0; i < num_gauss; i++) { - double gamma_jmi = gamma(i); - if (gamma_jmi != 0.0) - X_jm.AddSp(gamma_jmi, H[i]); - } - - Vector v_jm_orig(model->v_[j].Row(m)), - v_jm(v_jm_orig); - - double exact_objf_start = 0.0, exact_objf = 0.0, auxf_impr = 0.0; - int32 backtrack_iter, max_backtrack = 10; - for (backtrack_iter = 0; backtrack_iter < max_backtrack; backtrack_iter++) { - // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - Vector w_jm(num_gauss); - w_jm.AddMatVec(1.0, Matrix(model->w_), kNoTrans, - v_jm, 0.0); - w_jm.Add(-w_jm.LogSumExp()); // it is now log w_jm - - exact_objf = VecVec(w_jm, gamma) - + VecVec(v_jm, y) - -0.5 * VecSpVec(v_jm, X_jm, v_jm); - - if (backtrack_iter == 0.0) { - exact_objf_start = exact_objf; - } else { - if (exact_objf >= exact_objf_start) { - break; // terminate backtracking. - } else { - KALDI_LOG << "Backtracking computation of v_jm for j = " << j - << " and m = " << m << " because objf changed by " - << (exact_objf-exact_objf_start) << " [vs. predicted:] " - << auxf_impr; - v_jm.AddVec(1.0, v_jm_orig); - v_jm.Scale(0.5); - } - } - - if (backtrack_iter == 0) { // computing updated value. - w_jm.ApplyExp(); // it is now w_jm - SpMatrix weight_2nd_deriv(model->PhoneSpaceDim()); // actually - // negatived 2nd derivative. - Vector num_deriv(model->PhoneSpaceDim()); - Vector den_deriv(model->PhoneSpaceDim()); - - // We modify the optimization to use the exact 2nd derivative. - // Because we do checking and backtracking, the loss of - // natural stability is OK. - for (int32 i = 0; i < num_gauss; i++) { - double gamma_jmi = gamma(i); - SubVector wi(model->w_, i); - num_deriv.AddVec(gamma_jmi, wi); - double scalar = gamma_jm * w_jm(i); // expected count. - den_deriv.AddVec(scalar, wi); - if (scalar > 1.0e-10) // if-statement is a speedup - weight_2nd_deriv.AddVec2(static_cast(scalar), wi); - } - Vector total_linear_term(y); - total_linear_term.AddVec(1.0, num_deriv); - total_linear_term.AddVec(-1.0, den_deriv); - if (gamma_jm > 0.0) - weight_2nd_deriv.AddVec2(-1.0/gamma_jm, den_deriv); - - total_linear_term.AddSpVec(1.0, weight_2nd_deriv, v_jm, 1.0); - // we want the derivatives around zero, not around the current point. - // Correction for this. - - SpMatrix total_quadratic_term(weight_2nd_deriv); - total_quadratic_term.AddSp(1.0, X_jm); - - SolverOptions opts; - opts.name = "v"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - auxf_impr = SolveQuadraticProblem(total_quadratic_term, - total_linear_term, opts, &v_jm); - } - } - double objf_impr = exact_objf - exact_objf_start; - tot_count += gamma_jm; - tot_objf_impr += objf_impr; - tot_auxf_impr += auxf_impr; - if (backtrack_iter == max_backtrack) { - KALDI_WARN << "Backtracked " << max_backtrack << " times [not updating]"; - } else { - model->v_[j].Row(m).CopyFromVec(v_jm); - } - if (j < 3) { - KALDI_LOG << "Objf impr for j = " << (j) << " m = " << (m) << " is " - << objf_impr << " vs. quadratic auxf impr (before backtrack) " - << auxf_impr; - } - } - - *like_impr_ptr = tot_objf_impr; - *count_ptr = tot_count; - - tot_objf_impr /= (tot_count + 1.0e-20); - tot_auxf_impr /= (tot_count + 1.0e-20); - - if (j_start == 0) - KALDI_LOG << "**For first batch: objf impr for v is " << tot_objf_impr - << " (auxf impr before backtracking:) " << tot_auxf_impr - << " over " << tot_count << " frames"; -} - - -void MleAmSgmmUpdater::RenormalizeV(const MleAmSgmmAccs &accs, - AmSgmm *model, - const SpMatrix &H_sm) { - SpMatrix Sigma(accs.phn_space_dim_); - int32 count = 0; - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - count++; - Sigma.AddVec2(static_cast(1.0), model->v_[j].Row(m)); - } - } - Sigma.Scale(1.0 / count); - int32 fixed_eigs = Sigma.LimitCondDouble(update_options_.max_cond); - if (fixed_eigs != 0) { - KALDI_WARN << "Scatter of vectors v is poorly conditioned. Fixed up " - << fixed_eigs << " eigenvalues."; - } - KALDI_LOG << "Eigenvalues of scatter of vectors v is : "; - Sigma.PrintEigs("Sigma"); - if (!Sigma.IsPosDef()) { - KALDI_LOG << "Not renormalizing v because scatter is not positive definite" - << " -- maybe first iter?"; - return; - } - - // Want to make variance of v unit and H_sm (like precision matrix) diagonal. - TpMatrix L(accs.phn_space_dim_); - L.Cholesky(Sigma); - TpMatrix LInv(L); - LInv.Invert(); - - Matrix tmpL(accs.phn_space_dim_, accs.phn_space_dim_); - tmpL.CopyFromTp(L); - - SpMatrix H_sm_proj(accs.phn_space_dim_); - H_sm_proj.AddMat2Sp(1.0, tmpL, kTrans, H_sm, 0.0); - // H_sm_proj := L^{T} * H_sm * L. - // This is right because we would transform the vectors themselves - // by L^{-1}, and H_sm is like the inverse of the vectors, - // so it's {L^{-1}}^{-T} = L^T. - - Matrix U(accs.phn_space_dim_, accs.phn_space_dim_); - Vector eigs(accs.phn_space_dim_); - H_sm_proj.SymPosSemiDefEig(&eigs, &U, 1.0); // 1.0 means no checking +ve def -> faster - KALDI_LOG << "Note on the next diagnostic: the first number is generally not " - << "that meaningful as it relates to the static offset"; - H_sm_proj.PrintEigs("H_sm_proj (Significance of dims in vector space.. note)"); - - // Transform on vectors is U^T L^{-1}. - // Why? Because transform on H_sm is T =U^T L^T - // and we want T^{-T} by normal rules of vector/covector and we - // have (U^T L^T)^{-T} = (L U)^{-1} = U^T L^{-1}. - Matrix Trans(accs.phn_space_dim_, accs.phn_space_dim_); // T^{-T} - Matrix tmpLInv(accs.phn_space_dim_, accs.phn_space_dim_); - tmpLInv.CopyFromTp(LInv); - Trans.AddMatMat(1.0, U, kTrans, tmpLInv, kNoTrans, 0.0); - Matrix TransInv(Trans); - TransInv.Invert(); // T in above... - -#ifdef KALDI_PARANOID - { - SpMatrix H_sm_tmp(accs.phn_space_dim_); - H_sm_tmp.AddMat2Sp(1.0, TransInv, kTrans, H_sm, 0.0); - KALDI_ASSERT(H_sm_tmp.IsDiagonal(0.1)); - } - { - SpMatrix Sigma_tmp(accs.phn_space_dim_); - Sigma_tmp.AddMat2Sp(1.0, Trans, kNoTrans, Sigma, 0.0); - KALDI_ASSERT(Sigma_tmp.IsUnit(0.1)); - } -#endif - - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - Vector tmp(accs.phn_space_dim_); - tmp.AddMatVec(1.0, Trans, kNoTrans, Vector(model->v_[j].Row(m)), 0.0); - model->v_[j].Row(m).CopyFromVec(tmp); - } - } - for (int32 i = 0; i < accs.num_gaussians_; i++) { - Vector tmp(accs.phn_space_dim_); - tmp.AddMatVec(1.0, TransInv, kTrans, Vector(model->w_.Row(i)), 0.0); - model->w_.Row(i).CopyFromVec(tmp); - - Matrix tmpM(accs.feature_dim_, accs.phn_space_dim_); - // Multiplying on right not left so must not transpose TransInv. - tmpM.AddMatMat(1.0, Matrix(model->M_[i]), kNoTrans, - TransInv, kNoTrans, 0.0); - model->M_[i].CopyFromMat(tmpM); - } - KALDI_LOG << "Renormalized subspace."; -} - -double MleAmSgmmUpdater::UpdateM(const MleAmSgmmAccs &accs, - AmSgmm *model) { - double tot_count = 0.0, tot_like_impr = 0.0; - for (int32 i = 0; i < accs.num_gaussians_; i++) { - double gamma_i = 0.0; - for (int32 j = 0; j < accs.num_states_; j++) - for (int32 m = 0; m < model->NumSubstates(j); m++) - gamma_i += accs.gamma_[j](m, i); - - if (gamma_i < accs.feature_dim_) { - KALDI_WARN << "For component " << i << ": not updating M due to very " - << "small count (=" << gamma_i << ")."; - continue; - } - - SolverOptions opts; - opts.name = "M"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - Matrix Mi(model->M_[i]); - double impr = SolveQuadraticMatrixProblem(Q_[i], accs.Y_[i], - SpMatrix(model->SigmaInv_[i]), - opts, &Mi); - model->M_[i].CopyFromMat(Mi); - - if (i < 10) { - KALDI_VLOG(2) << "Objf impr for projection M for i = " << i << ", is " - << (impr/(gamma_i + 1.0e-20)) << " over " << gamma_i - << " frames"; - } - tot_count += gamma_i; - tot_like_impr += impr; - } - tot_like_impr /= (tot_count + 1.0e-20); - KALDI_LOG << "Overall objective function improvement for model projections " - << "M is " << tot_like_impr << " over " << tot_count << " frames"; - return tot_like_impr; -} - -// Estimate the parameters of a Gaussian prior over the M matrices. There are -// as many mean matrices as UBM size and two covariance matrices for the rows -// of M and columns of M. The prior means M_i are fixed to the unadapted values. -// This is what was done in Lu, et al. "Maximum a posteriori adaptation of -// subspace Gaussian mixture models for cross-lingual speech recognition", -// ICASSP 2012. -void MleAmSgmmUpdater::ComputeMPrior(AmSgmm *model) { - KALDI_ASSERT(update_options_.map_M_prior_iters > 0); - int32 Ddim = model->FeatureDim(); - int32 Sdim = model->PhoneSpaceDim(); - int32 nGaussians = model->NumGauss(); - - // inverse variance of the columns of M: dim is # of rows - model->col_cov_inv_.Resize(Ddim); - // inverse covariance of the rows of M: dim is # of columns - model->row_cov_inv_.Resize(Sdim); - - model->col_cov_inv_.SetUnit(); - model->row_cov_inv_.SetUnit(); - - if (model->M_prior_.size() == 0) { - model->M_prior_.resize(nGaussians); - for (int32 i = 0; i < nGaussians; i++) { - model->M_prior_[i].Resize(Ddim, Sdim); - model->M_prior_[i].CopyFromMat(model->M_[i]); // We initialize Mpri as this - } - } - - if (update_options_.full_col_cov || update_options_.full_row_cov) { - Matrix avg_M(Ddim, Sdim); // average of the Gaussian prior means - for (int32 i = 0; i < nGaussians; i++) - avg_M.AddMat(1.0, Matrix(model->M_prior_[i])); - avg_M.Scale(1.0 / nGaussians); - - Matrix MDiff(Ddim, Sdim); - for (int32 iter = 0; iter < update_options_.map_M_prior_iters; iter++) { - { // diagnostic block. - double prior_like = -0.5 * nGaussians * (Ddim * Sdim * Log(2 * M_PI) - + Sdim * (-model->row_cov_inv_.LogPosDefDet()) - + Ddim * (-model->col_cov_inv_.LogPosDefDet())); - for (int32 i = 0; i < nGaussians; i++) { - MDiff.CopyFromMat(Matrix(model->M_prior_[i])); - MDiff.AddMat(-1.0, avg_M); // MDiff = M_{i} - avg(M) - SpMatrix tmp(Ddim); - // tmp = MDiff.Omega_r^{-1}*MDiff^T. - tmp.AddMat2Sp(1.0, MDiff, kNoTrans, - SpMatrix(model->row_cov_inv_), 0.0); - prior_like -= 0.5 * TraceSpSp(tmp, SpMatrix(model->col_cov_inv_)); - } - KALDI_LOG << "Before iteration " << iter - << " of updating prior over M, log like per dimension modeled is " - << prior_like / (nGaussians * Ddim * Sdim); - } - - // First estimate the column covariances (\Omega_r in paper) - if (update_options_.full_col_cov) { - size_t limited; - model->col_cov_inv_.SetZero(); - for (int32 i = 0; i < nGaussians; i++) { - MDiff.CopyFromMat(Matrix(model->M_prior_[i])); - MDiff.AddMat(-1.0, avg_M); // MDiff = M_{i} - avg(M) - // Omega_r += 1/(D*I) * Mdiff * Omega_c^{-1} * Mdiff^T - model->col_cov_inv_.AddMat2Sp(1.0 / (Ddim * nGaussians), - Matrix(MDiff), kNoTrans, - model->row_cov_inv_, 1.0); - } - model->col_cov_inv_.PrintEigs("col_cov"); - limited = model->col_cov_inv_.LimitCond(update_options_.max_cond, - true /*invert the matrix*/); - if (limited != 0) { - KALDI_LOG << "Computing column covariances for M: limited " << limited - << " singular values, max condition is " - << update_options_.max_cond; - } - } - - // Now estimate the row covariances (\Omega_c in paper) - if (update_options_.full_row_cov) { - size_t limited; - model->row_cov_inv_.SetZero(); - for (int32 i = 0; i < nGaussians; i++) { - MDiff.CopyFromMat(Matrix(model->M_prior_[i])); - MDiff.AddMat(-1.0, avg_M); // MDiff = M_{i} - avg(M) - // Omega_c += 1/(S*I) * Mdiff^T * Omega_r^{-1} * Mdiff. - model->row_cov_inv_.AddMat2Sp(1.0 / (Sdim * nGaussians), - Matrix(MDiff), kTrans, - model->col_cov_inv_, 1.0); - } - model->row_cov_inv_.PrintEigs("row_cov"); - limited = model->row_cov_inv_.LimitCond(update_options_.max_cond, - true /*invert the matrix*/); - if (limited != 0) { - KALDI_LOG << "Computing row covariances for M: limited " << limited - << " singular values, max condition is " - << update_options_.max_cond; - } - } - } // end iterations - } -} - - -// MAP adaptation of M with a matrix-variate Gaussian prior -double MleAmSgmmUpdater::MapUpdateM(const MleAmSgmmAccs &accs, AmSgmm *model) { - int32 Ddim = model->FeatureDim(); - int32 Sdim = model->PhoneSpaceDim(); - int32 nGaussians = model->NumGauss(); - - KALDI_LOG << "Prior smoothing parameter: Tau = " << update_options_.tau_map_M; - if (model->M_prior_.size() == 0 || model->col_cov_inv_.NumRows() == 0 - || model->row_cov_inv_.NumRows() == 0) { - KALDI_LOG << "Computing the prior first"; - ComputeMPrior(model); - } - - Matrix G(Ddim, Sdim); - // \tau \Omega_c^{-1} avg(M) \Omega_r^{-1}, depends on Gaussian index - Matrix prior_term_i(Ddim, Sdim); - SpMatrix P2(model->col_cov_inv_); - SpMatrix Q2(model->row_cov_inv_); - Q2.Scale(update_options_.tau_map_M); - - double totcount = 0.0, tot_like_impr = 0.0; - for (int32 i = 0; i < nGaussians; ++i) { - double gamma_i = 0.0; - for (int32 j = 0; j < accs.num_states_; ++j) - for (int32 m = 0; m < model->NumSubstates(j); ++m) - gamma_i += accs.gamma_[j](m, i); - - if (gamma_i < accs.feature_dim_) { - KALDI_WARN << "For component " << i << ": not updating M due to very " - << "small count (=" << gamma_i << ")."; - continue; - } - - Matrix tmp(Ddim, Sdim, kSetZero); - tmp.AddSpMat(1.0, SpMatrix(model->col_cov_inv_), - Matrix(model->M_prior_[i]), kNoTrans, 0.0); - prior_term_i.AddMatSp(update_options_.tau_map_M, tmp, kNoTrans, - SpMatrix(model->row_cov_inv_), 0.0); - - Matrix SigmaY(Ddim, Sdim, kSetZero); - SigmaY.AddSpMat(1.0, SpMatrix(model->SigmaInv_[i]), accs.Y_[i], - kNoTrans, 0.0); - G.CopyFromMat(SigmaY); // G = \Sigma_{i}^{-1} Y_{i} - G.AddMat(1.0, prior_term_i); // G += \tau \Omega_c^{-1} avg(M) \Omega_r^{-1} - SpMatrix P1(model->SigmaInv_[i]); - Matrix Mi(model->M_[i]); - - SolverOptions opts; - opts.name = "M"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - double impr = SolveDoubleQuadraticMatrixProblem(G, P1, P2, Q_[i], Q2, opts, &Mi); - model->M_[i].CopyFromMat(Mi); - if (i < 10) { - KALDI_LOG << "Objf impr for projection M for i = " << i << ", is " - << (impr / (gamma_i + 1.0e-20)) << " over " << gamma_i - << " frames"; - } - totcount += gamma_i; - tot_like_impr += impr; - } - tot_like_impr /= (totcount + 1.0e-20); - KALDI_LOG << "Overall objective function improvement for model projections " - << "M is " << tot_like_impr << " over " << totcount << " frames"; - return tot_like_impr; -} - - -/// This function gets stats used inside UpdateWParallel, where it accumulates -/// the F_i and g_i quantities. Note: F_i is viewed as a vector of SpMatrix -/// (one for each i); each row of F_i is viewed as an SpMatrix even though -/// it's stored as a vector.... -/// Note: w is just a double-precision copy of the matrix model->w_ - -// static -void MleAmSgmmUpdater::UpdateWParallelGetStats(const MleAmSgmmAccs &accs, - const AmSgmm &model, - const Matrix &w, - Matrix *F_i, - Matrix *g_i, - double *tot_like, - int32 num_threads, - int32 thread_id) { - - // Accumulate stats from a block of states (this gets called in parallel). - int32 block_size = (accs.num_states_ + (num_threads-1)) / num_threads, - j_start = block_size * thread_id, - j_end = std::min(accs.num_states_, j_start + block_size); - - // Unlike in the report the inner most loop is over Gaussians, where - // per-gaussian statistics are accumulated. This is more memory demanding - // but more computationally efficient, as outer product v_{jvm} v_{jvm}^T - // is computed only once for all gaussians. - - SpMatrix v_vT(accs.phn_space_dim_); - - for (int32 j = j_start; j < j_end; j++) { - int32 num_substates = model.NumSubstates(j); - Matrix w_jm(num_substates, accs.num_gaussians_); - // The linear term and quadratic term for each Gaussian-- two scalars - // for each Gaussian, they appear in the accumulation formulas. - Matrix linear_term(num_substates, accs.num_gaussians_); - Matrix quadratic_term(num_substates, accs.num_gaussians_); - Matrix v_vT_m(num_substates, - (accs.phn_space_dim_*(accs.phn_space_dim_+1))/2); - - // w_jm = softmax([w_{k1}^T ... w_{kD}^T] * v_{jkm}) eq.(7) - Matrix v_j_double(model.v_[j]); - w_jm.AddMatMat(1.0, v_j_double, kNoTrans, w, kTrans, 0.0); - - for (int32 m = 0; m < model.NumSubstates(j); m++) { - double gamma_jm = accs.gamma_[j].Row(m).Sum(); - - w_jm.Row(m).Add(-1.0 * w_jm.Row(m).LogSumExp()); - *tot_like += VecVec(w_jm.Row(m), accs.gamma_[j].Row(m)); - w_jm.Row(m).ApplyExp(); - v_vT.SetZero(); - // v_vT := v_{jkm} v_{jkm}^T - v_vT.AddVec2(static_cast(1.0), v_j_double.Row(m)); - v_vT_m.Row(m).CopyFromPacked(v_vT); // a bit wasteful, but does not dominate. - - for (int32 i = 0; i < accs.num_gaussians_; i++) { - // Suggestion: g_jkm can be computed more efficiently - // using the Vector/Matrix routines for all i at once - // linear term around cur value. - linear_term(m, i) = accs.gamma_[j](m, i) - gamma_jm * w_jm(m, i); - quadratic_term(m, i) = std::max(accs.gamma_[j](m, i), - gamma_jm * w_jm(m, i)); - } - } // loop over substates - g_i->AddMatMat(1.0, linear_term, kTrans, v_j_double, kNoTrans, 1.0); - F_i->AddMatMat(1.0, quadratic_term, kTrans, v_vT_m, kNoTrans, 1.0); - } // loop over states -} - -// The parallel weight update, in the paper. -double MleAmSgmmUpdater::UpdateWParallel(const MleAmSgmmAccs &accs, - AmSgmm *model) { - KALDI_LOG << "Updating weight projections"; - - // tot_like_{after, before} are totals over multiple iterations, - // not valid likelihoods. but difference is valid (when divided by tot_count). - double tot_predicted_like_impr = 0.0, tot_like_before = 0.0, - tot_like_after = 0.0; - - Matrix g_i(accs.num_gaussians_, accs.phn_space_dim_); - // View F_i as a vector of SpMatrix. - Matrix F_i(accs.num_gaussians_, - (accs.phn_space_dim_*(accs.phn_space_dim_+1))/2); - - Matrix w(model->w_); - double tot_count = 0.0; - for (int32 j = 0; j < accs.num_states_; j++) tot_count += accs.gamma_[j].Sum(); - - for (int iter = 0; iter < update_options_.weight_projections_iters; iter++) { - F_i.SetZero(); - g_i.SetZero(); - double k_like_before = 0.0; - - UpdateWParallelClass c(accs, *model, w, &F_i, &g_i, &k_like_before); - RunMultiThreaded(c); - - Matrix w_orig(w); - double k_predicted_like_impr = 0.0, k_like_after = 0.0; - double min_step = 0.001, step_size; - for (step_size = 1.0; step_size >= min_step; step_size /= 2) { - k_predicted_like_impr = 0.0; - k_like_after = 0.0; - - SolverOptions opts; - opts.name = "w"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - for (int32 i = 0; i < accs.num_gaussians_; i++) { - // auxf is formulated in terms of change in w. - Vector delta_w(accs.phn_space_dim_); - // returns objf impr with step_size = 1, - // but it may not be 1 so we recalculate it. - SpMatrix this_F_i(accs.phn_space_dim_); - this_F_i.CopyFromVec(F_i.Row(i)); - SolveQuadraticProblem(this_F_i, g_i.Row(i), opts, &delta_w); - - delta_w.Scale(step_size); - double predicted_impr = VecVec(delta_w, g_i.Row(i)) - - 0.5 * VecSpVec(delta_w, this_F_i, delta_w); - - // should never be negative because - // we checked inside SolveQuadraticProblem. - KALDI_ASSERT(predicted_impr >= -1.0e-05); - - if (i < 10) { - KALDI_LOG << "Predicted objf impr for w (not per frame), iter = " << - (iter) << ", i = " << (i) << " is " << (predicted_impr); - } - k_predicted_like_impr += predicted_impr; - w.Row(i).AddVec(1.0, delta_w); - } - Vector w_jm_vec(accs.num_gaussians_); - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - w_jm_vec.AddMatVec(1.0, w, kNoTrans, Vector(model->v_[j].Row(m)), 0.0); - w_jm_vec.Add((-1.0) * w_jm_vec.LogSumExp()); - k_like_after += VecVec(w_jm_vec, accs.gamma_[j].Row(m)); - } - } - KALDI_VLOG(2) << "For iteration " << (iter) << ", updating w gives " - << "predicted per-frame like impr " - << (k_predicted_like_impr / tot_count) << ", actual " - << ((k_like_after - k_like_before) / tot_count) << ", over " - << (tot_count) << " frames"; - if (k_like_after < k_like_before) { - w.CopyFromMat(w_orig); // Undo what we computed. - if (fabs(k_like_after - k_like_before) / tot_count < 1.0e-05) { - k_like_after = k_like_before; - KALDI_WARN << "Not updating weights as not increasing auxf and " - << "probably due to numerical issues (since small change)."; - break; - } else { - KALDI_WARN << "Halving step size for weights as likelihood did " - << "not increase"; - } - } else { - break; - } - } - if (step_size < min_step) { - // Undo any step as we have no confidence that this is right. - w.CopyFromMat(w_orig); - } else { - tot_predicted_like_impr += k_predicted_like_impr; - tot_like_after += k_like_after; - tot_like_before += k_like_before; - } - } - - model->w_.CopyFromMat(w); - - tot_predicted_like_impr /= tot_count; - tot_like_after = (tot_like_after - tot_like_before) / tot_count; - KALDI_LOG << "**Overall objf impr for w is " << tot_predicted_like_impr - << ", actual " << tot_like_after << ", over " - << tot_count << " frames"; - return tot_like_after; -} - -double MleAmSgmmUpdater::UpdateWSequential( - const MleAmSgmmAccs &accs, AmSgmm *model) { - // Sequential version, in paper. - /* This is the approach for the weight projections that - * I originally implemented, in which we test the auxiliary function - improvement for each i that we update. This requires some - careful bookkeeping. It means that we need to store the - total of the un-normalized weights for each j, m. */ - - KALDI_LOG << "Updating weight projections [original approach, checking each" - << "Gaussian component]."; - - SpMatrix v_vT(accs.phn_space_dim_); - // tot_like_{after, before} are totals over multiple iterations, - // not valid likelihoods... - // but difference is valid (when divided by tot_count). - double tot_delta_predicted = 0.0, tot_delta_observed = 0.0, - tot_count = 0.0; - - Vector w_jm(accs.num_gaussians_); - Vector g_i(accs.phn_space_dim_); - SpMatrix F_i(accs.phn_space_dim_); - - double k_count = 0.0; - // Total count in each substate. - std::vector< Vector > gamma_jm(accs.num_states_); - for (int32 j = 0; j < accs.num_states_; j++) { // Initialize gamma_jm - gamma_jm[j].Resize(model->NumSubstates(j)); - for (int32 m = 0; m < model->NumSubstates(j); m++) { - k_count += (gamma_jm[j](m) = accs.gamma_[j].Row(m).Sum()); - } - } - - Matrix w(model->w_); - - for (int iter = 0; iter < update_options_.weight_projections_iters; iter++) { - double k_delta_predicted = 0.0, k_delta_observed = 0.0; - - // log total of un-normalized weights for each j, m - std::vector< Vector > weight_tots(accs.num_states_); - - // Initialize weight_tots - for (int32 j = 0; j < accs.num_states_; j++) { - weight_tots[j].Resize(model->NumSubstates(j)); - for (int32 m = 0; m < model->NumSubstates(j); m++) { - w_jm.AddMatVec(1.0, w, kNoTrans, Vector(model->v_[j].Row(m)), 0.0); - weight_tots[j](m) = w_jm.LogSumExp(); - } - } - - for (int32 i = 0; i < accs.num_gaussians_; i++) { - F_i.SetZero(); - g_i.SetZero(); - SubVector w_i = w.Row(i); - - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - double this_unnormalized_weight = VecVec(w_i, model->v_[j].Row(m)); - double normalizer = weight_tots[j](m); - double this_log_w = this_unnormalized_weight - normalizer, - this_w = Exp(this_log_w), - substate_count = gamma_jm[j](m), - this_count = accs.gamma_[j](m, i); - - double linear_term = this_count - substate_count * this_w; - double quadratic_term = std::max(this_count, substate_count * this_w); - - g_i.AddVec(linear_term, model->v_[j].Row(m)); - // should not ever be zero, but check anyway. - if (quadratic_term != 0.0) - F_i.AddVec2(static_cast(quadratic_term), model->v_[j].Row(m)); - } - } - - SolverOptions opts; - opts.name = "w"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - // auxf is formulated in terms of change in w. - Vector delta_w(accs.phn_space_dim_); - // returns objf impr with step_size = 1, - // but it may not be 1 so we recalculate it. - SolveQuadraticProblem(F_i, - g_i, - opts, - &delta_w); - - try { // In case we have a problem in LogSub. - double step_size, min_step = 0.0001; - for (step_size = 1.0; step_size >= min_step; step_size /= 2) { - Vector new_w_i(w_i); - // copy it in case we do not commit this change. - std::vector > new_weight_tots(weight_tots); - new_w_i.AddVec(step_size, delta_w); - double predicted_impr = step_size * VecVec(delta_w, g_i) - - 0.5 * step_size * step_size * VecSpVec(delta_w, F_i, delta_w); - if (predicted_impr < -0.1) { - KALDI_WARN << "Negative predicted auxf improvement " << - (predicted_impr) << ", not updating this gaussian " << - "(either numerical problems or a code mistake."; - break; - } - // Now compute observed objf change. - double observed_impr = 0.0, this_tot_count = 0.0; - - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - double old_unnorm_weight = VecVec(w_i, model->v_[j].Row(m)), - new_unnorm_weight = VecVec(new_w_i, model->v_[j].Row(m)), - substate_count = gamma_jm[j](m), - this_count = accs.gamma_[j](m, i); - this_tot_count += this_count; - observed_impr += this_count * // from numerator. - (new_unnorm_weight - old_unnorm_weight); - double old_normalizer = new_weight_tots[j](m), delta; - if (new_unnorm_weight > old_unnorm_weight) { - delta = LogAdd(0, LogSub(new_unnorm_weight - old_normalizer, - old_unnorm_weight - old_normalizer)); - } else { - delta = LogSub(0, LogSub(old_unnorm_weight - old_normalizer, - new_unnorm_weight - old_normalizer)); - // The if-statement above is equivalent to: - // delta = LogAdd(LogSub(0, - // old_unnorm_weight-old_normalizer), - // new_unnorm_weight-old_normalizer) - // but has better behaviour numerically. - } - observed_impr -= substate_count * delta; - new_weight_tots[j](m) += delta; - } - } - if (observed_impr < 0.0) { // failed, so we reduce step size. - KALDI_LOG << "Updating weights, for i = " << (i) << ", predicted " - "auxf: " << (predicted_impr/(this_tot_count + 1.0e-20)) - << ", observed " << observed_impr/(this_tot_count + 1.0e-20) - << " over " << this_tot_count << " frames. Reducing step size " - << "to " << (step_size/2); - if (predicted_impr / (this_tot_count + 1.0e-20) < 1.0e-07) { - KALDI_WARN << "Not updating this weight vector as auxf decreased" - << " probably due to numerical issues (since small change)."; - break; - } - } else { - if (i < 10) - KALDI_LOG << "Updating weights, for i = " << (i) - << ", auxf change per frame is" << ": predicted " << - (predicted_impr /(this_tot_count + 1.0e-20)) << ", observed " - << (observed_impr / (this_tot_count + 1.0e-20)) - << " over " << (this_tot_count) << " frames."; - - k_delta_predicted += predicted_impr; - k_delta_observed += observed_impr; - w.Row(i).CopyFromVec(new_w_i); - weight_tots = new_weight_tots; // Copy over normalizers. - break; - } - } - } catch(...) { - KALDI_LOG << "Warning: weight update for i = " << i - << " failed, possible numerical problem."; - } - } - KALDI_LOG << "For iteration " << iter << ", updating w gives predicted " - << "per-frame like impr " << (k_delta_predicted / k_count) << - ", observed " << (k_delta_observed / k_count) << ", over " << (k_count) - << " frames"; - if (iter == 0) tot_count += k_count; - tot_delta_predicted += k_delta_predicted; - tot_delta_observed += k_delta_observed; - } - - model->w_.CopyFromMat(w); - - tot_delta_observed /= tot_count; - tot_delta_predicted /= tot_count; - KALDI_LOG << "**Overall objf impr for w is " << tot_delta_predicted - << ", observed " << tot_delta_observed << ", over " - << tot_count << " frames"; - return tot_delta_observed; -} - -double MleAmSgmmUpdater::UpdateN(const MleAmSgmmAccs &accs, - AmSgmm *model) { - double tot_count = 0.0, tot_like_impr = 0.0; - if (accs.spk_space_dim_ == 0 || accs.R_.size() == 0 || accs.Z_.size() == 0) { - KALDI_ERR << "Speaker subspace dim is zero or no stats accumulated"; - } - - Vector gamma_i(accs.num_gaussians_); - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - gamma_i.AddVec(1.0, accs.gamma_[j].Row(m)); - } - } - - SolverOptions opts; - opts.name = "N"; - opts.K = update_options_.max_cond; - opts.eps = update_options_.epsilon; - - for (int32 i = 0; i < accs.num_gaussians_; i++) { - if (gamma_i(i) < 2 * accs.spk_space_dim_) { - KALDI_WARN << "Not updating speaker basis for i = " << (i) - << " because count is too small " << (gamma_i(i)); - continue; - } - Matrix Ni(model->N_[i]); - double impr = - SolveQuadraticMatrixProblem(accs.R_[i], accs.Z_[i], - SpMatrix(model->SigmaInv_[i]), - opts, &Ni); - model->N_[i].CopyFromMat(Ni); - if (i < 10) { - KALDI_LOG << "Objf impr for spk projection N for i = " << (i) - << ", is " << (impr / (gamma_i(i) + 1.0e-20)) << " over " - << (gamma_i(i)) << " frames"; - } - tot_count += gamma_i(i); - tot_like_impr += impr; - } - - tot_like_impr /= (tot_count+1.0e-20); - KALDI_LOG << "**Overall objf impr for N is " << tot_like_impr << " over " - << tot_count << " frames"; - return tot_like_impr; -} - -void MleAmSgmmUpdater::RenormalizeN( - const MleAmSgmmAccs &accs, AmSgmm *model) { - KALDI_ASSERT(accs.R_.size() != 0); - Vector gamma_i(accs.num_gaussians_); - for (int32 j = 0; j < accs.num_states_; j++) { - for (int32 m = 0; m < model->NumSubstates(j); m++) { - gamma_i.AddVec(1.0, accs.gamma_[j].Row(m)); - } - } - double tot_count = gamma_i.Sum(); - if (tot_count == 0) { - KALDI_WARN << "Not renormalizing N, since there are no counts."; - return; - } - - SpMatrix RTot(accs.spk_space_dim_); - // for (int32 i = 0; i < accs.num_gaussians_; i++) { - // RTot.AddSp(1.0, accs.R_[i]); - // } - for (int32 i = 0; i < accs.num_gaussians_; i++) { - RTot.AddSp(gamma_i(i), accs.R_[i]); - } - RTot.Scale(1.0 / tot_count); - Matrix U(accs.spk_space_dim_, accs.spk_space_dim_); - Vector eigs(accs.spk_space_dim_); - RTot.SymPosSemiDefEig(&eigs, &U); - KALDI_LOG << "Renormalizing N, eigs are: " << (eigs); - Vector sqrteigs(accs.spk_space_dim_); - for (int32 t = 0; t < accs.spk_space_dim_; t++) { - sqrteigs(t) = sqrt(eigs(t)); - } - // e.g. diag(eigs)^{-0.5} * U' * RTot * U * diag(eigs)^{-0.5} = 1 - // But inverse transpose of this transformation needs to take place on R, - // i.e. not (on left: diag(eigs)^{-0.5} * U') - // but: (inverse it: U . diag(eigs)^{0.5}, - // transpose it: diag(eigs)^{0.5} U^T. Need to do this on the right to N - // (because N has the spk vecs on the right), so N := N U diag(eigs)^{0.5} - U.MulColsVec(sqrteigs); - Matrix Ntmp(accs.feature_dim_, accs.spk_space_dim_); - for (int32 i = 0; i < accs.num_gaussians_; i++) { - Ntmp.AddMatMat(1.0, Matrix(model->N_[i]), kNoTrans, U, kNoTrans, 0.0); - model->N_[i].CopyFromMat(Ntmp); - } -} - - -double MleAmSgmmUpdater::UpdateVars(const MleAmSgmmAccs &accs, - AmSgmm *model) { - KALDI_ASSERT(S_means_.size() == static_cast(accs.num_gaussians_) && - "Must call PreComputeStats before updating the covariances."); - SpMatrix Sigma_i(accs.feature_dim_), Sigma_i_ml(accs.feature_dim_); - double tot_objf_impr = 0.0, tot_t = 0.0; - SpMatrix covfloor(accs.feature_dim_); - Vector gamma_vec(accs.num_gaussians_); - Vector objf_improv(accs.num_gaussians_); - - // First pass over all (shared) Gaussian components to calculate the - // ML estimate of the covariances, and the total covariance for flooring. - for (int32 i = 0; i < accs.num_gaussians_; i++) { - double gamma_i = 0; - for (int32 j = 0; j < accs.num_states_; j++) - for (int32 m = 0, end = model->NumSubstates(j); m < end; m++) - gamma_i += accs.gamma_[j](m, i); - - // Eq. (75): Sigma_{i}^{ml} = 1/gamma_{i} [S_{i} + S_{i}^{(means)} - ... - // Y_{i} M_{i}^T - M_{i} Y_{i}^T] - // Note the S_means_ already contains the Y_{i} M_{i}^T terms. - Sigma_i_ml.CopyFromSp(S_means_[i]); - Sigma_i_ml.AddSp(1.0, accs.S_[i]); - - gamma_vec(i) = gamma_i; - covfloor.AddSp(1.0, Sigma_i_ml); - // inverting small values e.g. 4.41745328e-40 seems to generate inf, - // although would be fixed up later. - if (gamma_i > 1.0e-20) { - Sigma_i_ml.Scale(1 / (gamma_i + 1.0e-20)); - } else { - Sigma_i_ml.SetUnit(); - } - KALDI_ASSERT(1.0 / Sigma_i_ml(0, 0) != 0.0); - // Eq. (76): Compute the objective function with the old parameter values - objf_improv(i) = model->SigmaInv_[i].LogPosDefDet() - - TraceSpSp(SpMatrix(model->SigmaInv_[i]), Sigma_i_ml); - - model->SigmaInv_[i].CopyFromSp(Sigma_i_ml); // inverted in the next loop. - } - - // Compute the covariance floor. - if (gamma_vec.Sum() == 0) { // If no count, use identity. - KALDI_WARN << "Updating variances: zero counts. Setting floor to unit."; - covfloor.SetUnit(); - } else { // else, use the global average covariance. - covfloor.Scale(update_options_.cov_floor / gamma_vec.Sum()); - int32 tmp; - if ((tmp = covfloor.LimitCondDouble(update_options_.max_cond)) != 0) { - KALDI_WARN << "Covariance flooring matrix is poorly conditioned. Fixed " - << "up " << (tmp) << " eigenvalues."; - } - } - - if (update_options_.cov_diag_ratio > 1000) { - KALDI_LOG << "Assuming you want to build a diagonal system since " - << "cov_diag_ratio is large: making diagonal covFloor."; - for (int32 i = 0; i < covfloor.NumRows(); i++) - for (int32 j = 0; j < i; j++) - covfloor(i, j) = 0.0; - } - - // Second pass over all (shared) Gaussian components to calculate the - // floored estimate of the covariances, and update the model. - for (int32 i = 0; i < accs.num_gaussians_; i++) { - Sigma_i.CopyFromSp(model->SigmaInv_[i]); - Sigma_i_ml.CopyFromSp(Sigma_i); - // In case of insufficient counts, make the covariance matrix diagonal. - // cov_diag_ratio is 2 by default, set to very large to always get diag-cov - if (gamma_vec(i) < update_options_.cov_diag_ratio * accs.feature_dim_) { - KALDI_WARN << "For Gaussian component " << i << ": Too low count " - << gamma_vec(i) << " for covariance matrix estimation. Setting to " - << "diagonal"; - for (int32 d = 0; d < accs.feature_dim_; d++) - for (int32 e = 0; e < d; e++) - Sigma_i(d, e) = 0.0; // SpMatrix, can only set lower traingular part - - int floored = Sigma_i.ApplyFloor(covfloor); - if (floored > 0) { - KALDI_WARN << "For Gaussian component " << i << ": Floored " << floored - << " covariance eigenvalues."; - } - model->SigmaInv_[i].CopyFromSp(Sigma_i); - model->SigmaInv_[i].InvertDouble(); - } else { // Updating the full covariance matrix. - try { - int floored = Sigma_i.ApplyFloor(covfloor); - if (floored > 0) { - KALDI_WARN << "For Gaussian component " << i << ": Floored " - << floored << " covariance eigenvalues."; - } - model->SigmaInv_[i].CopyFromSp(Sigma_i); - model->SigmaInv_[i].InvertDouble(); - - objf_improv(i) += Sigma_i.LogPosDefDet() + - TraceSpSp(SpMatrix(model->SigmaInv_[i]), Sigma_i_ml); - objf_improv(i) *= (-0.5 * gamma_vec(i)); // Eq. (76) - - tot_objf_impr += objf_improv(i); - tot_t += gamma_vec(i); - if (i < 5) { - KALDI_VLOG(2) << "objf impr from variance update =" << objf_improv(i) - / (gamma_vec(i) + 1.0e-20) << " over " << (gamma_vec(i)) - << " frames for i = " << (i); - } - } catch(...) { - KALDI_WARN << "Updating within-class covariance matrix i = " << (i) - << ", numerical problem"; - // This is a catch-all thing in case of unanticipated errors, but - // flooring should prevent this occurring for the most part. - model->SigmaInv_[i].SetUnit(); // Set to unit. - } - } - } - KALDI_LOG << "**Overall objf impr for variance update = " - << (tot_objf_impr / (tot_t+ 1.0e-20)) - << " over " << (tot_t) << " frames"; - return tot_objf_impr / (tot_t + 1.0e-20); -} - - -double MleAmSgmmUpdater::UpdateSubstateWeights( - const MleAmSgmmAccs &accs, AmSgmm *model) { - KALDI_LOG << "Updating substate mixture weights"; - // Also set the vector gamma_j which is a cache of the state occupancies. - gamma_j_.Resize(accs.num_states_); - - double tot_gamma = 0.0, objf_impr = 0.0; - for (int32 j = 0; j < accs.num_states_; j++) { - double gamma_j_sm = 0.0; - int32 num_substates = model->NumSubstates(j); - Vector occs(num_substates), - smoothed_occs(num_substates); - for (int32 m = 0; m < num_substates; m++) { - occs(m) = accs.gamma_[j].Row(m).Sum(); // \sum_i gamma_{jmi} - gamma_j_(j) += occs(m); // actual state occupancy. - smoothed_occs(m) = occs(m) + update_options_.tau_c; - gamma_j_sm += smoothed_occs(m); // smoothed state occupancy for update. - } - - for (int32 m = 0; m < num_substates; m++) { - double cur_weight = model->c_[j](m); - if (cur_weight <= 0) { - KALDI_WARN << "Zero or negative weight, flooring"; - cur_weight = 1.0e-10; // future work(arnab): remove magic numbers - } - model->c_[j](m) = smoothed_occs(m) / gamma_j_sm; - objf_impr += Log(model->c_[j](m) / cur_weight) * occs(m); - } - tot_gamma += gamma_j_(j); - } - objf_impr /= (tot_gamma + 1.0e-20); - KALDI_LOG << "**Overall objf impr for c is " << objf_impr << ", over " - << tot_gamma << " frames."; - return objf_impr; -} - - -MleSgmmSpeakerAccs::MleSgmmSpeakerAccs(const AmSgmm &model, BaseFloat prune) - : rand_prune_(prune) { - KALDI_ASSERT(model.SpkSpaceDim() != 0); - H_spk_.resize(model.NumGauss()); - for (int32 i = 0; i < model.NumGauss(); i++) { - // Eq. (82): H_{i}^{spk} = N_{i}^T \Sigma_{i}^{-1} N_{i} - H_spk_[i].Resize(model.SpkSpaceDim()); - H_spk_[i].AddMat2Sp(1.0, Matrix(model.N_[i]), - kTrans, SpMatrix(model.SigmaInv_[i]), 0.0); - } - - model.GetNtransSigmaInv(&NtransSigmaInv_); - - gamma_s_.Resize(model.NumGauss()); - y_s_.Resize(model.SpkSpaceDim()); -} - -void MleSgmmSpeakerAccs::Clear() { - y_s_.SetZero(); - gamma_s_.SetZero(); -} - - -BaseFloat -MleSgmmSpeakerAccs::Accumulate(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - int32 j, - BaseFloat weight) { - // Calculate Gaussian posteriors and collect statistics - Matrix posteriors; - BaseFloat log_like = model.ComponentPosteriors(frame_vars, j, &posteriors); - posteriors.Scale(weight); - AccumulateFromPosteriors(model, frame_vars, posteriors, j); - return log_like; -} - -BaseFloat -MleSgmmSpeakerAccs::AccumulateFromPosteriors(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - const Matrix &posteriors, - int32 j) { - double tot_count = 0.0; - int32 feature_dim = model.FeatureDim(), - spk_space_dim = model.SpkSpaceDim(); - KALDI_ASSERT(spk_space_dim != 0); - const vector &gselect = frame_vars.gselect; - - // Intermediate variables - Vector xt_jmi(feature_dim), mu_jmi(feature_dim), - zt_jmi(spk_space_dim); - int32 num_substates = model.NumSubstates(j); - for (int32 ki = 0; ki < static_cast(gselect.size()); ki++) { - int32 i = gselect[ki]; - for (int32 m = 0; m < num_substates; m++) { - // Eq. (39): gamma_{jmi}(t) = p (j, m, i|t) - BaseFloat gammat_jmi = RandPrune(posteriors(ki, m), rand_prune_); - if (gammat_jmi != 0.0) { - tot_count += gammat_jmi; - model.GetSubstateMean(j, m, i, &mu_jmi); - xt_jmi.CopyFromVec(frame_vars.xt); - xt_jmi.AddVec(-1.0, mu_jmi); - // Eq. (48): z{jmi}(t) = N_{i}^{T} \Sigma_{i}^{-1} x_{jmi}(t) - zt_jmi.AddMatVec(1.0, NtransSigmaInv_[i], kNoTrans, xt_jmi, 0.0); - // Eq. (49): \gamma_{i}^{(s)} = \sum_{t\in\Tau(s), j, m} gamma_{jmi} - gamma_s_(i) += gammat_jmi; - // Eq. (50): y^{(s)} = \sum_{t, j, m, i} gamma_{jmi}(t) z_{jmi}(t) - y_s_.AddVec(gammat_jmi, zt_jmi); - } - } - } - return tot_count; -} - -void MleSgmmSpeakerAccs::Update(BaseFloat min_count, - Vector *v_s, - BaseFloat *objf_impr_out, - BaseFloat *count_out) { - double tot_gamma = gamma_s_.Sum(); - KALDI_ASSERT(y_s_.Dim() != 0); - int32 T = y_s_.Dim(); // speaker-subspace dim. - int32 num_gauss = gamma_s_.Dim(); - if (v_s->Dim() != T) v_s->Resize(T); // will set it to zero. - - if (tot_gamma < min_count) { - KALDI_WARN << "Updating speaker vectors, count is " << tot_gamma - << " < " << min_count << "not updating."; - if (objf_impr_out) *objf_impr_out = 0.0; - if (count_out) *count_out = 0.0; - return; - } - - // Eq. (84): H^{(s)} = \sum_{i} \gamma_{i}(s) H_{i}^{spk} - SpMatrix H_s(T); - - for (int32 i = 0; i < num_gauss; i++) - H_s.AddSp(gamma_s_(i), H_spk_[i]); - - - // Don't make these options to SolveQuadraticProblem configurable... - // they really don't make a difference at all unless the matrix in - // question is singular, which wouldn't happen in this case. - Vector v_s_dbl(*v_s); - double tot_objf_impr = - SolveQuadraticProblem(H_s, y_s_, SolverOptions("v_s"), &v_s_dbl); - v_s->CopyFromVec(v_s_dbl); - - KALDI_LOG << "*Objf impr for speaker vector is " << (tot_objf_impr / tot_gamma) - << " over " << (tot_gamma) << " frames."; - - if (objf_impr_out) *objf_impr_out = tot_objf_impr; - if (count_out) *count_out = tot_gamma; -} - - -MleAmSgmmAccs::~MleAmSgmmAccs() { - if (gamma_s_.Sum() != 0.0) - KALDI_ERR << "In destructor of MleAmSgmmAccs: detected that you forgot to " - "call CommitStatsForSpk()"; -} - - -} // namespace kaldi diff --git a/src/sgmm/estimate-am-sgmm.h b/src/sgmm/estimate-am-sgmm.h deleted file mode 100644 index c5c499dcd7d..00000000000 --- a/src/sgmm/estimate-am-sgmm.h +++ /dev/null @@ -1,475 +0,0 @@ -// sgmm/estimate-am-sgmm.h - -// Copyright 2009-2011 Microsoft Corporation; Lukas Burget; -// Saarland University (Author: Arnab Ghoshal); -// Ondrej Glembek; Yanmin Qian; -// Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) -// Liang Lu; Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_SGMM_ESTIMATE_AM_SGMM_H_ -#define KALDI_SGMM_ESTIMATE_AM_SGMM_H_ 1 - -#include -#include - -#include "sgmm/am-sgmm.h" -#include "gmm/model-common.h" -#include "itf/options-itf.h" -#include "sgmm/sgmm-clusterable.h" -#include "thread/kaldi-thread.h" // for MultiThreadable - -namespace kaldi { - -/** \struct MleAmSgmmOptions - * Configuration variables needed in the SGMM estimation process. - */ -struct MleAmSgmmOptions { - /// Configuration Parameters. See initialization code for more comments. - BaseFloat tau_vec; ///< Amount of smoothing for v_{jm} update - BaseFloat tau_c; ///< Tau value for smoothing substate weights (c) - /// Floor covariance matrices Sigma_i to this times average cov. - BaseFloat cov_floor; - /// ratio to dim below which we use diagonal. default 2, set to inf for diag. - BaseFloat cov_diag_ratio; - /// Max on condition of matrices in update beyond which we do not update. - /// Should probably be related to numerical properties of machine - /// or BaseFloat type. - BaseFloat max_cond; - /// Limits condition of smoothing matrices H_sm (e.g. 100). - /// Only really important on 1st iter if using priors. - BaseFloat max_cond_H_sm; - /// Fix for the smoothing approach, necessary if max_cond_H_sm != inf - /// note: only has an effect if tau_vec != 0. - bool fixup_H_sm; - /// Set check_v to true if you want to use the "checking" version of the update - /// for the v's, in which it checks the "real" objective function value and - /// backtracks if necessary; - bool check_v; - - bool renormalize_V; // Renormalize the phonetic space. - bool renormalize_N; // Renormalize the speaker space. - - /// Number of iters when re-estimating weight projections "w". - int weight_projections_iters; - /// The "sequential" weight update that checks each i in turn. - /// (if false, uses the "parallel" one). - bool use_sequential_weight_update; - - BaseFloat epsilon; ///< very small value used to prevent SVD crashing. - - BaseFloat tau_map_M; ///< For MAP update of the phonetic subspace M - int map_M_prior_iters; ///< num of iterations to update the prior of M - bool full_row_cov; ///< Estimate row covariance instead of using I - bool full_col_cov; ///< Estimate col covariance instead of using I - - MleAmSgmmOptions() { - // tau value used in smoothing vector re-estimation (if no prior used). - tau_vec = 0.0; - tau_c = 5.0; - cov_floor = 0.025; - cov_diag_ratio = 2.0; // set to very large to get diagonal-cov models. - max_cond = 1.0e+05; - epsilon = 1.0e-40; - max_cond_H_sm = 1.0e+05; // only for diagnostics in normal situations. - fixup_H_sm = true; - check_v = false; // for back-compat. - renormalize_V = true; - renormalize_N = false; // default to false since will invalidate spk vectors - // on disk. - weight_projections_iters = 3; - use_sequential_weight_update = false; - - map_M_prior_iters = 5; - tau_map_M = 0.0; // No MAP update by default (~500-1000 depending on prior) - full_row_cov = false; - full_col_cov = false; - } - - void Register(OptionsItf *opts) { - std::string module = "MleAmSgmmOptions: "; - opts->Register("tau-vec", &tau_vec, module+ - "Smoothing for phone vector estimation."); - opts->Register("tau-c", &tau_c, module+ - "Smoothing for substate weights estimation."); - opts->Register("cov-floor", &cov_floor, module+ - "Covariance floor (fraction of average covariance)."); - opts->Register("cov-diag-ratio", &cov_diag_ratio, module+ - "Minimum occ/dim ratio below which use diagonal covariances."); - opts->Register("max-cond", &max_cond, module+"Maximum condition number beyond" - " which matrices are not updated."); - opts->Register("weight-projections-iters", &weight_projections_iters, module+ - "Number for iterations for weight projection estimation."); - opts->Register("renormalize-v", &renormalize_V, module+"If true, renormalize " - "the phonetic-subspace vectors to have meaningful sizes."); - opts->Register("check-v", &check_v, module+"If true, check real auxf " - "improvement in update of v and backtrack if needed " - "(not compatible with smoothing v)"); - opts->Register("renormalize-n", &renormalize_N, module+"If true, renormalize " - "the speaker subspace to have meaningful sizes."); - - opts->Register("tau-map-M", &tau_map_M, module+"Smoothing for MAP estimate " - "of M (0 means ML update)."); - opts->Register("map-M-prior-iters", &map_M_prior_iters, module+ - "Number of iterations to estimate prior covariances for M."); - opts->Register("full-row-cov", &full_row_cov, module+ - "Estimate row covariance instead of using I."); - opts->Register("full-col-cov", &full_col_cov, module+ - "Estimate column covariance instead of using I."); - } -}; - -/** \class MleAmSgmmAccs - * Class for the accumulators associated with the SGMM parameters except - * speaker vectors. - */ -class MleAmSgmmAccs { - public: - explicit MleAmSgmmAccs(BaseFloat rand_prune = 1.0e-05) - : total_frames_(0.0), total_like_(0.0), feature_dim_(0), - phn_space_dim_(0), spk_space_dim_(0), num_gaussians_(0), - num_states_(0), rand_prune_(rand_prune) {} - - MleAmSgmmAccs(const AmSgmm &model, SgmmUpdateFlagsType flags, - BaseFloat rand_prune = 1.0e-05) - : total_frames_(0.0), total_like_(0.0), rand_prune_(rand_prune) { - ResizeAccumulators(model, flags); - } - - ~MleAmSgmmAccs(); - - void Read(std::istream &in_stream, bool binary, bool add); - void Write(std::ostream &out_stream, bool binary) const; - - /// Checks the various accumulators for correct sizes given a model. With - /// wrong sizes, assertion failure occurs. When the show_properties argument - /// is set to true, dimensions and presence/absence of the various - /// accumulators are printed. For use when accumulators are read from file. - void Check(const AmSgmm &model, bool show_properties = true) const; - - /// Resizes the accumulators to the correct sizes given the model. The flags - /// argument control which accumulators to resize. - void ResizeAccumulators(const AmSgmm &model, SgmmUpdateFlagsType flags); - - /// Returns likelihood. - BaseFloat Accumulate(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - const VectorBase &v_s, // spk-vec, may be empty - int32 state_index, BaseFloat weight, - SgmmUpdateFlagsType flags); - - /// Returns count accumulated (may differ from posteriors.Sum() - /// due to weight pruning). - BaseFloat AccumulateFromPosteriors(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - const Matrix &posteriors, - const VectorBase &v_s, // may be empty - int32 state_index, - SgmmUpdateFlagsType flags); - - /// Accumulates global stats for the current speaker (if applicable). - /// If flags contains kSgmmSpeakerProjections (N), must call - /// this after finishing the speaker's data. - void CommitStatsForSpk(const AmSgmm &model, - const VectorBase &v_s); - - /// Accessors - void GetStateOccupancies(Vector *occs) const; - const std::vector< Matrix >& GetOccs() const { - return gamma_; - } - int32 FeatureDim() const { return feature_dim_; } - int32 PhoneSpaceDim() const { return phn_space_dim_; } - int32 NumStates() const { return num_states_; } - int32 NumGauss() const { return num_gaussians_; } - double TotalFrames() const { return total_frames_; } - double TotalLike() const { return total_like_; } - - private: - /// The stats which are not tied to any state. - /// Stats Y_{i} for phonetic-subspace projections M; Dim is [I][D][S]. - std::vector< Matrix > Y_; - /// Stats Z_{i} for speaker-subspace projections N. Dim is [I][D][T]. - std::vector< Matrix > Z_; - /// R_{i}, quadratic term for speaker subspace estimation. Dim is [I][T][T] - std::vector< SpMatrix > R_; - /// S_{i}^{-}, scatter of adapted feature vectors x_{i}(t). Dim is [I][D][D]. - std::vector< SpMatrix > S_; - - /// The SGMM state specific stats. - /// Statistics y_{jm} for state vectors v_{jm}. dimension is [J][M_{j}[S]. - std::vector< Matrix > y_; - /// Gaussian occupancies gamma_{jmi} for each substate. Dim is [J][M_{j}][I]. - std::vector< Matrix > gamma_; - - /// gamma_{i}^{(s)}. Per-speaker counts for each Gaussian. Dimension is [I] - /// Needed for stats R_. - Vector gamma_s_; - - double total_frames_, total_like_; - - /// Dimensionality of various subspaces - int32 feature_dim_, phn_space_dim_, spk_space_dim_; - int32 num_gaussians_, num_states_; ///< Other model specifications - - BaseFloat rand_prune_; - - KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmAccs); - friend class MleAmSgmmUpdater; - friend class EbwAmSgmmUpdater; - friend class MleAmSgmmGlobalAccs; -}; - -/** \class MleAmSgmmUpdater - * Contains the functions needed to update the SGMM parameters. - */ -class MleAmSgmmUpdater { - public: - explicit MleAmSgmmUpdater(const MleAmSgmmOptions &options) - : update_options_(options) {} - void Reconfigure(const MleAmSgmmOptions &options) { - update_options_ = options; - } - - /// Main update function: Computes some overall stats, does parameter updates - /// and returns the total improvement of the different auxiliary functions. - BaseFloat Update(const MleAmSgmmAccs &accs, - AmSgmm *model, - SgmmUpdateFlagsType flags); - - /// This function is like UpdatePhoneVectorsChecked, which supports - /// objective-function checking and backtracking but no smoothing term, but it - /// takes as input the stats used in SGMM-based tree clustering-- this is used - /// in initializing an SGMM from the tree stats. It's not part of the - /// normal recipe. - double UpdatePhoneVectorsCheckedFromClusterable( - const std::vector &stats, - const std::vector > &H, - AmSgmm *model); - - protected: - friend class UpdateWParallelClass; - friend class UpdatePhoneVectorsClass; - friend class UpdatePhoneVectorsCheckedFromClusterableClass; - friend class EbwEstimateAmSgmm; - - /// Compute the Q_i quantities (Eq. 64). - static void ComputeQ(const MleAmSgmmAccs &accs, - const AmSgmm &model, - std::vector< SpMatrix > *Q); - - /// Compute the S_means quantities, minus sum: (Y_i M_i^T + M_i Y_I^T). - static void ComputeSMeans(const MleAmSgmmAccs &accs, - const AmSgmm &model, - std::vector< SpMatrix > *S_means); - friend class EbwAmSgmmUpdater; - private: - MleAmSgmmOptions update_options_; - /// Q_{i}, quadratic term for phonetic subspace estimation. Dim is [I][S][S] - std::vector< SpMatrix > Q_; - - /// Eq (74): S_{i}^{(means)}, scatter of substate mean vectors for estimating - /// the shared covariance matrices. [Actually this variable contains also the - /// term -(Y_i M_i^T + M_i Y_I^T).] Dimension is [I][D][D]. - std::vector< SpMatrix > S_means_; - - Vector gamma_j_; ///< State occupancies - - - void ComputeSmoothingTerms(const MleAmSgmmAccs &accs, - const AmSgmm &model, - const std::vector< SpMatrix > &H, - SpMatrix *H_sm, - Vector *y_sm) const; - - // UpdatePhoneVectors function that allows smoothing terms (but - // no checking of proper auxiliary function RE weights) - double UpdatePhoneVectors(const MleAmSgmmAccs &accs, - AmSgmm *model, - const std::vector > &H, - const SpMatrix &H_sm, - const Vector &y_sm); - - - // Called from UpdatePhoneVectors; updates a subset of states - // (relates to multi-threading). - void UpdatePhoneVectorsInternal(const MleAmSgmmAccs &accs, - AmSgmm *model, - const std::vector > &H, - const SpMatrix &H_sm, - const Vector &y_sm, - double *auxf_impr, - double *like_impr, - int32 num_threads, - int32 thread_id) const; - - // UpdatePhoneVectors function that does not support smoothing - // terms, but allows checking of objective-function improvement, - // and backtracking. - double UpdatePhoneVectorsChecked(const MleAmSgmmAccs &accs, - AmSgmm *model, - const std::vector > &H); - - // Called (indirectly) from UpdatePhoneVectorsCheckedFromClusterable() - void UpdatePhoneVectorsCheckedFromClusterableInternal( - const std::vector &stats, - const std::vector< SpMatrix > &H, - AmSgmm *model, - double *count_ptr, - double *like_impr_ptr, - int32 num_threads, - int32 thread_id); - - double UpdateM(const MleAmSgmmAccs &accs, AmSgmm *model); - - void RenormalizeV(const MleAmSgmmAccs &accs, AmSgmm *model, - const SpMatrix &H_sm); - double UpdateN(const MleAmSgmmAccs &accs, AmSgmm *model); - void RenormalizeN(const MleAmSgmmAccs &accs, AmSgmm *model); - double UpdateVars(const MleAmSgmmAccs &accs, AmSgmm *model); - double UpdateWParallel(const MleAmSgmmAccs &accs, AmSgmm *model); - - /// Called, multithreaded, inside UpdateWParallel - static - void UpdateWParallelGetStats(const MleAmSgmmAccs &accs, - const AmSgmm &model, - const Matrix &w, - Matrix *F_i, - Matrix *g_i, - double *tot_like, - int32 num_threads, - int32 thread_id); - - double UpdateWSequential(const MleAmSgmmAccs &accs, - AmSgmm *model); - double UpdateSubstateWeights(const MleAmSgmmAccs &accs, - AmSgmm *model); - - void ComputeMPrior(AmSgmm *model); // TODO(arnab): Maybe make this static? - double MapUpdateM(const MleAmSgmmAccs &accs, AmSgmm *model); - - KALDI_DISALLOW_COPY_AND_ASSIGN(MleAmSgmmUpdater); - MleAmSgmmUpdater() {} // Prevent unconfigured updater. -}; - - -/** \class MleSgmmSpeakerAccs - * Class for the accumulators required to update the speaker - * vectors v_s. - * Note: if you have multiple speakers you will want to initialize - * this just once and call Clear() after you're done with each speaker, - * rather than creating a new object for each speaker, since the - * initialization function does nontrivial work. - */ - -class MleSgmmSpeakerAccs { - public: - /// Initialize the object. Error if speaker subspace not set up. - MleSgmmSpeakerAccs(const AmSgmm &model, BaseFloat rand_prune_ = 1.0e-05); - - /// Clear the statistics. - void Clear(); - - /// Accumulate statistics. Returns per-frame log-likelihood. - BaseFloat Accumulate(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - int32 state_index, BaseFloat weight); - - /// Accumulate statistics, given posteriors. Returns total - /// count accumulated, which may differ from posteriors.Sum() - /// due to randomized pruning. - BaseFloat AccumulateFromPosteriors(const AmSgmm &model, - const SgmmPerFrameDerivedVars &frame_vars, - const Matrix &posteriors, - int32 state_index); - - /// Update speaker vector. If v_s was empty, will assume it started as zero - /// and will resize it to the speaker-subspace size. - void Update(BaseFloat min_count, // e.g. 100 - Vector *v_s, - BaseFloat *objf_impr_out, - BaseFloat *count_out); - - private: - /// Statistics for speaker adaptation (vectors), stored per-speaker. - /// Per-speaker stats for vectors, y^{(s)}. Dimension [T]. - Vector y_s_; - /// gamma_{i}^{(s)}. Per-speaker counts for each Gaussian. Dimension is [I] - Vector gamma_s_; - - /// The following variable does not change per speaker. - /// Eq. (82): H_{i}^{spk} = N_{i}^T \Sigma_{i}^{-1} N_{i} - std::vector< SpMatrix > H_spk_; - - /// N_i^T \Sigma_{i}^{-1}. Needed for y^{(s)} - std::vector< Matrix > NtransSigmaInv_; - - /// small constant to randomly prune tiny posteriors - BaseFloat rand_prune_; -}; - -// This class, used in multi-core implementation of the updates of the "w_i" -// quantities, was previously in estimate-am-sgmm.cc, but is being moved to the -// header so it can be used in estimate-am-sgmm-ebw.cc. It is responsible for -// computing, in parallel, the F_i and g_i quantities used in the updates of -// w_i. -class UpdateWParallelClass: public MultiThreadable { - public: - UpdateWParallelClass(const MleAmSgmmAccs &accs, - const AmSgmm &model, - const Matrix &w, - Matrix *F_i, - Matrix *g_i, - double *tot_like): - accs_(accs), model_(model), w_(w), - F_i_ptr_(F_i), g_i_ptr_(g_i), tot_like_ptr_(tot_like) { - tot_like_ = 0.0; - F_i_.Resize(F_i->NumRows(), F_i->NumCols()); - g_i_.Resize(g_i->NumRows(), g_i->NumCols()); - } - - ~UpdateWParallelClass() { - F_i_ptr_->AddMat(1.0, F_i_, kNoTrans); - g_i_ptr_->AddMat(1.0, g_i_, kNoTrans); - *tot_like_ptr_ += tot_like_; - } - - inline void operator() () { - // Note: give them local copy of the sums we're computing, - // which will be propagated to the total sums in the destructor. - MleAmSgmmUpdater::UpdateWParallelGetStats(accs_, model_, w_, - &F_i_, &g_i_, &tot_like_, - num_threads_, thread_id_); - } - private: - // MleAmSgmmUpdater *updater_; - const MleAmSgmmAccs &accs_; - const AmSgmm &model_; - const Matrix &w_; - Matrix *F_i_ptr_; - Matrix *g_i_ptr_; - Matrix F_i_; - Matrix g_i_; - double *tot_like_ptr_; - double tot_like_; -}; - - -} // namespace kaldi - - -#endif // KALDI_SGMM_ESTIMATE_AM_SGMM_H_ diff --git a/src/sgmm/fmllr-sgmm-test.cc b/src/sgmm/fmllr-sgmm-test.cc deleted file mode 100644 index c9239d5740c..00000000000 --- a/src/sgmm/fmllr-sgmm-test.cc +++ /dev/null @@ -1,233 +0,0 @@ -// sgmm/fmllr-sgmm-test.cc - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "base/kaldi-math.h" -#include "gmm/model-test-common.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/fmllr-sgmm.h" -#include "util/kaldi-io.h" - -using kaldi::AmSgmm; -using kaldi::int32; -using kaldi::BaseFloat; -using kaldi::Vector; -using kaldi::Matrix; -using kaldi::Exp; - -namespace ut = kaldi::unittest; - -void ApplyFmllrXform(const kaldi::VectorBase &in, - const Matrix &xf, - Vector *out) { - int32 dim = in.Dim(); - KALDI_ASSERT(xf.NumRows() == dim && xf.NumCols() == dim + 1); - Vector tmp(dim + 1); - tmp.Range(0, dim).CopyFromVec(in); - tmp(dim) = 1.0; - out->Resize(dim, kaldi::kSetZero); - out->AddMatVec(1.0, xf, kaldi::kNoTrans, tmp, 0.0); -} - -// Tests the Read() and Write() methods for the accumulators, in both binary -// and ASCII mode, as well as Check(). -void TestSgmmFmllrAccsIO(const AmSgmm &sgmm, - const kaldi::Matrix &feats) { - KALDI_LOG << "Test IO start."; - using namespace kaldi; - int32 dim = sgmm.FeatureDim(); - kaldi::SgmmPerFrameDerivedVars frame_vars; - kaldi::SgmmPerSpkDerivedVars empty; - kaldi::SgmmFmllrGlobalParams fmllr_globals; - kaldi::SgmmGselectConfig sgmm_config; - - frame_vars.Resize(sgmm.NumGauss(), dim, sgmm.PhoneSpaceDim()); - sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest, - sgmm.NumGauss()); - kaldi::Vector occs(sgmm.NumPdfs()); - occs.Set(feats.NumRows()); - sgmm.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_, - &fmllr_globals.inv_xform_, - &fmllr_globals.mean_scatter_); - if (fmllr_globals.mean_scatter_.Min() == 0.0) { - KALDI_WARN << "Global covariances low rank!"; - KALDI_WARN << "Diag-scatter = " << fmllr_globals.mean_scatter_; - return; - } - -// std::cout << "Pre-Xform = " << fmllr_globals.pre_xform_; -// std::cout << "Inv-Xform = " << fmllr_globals.inv_xform_; - - FmllrSgmmAccs accs; - accs.Init(sgmm.FeatureDim(), sgmm.NumGauss()); - BaseFloat loglike = 0.0; - Vector empty_spk; - std::vector gselect; - for (int32 i = 0; i < feats.NumRows(); i++) { - sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect); - sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, 0.0, &frame_vars); - loglike += accs.Accumulate(sgmm, empty, feats.Row(i), frame_vars, 0, 1.0); - } - - kaldi::SgmmFmllrConfig update_opts; -// update_opts.fmllr_min_count = 100; - kaldi::Matrix xform_mat(dim, dim+1); - xform_mat.SetUnit(); - BaseFloat frames, impr; - accs.Update(sgmm, fmllr_globals, update_opts, &xform_mat, &frames, &impr); - - Vector xformed_feat(dim); - ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat); - sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect); - sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, 0.0, &frame_vars); - BaseFloat loglike1 = sgmm.LogLikelihood(frame_vars, 0); - - bool binary_in; - // First, non-binary write - KALDI_LOG << "Test ASCII IO."; - accs.Write(kaldi::Output("tmpf", false).Stream(), false); - FmllrSgmmAccs *accs1 = new FmllrSgmmAccs(); - // Non-binary read - kaldi::Input ki1("tmpf", &binary_in); - accs1->Read(ki1.Stream(), binary_in, false); - xform_mat.SetUnit(); - accs1->Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL); - ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat); - sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect); - sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, 0.0, &frame_vars); - BaseFloat loglike2 = sgmm.LogLikelihood(frame_vars, 0); - std::cout << "LL1 = " << loglike1 << ", LL2 = " << loglike2 << std::endl; - kaldi::AssertEqual(loglike1, loglike2, 1e-2); - delete accs1; - - // Next, binary write - KALDI_LOG << "Test Binary IO."; - accs.Write(kaldi::Output("tmpfb", true).Stream(), true); - FmllrSgmmAccs *accs2 = new FmllrSgmmAccs(); - // Binary read - kaldi::Input ki2("tmpfb", &binary_in); - accs2->Read(ki2.Stream(), binary_in, false); - xform_mat.SetUnit(); - accs2->Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL); - ApplyFmllrXform(feats.Row(0), xform_mat, &xformed_feat); - sgmm.GaussianSelection(sgmm_config, xformed_feat, &gselect); - sgmm.ComputePerFrameVars(xformed_feat, gselect, empty, 0.0, &frame_vars); - BaseFloat loglike3 = sgmm.LogLikelihood(frame_vars, 0); - std::cout << "LL1 = " << loglike1 << ", LL3 = " << loglike3 << std::endl; - kaldi::AssertEqual(loglike1, loglike3, 1e-4); - delete accs2; - KALDI_LOG << "Test IO end."; - - unlink("tmpf"); - unlink("tmpfb"); -} - -void TestSgmmFmllrSubspace(const AmSgmm &sgmm, - const kaldi::Matrix &feats) { - KALDI_LOG << "Test Subspace start."; - using namespace kaldi; - int32 dim = sgmm.FeatureDim(); - kaldi::SgmmPerFrameDerivedVars frame_vars; - kaldi::SgmmPerSpkDerivedVars empty; - kaldi::SgmmFmllrGlobalParams fmllr_globals; - kaldi::SgmmGselectConfig sgmm_config; - - frame_vars.Resize(sgmm.NumGauss(), dim, sgmm.PhoneSpaceDim()); - sgmm_config.full_gmm_nbest = std::min(sgmm_config.full_gmm_nbest, - sgmm.NumGauss()); - kaldi::Vector occs(sgmm.NumPdfs()); - occs.Set(feats.NumRows()); - sgmm.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_, - &fmllr_globals.inv_xform_, - &fmllr_globals.mean_scatter_); - if (fmllr_globals.mean_scatter_.Min() == 0.0) { - KALDI_WARN << "Global covariances low rank!"; - KALDI_WARN << "Diag-scatter = " << fmllr_globals.mean_scatter_; - return; - } - - FmllrSgmmAccs accs; - accs.Init(sgmm.FeatureDim(), sgmm.NumGauss()); - BaseFloat loglike = 0.0; - Vector empty_spk; - std::vector gselect; - for (int32 i = 0; i < feats.NumRows(); i++) { - sgmm.GaussianSelection(sgmm_config, feats.Row(i), &gselect); - sgmm.ComputePerFrameVars(feats.Row(i), gselect, empty, 0.0, &frame_vars); - loglike += accs.Accumulate(sgmm, empty, feats.Row(i), frame_vars, 0, 1.0); - } - - SpMatrix grad_scatter(dim * (dim+1)); - accs.AccumulateForFmllrSubspace(sgmm, fmllr_globals, &grad_scatter); - kaldi::SgmmFmllrConfig update_opts; - EstimateSgmmFmllrSubspace(grad_scatter, update_opts.num_fmllr_bases, dim, - &fmllr_globals); -// update_opts.fmllr_min_count = 100; - kaldi::Matrix xform_mat(dim, dim+1); - xform_mat.SetUnit(); - accs.Update(sgmm, fmllr_globals, update_opts, &xform_mat, NULL, NULL); - KALDI_LOG << "Test Subspace end."; -} - -void TestSgmmFmllr() { - // srand(time(NULL)); - int32 dim = 1 + kaldi::RandInt(0, 9); // random dimension of the gmm - int32 num_comp = 2 + kaldi::RandInt(0, 9); // random number of mixtures - kaldi::FullGmm full_gmm; - ut::InitRandFullGmm(dim, num_comp, &full_gmm); - - int32 num_states = 1; - AmSgmm sgmm; - kaldi::SgmmGselectConfig config; - sgmm.InitializeFromFullGmm(full_gmm, num_states, dim+1, dim); - sgmm.ComputeNormalizers(); - - kaldi::Matrix feats; - - { // First, generate random means and variances - int32 num_feat_comp = num_comp + kaldi::RandInt(-num_comp/2, num_comp/2); - kaldi::Matrix means(num_feat_comp, dim), - vars(num_feat_comp, dim); - for (int32 m = 0; m < num_feat_comp; m++) { - for (int32 d= 0; d < dim; d++) { - means(m, d) = kaldi::RandGauss(); - vars(m, d) = Exp(kaldi::RandGauss()) + 1e-2; - } - } - // Now generate random features with those means and variances. - feats.Resize(num_feat_comp * 200, dim); - for (int32 m = 0; m < num_feat_comp; m++) { - kaldi::SubMatrix tmp(feats, m*200, 200, 0, dim); - ut::RandDiagGaussFeatures(200, means.Row(m), vars.Row(m), &tmp); - } - } - TestSgmmFmllrAccsIO(sgmm, feats); - TestSgmmFmllrSubspace(sgmm, feats); -} - -int main() { - std::srand(1000); - kaldi::g_kaldi_verbose_level = 5; - for (int i = 0; i < 10; i++) - TestSgmmFmllr(); - std::cout << "Test OK.\n"; - return 0; -} diff --git a/src/sgmm/fmllr-sgmm.cc b/src/sgmm/fmllr-sgmm.cc deleted file mode 100644 index b1f87f9a967..00000000000 --- a/src/sgmm/fmllr-sgmm.cc +++ /dev/null @@ -1,554 +0,0 @@ -// sgmm/fmllr-sgmm.cc - -// Copyright 2009-2011 Saarland University (author: Arnab Ghoshal) -// 2012 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -using std::vector; - -#include "sgmm/fmllr-sgmm.h" -#include "util/parse-options.h" - -namespace kaldi { - -static void ApplyPreXformToGradient(const SgmmFmllrGlobalParams &globals, - const Matrix &gradient_in, - Matrix *gradient_out) { - // Eq. (B.14): P' = A_{inv}^T P {W_{pre}^+}^T - int32 dim = gradient_in.NumRows(); - Matrix Wpre_plus(dim + 1, dim + 1, kSetZero); - Wpre_plus.Range(0, dim, 0, dim + 1).CopyFromMat(globals.pre_xform_); - Wpre_plus(dim, dim) = 1; - SubMatrix Ainv(globals.inv_xform_, 0, dim, 0, dim); - Matrix AinvP(dim, dim + 1, kUndefined); - AinvP.AddMatMat(1.0, Ainv, kTrans, gradient_in, kNoTrans, 0.0); - gradient_out->AddMatMat(1.0, AinvP, kNoTrans, Wpre_plus, kTrans, 0.0); -} - -static void ApplyInvPreXformToChange(const SgmmFmllrGlobalParams &globals, - const Matrix &delta_in, - Matrix *delta_out) { - // Eq. (B.25): \Delta = A_{inv} \Delta' W_{pre}^+ - int32 dim = delta_in.NumRows(); - Matrix Wpre_plus(dim + 1, dim + 1, kSetZero); - Wpre_plus.Range(0, dim, 0, dim + 1).CopyFromMat(globals.pre_xform_); - Wpre_plus(dim, dim) = 1; - SubMatrix Ainv(globals.inv_xform_, 0, dim, 0, dim); - Matrix AinvD(dim, dim + 1, kUndefined); - AinvD.AddMatMat(1.0, Ainv, kNoTrans, delta_in, kNoTrans, 0.0); - delta_out->AddMatMat(1.0, AinvD, kNoTrans, Wpre_plus, kNoTrans, 0.0); -} - -static void ApplyHessianXformToGradient(const SgmmFmllrGlobalParams &globals, - const Matrix &gradient_in, - Matrix *gradient_out) { - int32 dim = gradient_in.NumRows(); - const Vector &D = globals.mean_scatter_; - if (D.Min() <= 0.0) - KALDI_ERR << "Cannot estimate FMLLR: mean scatter has 0 eigenvalues."; - for (int32 r = 0; r < dim; r++) { - for (int32 c = 0; c < r; c++) { - // Eq. (B.15) - (*gradient_out)(r, c) = gradient_in(r, c) / std::sqrt(1 + D(c)); - // Eq. (B.16) - (*gradient_out)(c, r) = gradient_in(c, r) / std::sqrt(1 + D(r) - - 1 / (1 + D(c))) - gradient_in(r, c) / ((1 + D(c)) * - std::sqrt(1 + D(r) - 1 / (1 + D(c)))); - } - // Eq. (B.17) & (B.18) - (*gradient_out)(r, r) = gradient_in(r, r) / std::sqrt(2 + D(r)); - (*gradient_out)(r, dim) = gradient_in(r, dim); - } -} - -static void ApplyInvHessianXformToChange(const SgmmFmllrGlobalParams &globals, - const Matrix &delta_in, - Matrix *delta_out) { - int32 dim = delta_in.NumRows(); - const Vector &D = globals.mean_scatter_; - if (D.Min() <= 0.0) - KALDI_ERR << "Cannot estimate FMLLR: mean scatter has 0 eigenvalues."; - for (int32 r = 0; r < dim; r++) { - for (int32 c = 0; c < r; c++) { - // Eq. (B.21) - (*delta_out)(r, c) = delta_in(r, c) / std::sqrt(1 + D(c)) - - delta_in(c, r) / ((1 + D(c)) * std::sqrt(1 + D(r) - 1 / (1 + D(c)))); - // Eq. (B.22) - (*delta_out)(c, r) = delta_in(c, r) / std::sqrt(1 + D(r) - 1/ (1 + D(c))); - } - // Eq. (B.23) & (B.24) - (*delta_out)(r, r) = delta_in(r, r) / std::sqrt(2 + D(r)); - (*delta_out)(r, dim) = delta_in(r, dim); - } -} - - -void SgmmFmllrGlobalParams::Write(std::ostream &out, bool binary) const { - WriteToken(out, binary, ""); - WriteToken(out, binary, ""); - pre_xform_.Write(out, binary); - WriteToken(out, binary, ""); - inv_xform_.Write(out, binary); - WriteToken(out, binary, ""); - mean_scatter_.Write(out, binary); - if (fmllr_bases_.size() != 0) { - WriteToken(out, binary, ""); - uint32 tmp = static_cast(fmllr_bases_.size()); - WriteBasicType(out, binary, tmp); - for (uint32 i = 0; i < tmp; i++) { - fmllr_bases_[i].Write(out, binary); - } - } - WriteToken(out, binary, ""); -} - -void SgmmFmllrGlobalParams::Read(std::istream &in, bool binary) { - ExpectToken(in, binary, ""); - ExpectToken(in, binary, ""); - pre_xform_.Read(in, binary); - ExpectToken(in, binary, ""); - inv_xform_.Read(in, binary); - ExpectToken(in, binary, ""); - mean_scatter_.Read(in, binary); - std::string token; - ReadToken(in, binary, &token); - if (token == "") { - uint32 tmp; - ReadBasicType(in, binary, &tmp); - fmllr_bases_.resize(tmp); - for (uint32 i = 0; i < tmp; i++) { - fmllr_bases_[i].Read(in, binary); - } - } else { - if (token != "") - KALDI_ERR << "Unexpected token '" << token << "' found."; - } -} - - -void FmllrSgmmAccs::Init(int32 dim, int32 num_gaussians) { - if (dim == 0) { // empty stats - dim_ = 0; // non-zero dimension is meaningless in empty stats - stats_.Init(0, 0); // clear the stats - } else { - dim_ = dim; - stats_.Init(dim, num_gaussians); - } -} - -BaseFloat FmllrSgmmAccs::Accumulate(const AmSgmm &model, - const SgmmPerSpkDerivedVars &spk, - const VectorBase &data, - const SgmmPerFrameDerivedVars &frame_vars, - int32 pdf_index, BaseFloat weight) { - // Calulate Gaussian posteriors and collect statistics - Matrix posteriors; - BaseFloat log_like = model.ComponentPosteriors(frame_vars, pdf_index, - &posteriors); - posteriors.Scale(weight); - AccumulateFromPosteriors(model, spk, data, frame_vars.gselect, posteriors, - pdf_index); - return log_like; -} - -void -FmllrSgmmAccs::AccumulateFromPosteriors(const AmSgmm &model, - const SgmmPerSpkDerivedVars &spk, - const VectorBase &data, - const vector &gselect, - const Matrix &posteriors, - int32 pdf_index) { - Vector var_scaled_mean(dim_), extended_data(dim_+1); - extended_data.Range(0, dim_).CopyFromVec(data); - extended_data(dim_) = 1.0; - SpMatrix scatter(dim_+1, kSetZero); - scatter.AddVec2(1.0, extended_data); - - for (int32 ki = 0, ki_max = gselect.size(); ki < ki_max; ki++) { - int32 i = gselect[ki]; - - for (int32 m = 0; m < model.NumSubstates(pdf_index); m++) { - // posterior gamma_{jkmi}(t) eq.(39) - BaseFloat gammat_jmi = posteriors(ki, m); - - // Accumulate statistics for non-zero gaussian posterior - if (gammat_jmi > 0.0) { - stats_.beta_ += gammat_jmi; - model.GetVarScaledSubstateSpeakerMean(pdf_index, m, i, spk, - &var_scaled_mean); - // Eq. (52): K += \gamma_{jmi} \Sigma_{i}^{-1} \mu_{jmi}^{(s)} x^{+T} - stats_.K_.AddVecVec(gammat_jmi, var_scaled_mean, extended_data); - // Eq. (53): G_{i} += \gamma_{jmi} x^{+} x^{+T} - stats_.G_[i].AddSp(gammat_jmi, scatter); - } // non-zero posteriors - } // loop over substates - } // loop over selected Gaussians -} - -void FmllrSgmmAccs::AccumulateForFmllrSubspace(const AmSgmm &sgmm, - const SgmmFmllrGlobalParams &globals, SpMatrix *grad_scatter) { - if (stats_.beta_ <= 0.0) { - KALDI_WARN << "Not committing any stats since no stats accumulated."; - return; - } - int32 dim = sgmm.FeatureDim(); - Matrix xform(dim, dim + 1, kUndefined); - xform.SetUnit(); - Matrix grad(dim, dim + 1, kSetZero); - this->FmllrObjGradient(sgmm, xform, &grad, NULL); - Matrix pre_xformed_grad(dim, dim + 1, kSetZero); - ApplyPreXformToGradient(globals, grad, &pre_xformed_grad); - Matrix hess_xformed_grad(dim, dim + 1, kSetZero); - ApplyHessianXformToGradient(globals, pre_xformed_grad, &hess_xformed_grad); - Vector grad_vec(dim * (dim + 1)); - grad_vec.CopyRowsFromMat(hess_xformed_grad); - grad_vec.Scale(1 / std::sqrt(stats_.beta_)); - grad_scatter->AddVec2(1.0, grad_vec); - KALDI_LOG << "Frame counts for when committing fMLLR subspace stats are " - << stats_.beta_; -} - - -BaseFloat FmllrSgmmAccs::FmllrObjGradient(const AmSgmm &sgmm, - const Matrix &xform, - Matrix *grad_out, - Matrix *G_out) const { - int32 dim = sgmm.FeatureDim(), - num_gauss = sgmm.NumGauss(); - KALDI_ASSERT(stats_.G_.size() == static_cast(num_gauss)); - Matrix xform_d(xform); - SubMatrix A(xform_d, 0, dim, 0, dim); - Matrix xform_g(dim, dim + 1), total_g(dim, dim + 1); - SpMatrix inv_covar(dim); - double obj = stats_.beta_ * A.LogDet() + - TraceMatMat(xform_d, stats_.K_, kTrans); - for (int32 i = 0; i < num_gauss; i++) { - sgmm.GetInvCovars(i, &inv_covar); - xform_g.AddMatSp(1.0, xform_d, kNoTrans, stats_.G_[i], 0.0); - total_g.AddSpMat(1.0, inv_covar, xform_g, kNoTrans, 1.0); - } - obj -= 0.5 * TraceMatMat(xform_d, total_g, kTrans); - if (G_out != NULL) G_out->CopyFromMat(total_g); - - // Compute the gradient: P = \beta [(A^{-1})^{T} , 0] + K - S - if (grad_out != NULL) { - Matrix grad_d(dim, dim + 1, kSetZero); - grad_d.Range(0, dim, 0, dim).CopyFromMat(A); - grad_d.Range(0, dim, 0, dim).InvertDouble(); - grad_d.Range(0, dim, 0, dim).Transpose(); - grad_d.Scale(stats_.beta_); - grad_d.AddMat(-1.0, total_g, kNoTrans); - grad_d.AddMat(1.0, stats_.K_, kNoTrans); - grad_out->CopyFromMat(grad_d); - } - - return obj; -} - - -void FmllrSgmmAccs::Write(std::ostream &out, bool binary) const { - WriteToken(out, binary, ""); - WriteToken(out, binary, ""); - WriteBasicType(out, binary, dim_); - WriteToken(out, binary, ""); - stats_.Write(out, binary); - WriteToken(out, binary, ""); -} - -void FmllrSgmmAccs::Read(std::istream &in, bool binary, bool add) { - ExpectToken(in, binary, ""); - ExpectToken(in, binary, ""); - ReadBasicType(in, binary, &dim_); - KALDI_ASSERT(dim_ > 0); - ExpectToken(in, binary, ""); - stats_.Read(in, binary, add); - ExpectToken(in, binary, ""); -} - - -static BaseFloat CalcFmllrStepSize(const AffineXformStats &stats, - const AmSgmm &sgmm, - const MatrixBase &Delta, - const MatrixBase &A, - const Matrix &G, - int32 max_iters) { - int32 dim = sgmm.FeatureDim(); - Matrix Delta_d(Delta); - Matrix G_d(G); - SubMatrix Delta_C(Delta_d, 0, dim, 0, dim); - - // Eq. (B.28): m = tr(\Delta K^T) - tr(\Delta S^T) - BaseFloat m = TraceMatMat(Delta_d, stats.K_, kTrans) - - TraceMatMat(Delta_d, G_d, kTrans); - // Eq. (B.29): n = \sum_i tr(\Delta \Sigma_{i}^{-1} \Delta S_{i}) - BaseFloat n = 0; - SpMatrix inv_covar; - for (int32 i = 0, num_gauss = sgmm.NumGauss(); i < num_gauss; i++) { - sgmm.GetInvCovars(i, &inv_covar); - n += TraceMatSpMatSp(Delta_d, kTrans, inv_covar, Delta_d, kNoTrans, - stats.G_[i]); - } - - BaseFloat step_size = 0.0; - // initialize just to get rid of compile errors. - BaseFloat obj_step_old, obj_step_new = 0.0; - Matrix new_A(dim, dim); - Matrix B(dim, dim); - for (int32 iter_step = 0; iter_step < max_iters; iter_step++) { - if (iter_step == 0) { - obj_step_old = stats.beta_ * A.LogDet(); // Q_0 = \beta * log det(A) - } else { - obj_step_old = obj_step_new; - } - - // Eq. (B.30); B = (A + k\Delta^{-C})^{-1} \Delta^{-C} - new_A.CopyFromMat(A); - new_A.AddMat(step_size, Delta_C, kNoTrans); - new_A.InvertDouble(); - B.AddMatMat(1.0, new_A, kNoTrans, Delta_C, kNoTrans, 0.0); - - BaseFloat d = m - step_size * n + stats.beta_ * TraceMat(B); - BaseFloat d2 = -n - stats.beta_ * TraceMatMat(B, B, kNoTrans); - if (std::fabs(d / d2) < 0.000001) { break; } // converged - - BaseFloat step_size_change = -(d / d2); - step_size += step_size_change; // Eq. (B.33) - - // Halve step size when the auxiliary function decreases. - do { - new_A.CopyFromMat(A); - new_A.AddMat(step_size, Delta_C, kNoTrans); - BaseFloat logdet = new_A.LogDet(); - obj_step_new = stats.beta_ * logdet + step_size * m - - 0.5 * step_size * step_size * n; - - if (obj_step_new - obj_step_old < -0.001) { - KALDI_WARN << "Objective function decreased (" << obj_step_old << "->" - << obj_step_new << "). Halving step size change (" - << step_size << " -> " << (step_size - (step_size_change/2)) - << ")"; - step_size_change /= 2; - step_size -= step_size_change; // take away half of our step - } // Facing numeric precision issues. Compute in double? - } while (obj_step_new - obj_step_old < -0.001 && step_size_change > 1e-05); - } - return step_size; -} - - -bool FmllrSgmmAccs::Update(const AmSgmm &sgmm, - const SgmmFmllrGlobalParams &globals, - const SgmmFmllrConfig &opts, - Matrix *out_xform, - BaseFloat *frame_count, BaseFloat *auxf_out) const { - BaseFloat auxf_improv = 0.0, logdet = 0.0; - KALDI_ASSERT(out_xform->NumRows() == dim_ && out_xform->NumCols() == dim_+1); - BaseFloat mincount = (globals.HasBasis() ? - std::min(opts.fmllr_min_count_basis, opts.fmllr_min_count_full) : - opts.fmllr_min_count); - bool using_subspace = (globals.HasBasis() ? - (stats_.beta_ < opts.fmllr_min_count_full) : false); - - if (globals.IsEmpty()) - KALDI_ERR << "Must set up pre-transforms before estimating FMLLR."; - - KALDI_VLOG(1) << "Mincount = " << mincount << "; Basis: " - << std::string(globals.HasBasis()? "yes; " : "no; ") - << "Using subspace: " << std::string(using_subspace? "yes; " - : "no; "); - - int32 num_bases = 0; - if (using_subspace) { - KALDI_ASSERT(globals.fmllr_bases_.size() != 0); - int32 max_bases = std::min(static_cast(globals.fmllr_bases_.size()), - opts.num_fmllr_bases); - num_bases = (opts.bases_occ_scale <= 0.0)? max_bases : - std::min(max_bases, static_cast(std::floor(opts.bases_occ_scale - * stats_.beta_))); - KALDI_VLOG(1) << "Have " << stats_.beta_ << " frames for speaker: Using " - << num_bases << " fMLLR bases."; - } - - // initialization just to get rid of compile errors. - BaseFloat auxf_old = 0, auxf_new = 0; - if (frame_count != NULL) *frame_count = stats_.beta_; - - // If occupancy is greater than the min count, update the transform - if (stats_.beta_ >= mincount) { - for (int32 iter = 0; iter < opts.fmllr_iters; iter++) { - Matrix grad(dim_, dim_ + 1, kSetZero); - Matrix G(dim_, dim_ + 1, kSetZero); - auxf_new = this->FmllrObjGradient(sgmm, *out_xform, &grad, &G); - - // For diagnostic purposes - KALDI_VLOG(3) << "Iter " << iter << ": Auxiliary function = " - << (auxf_new / stats_.beta_) << " per frame over " << stats_.beta_ - << " frames"; - - if (iter > 0) { - // For diagnostic purposes - KALDI_VLOG(2) << "Iter " << iter << ": Auxiliary function improvement: " - << ((auxf_new - auxf_old) / stats_.beta_) << " per frame over " - << (stats_.beta_) << " frames"; - auxf_improv += auxf_new - auxf_old; - } - - Matrix pre_xformed_grad(dim_, dim_ + 1, kSetZero); - ApplyPreXformToGradient(globals, grad, &pre_xformed_grad); -// std::cout << "Pre-X Grad = " << pre_xformed_grad << std::endl; - - // Transform P_sk with the Hessian - Matrix hess_xformed_grad(dim_, dim_ + 1, kSetZero); - ApplyHessianXformToGradient(globals, pre_xformed_grad, - &hess_xformed_grad); -// std::cout << "Hess-X Grad = " << hess_xformed_grad << std::endl; - - // Update the actual FMLLR transform matrices - Matrix hess_xformed_delta(dim_, dim_ + 1, kUndefined); - if (using_subspace) { - // Note that in this case we can simply store the speaker-specific - // coefficients for each of the basis matrices. The current - // implementation stores the computed transform to simplify the code! - hess_xformed_delta.SetZero(); - for (int32 b = 0; b < num_bases; b++) { // Eq (B.20) - hess_xformed_delta.AddMat(TraceMatMat(globals.fmllr_bases_[b], - hess_xformed_grad, kTrans), - globals.fmllr_bases_[b], kNoTrans); - } - hess_xformed_delta.Scale(1 / stats_.beta_); - } else { - hess_xformed_delta.CopyFromMat(hess_xformed_grad); - hess_xformed_delta.Scale(1 / stats_.beta_); // Eq. (B.19) - } - -// std::cout << "Hess-X Delta = " << hess_xformed_delta << std::endl; - - // Transform Delta with the Hessian - Matrix pre_xformed_delta(dim_, dim_ + 1, kSetZero); - ApplyInvHessianXformToChange(globals, hess_xformed_delta, - &pre_xformed_delta); - - // Apply inverse pre-transform to Delta - Matrix delta(dim_, dim_ + 1, kSetZero); - ApplyInvPreXformToChange(globals, pre_xformed_delta, &delta); - -#ifdef KALDI_PARANOID - // Check whether co-ordinate transformation is correct. - { - BaseFloat tr1 = TraceMatMat(delta, grad, kTrans); - BaseFloat tr2 = TraceMatMat(pre_xformed_delta, pre_xformed_grad, - kTrans); - BaseFloat tr3 = TraceMatMat(hess_xformed_delta, hess_xformed_grad, - kTrans); - AssertEqual(tr1, tr2, 1e-5); - AssertEqual(tr2, tr3, 1e-5); - } -#endif - - // Calculate the optimal step size - SubMatrix A(*out_xform, 0, dim_, 0, dim_); - BaseFloat step_size = CalcFmllrStepSize(stats_, sgmm, delta, A, G, - opts.fmllr_iters); - - // Update: W <-- W + k \Delta Eq. (B.34) - out_xform->AddMat(step_size, delta, kNoTrans); - auxf_old = auxf_new; - - // Check the objective function change for the last iteration - if (iter == opts.fmllr_iters - 1) { - auxf_new = this->FmllrObjGradient(sgmm, *out_xform, NULL, NULL); - logdet = A.LogDet(); - // SubMatrix A points to the memory location of out_xform, and so will - // contain the updated value - - KALDI_VLOG(2) << "Iter " << iter << ": Auxiliary function improvement: " - << ((auxf_new - auxf_old) / stats_.beta_) << " per frame over " - << (stats_.beta_) << " frames"; - auxf_improv += auxf_new - auxf_old; - } - } - if (auxf_out != NULL) *auxf_out = auxf_improv; - auxf_improv /= (stats_.beta_ + 1.0e-10); - - KALDI_LOG << "Auxiliary function improvement for FMLLR = " << auxf_improv - << " per frame over " << stats_.beta_ << " frames. Log-determinant = " - << logdet; - return true; - } else { - KALDI_ASSERT(stats_.beta_ < mincount); -// std::cerr.precision(10); -// std::cerr.setf(std::ios::fixed,std::ios::floatfield); - KALDI_WARN << "Not updating FMLLR because count is " << stats_.beta_ - << " < " << (mincount); - if (auxf_out != NULL) *auxf_out = 0.0; - return false; - } // Do not use the transform if it does not have enough counts - KALDI_ASSERT(false); // Should never be reached. -} - -void EstimateSgmmFmllrSubspace(const SpMatrix &fmllr_grad_scatter, - int32 num_fmllr_bases, int32 feat_dim, - SgmmFmllrGlobalParams *globals, double min_eig) { - KALDI_ASSERT(num_fmllr_bases > 0 && feat_dim > 0); - if (num_fmllr_bases > feat_dim * (feat_dim + 1)) { - num_fmllr_bases = feat_dim * (feat_dim + 1); - KALDI_WARN << "Limiting number of fMLLR bases to be the same as transform " - << "dimension."; - } - - vector< Matrix > &fmllr_bases(globals->fmllr_bases_); - - Vector s(fmllr_grad_scatter.NumRows()); - Matrix U(fmllr_grad_scatter.NumRows(), - fmllr_grad_scatter.NumRows()); - try { - fmllr_grad_scatter.Eig(&s, &U); - SortSvd(&s, &U); // in case was not exactly sorted. - KALDI_VLOG(1) << "Eigenvalues (max 200) of CMLLR scatter are: " - << (SubVector(s, 0, - std::min(static_cast(200), - s.Dim()))); - -// for (int32 b = 2; b < num_fmllr_bases; b++) { -// if (s(b) < min_eig) { -// num_fmllr_bases = b; -// KALDI_WARN << "Limiting number of fMLLR bases to " << num_fmllr_bases -// << " because of small eigenvalues."; -// break; -// } -// } - - U.Transpose(); // Now the rows of U correspond to the basis vectors. - fmllr_bases.resize(num_fmllr_bases); - for (int32 b = 0; b < num_fmllr_bases; b++) { - fmllr_bases[b].Resize(feat_dim, feat_dim + 1, kSetZero); - fmllr_bases[b].CopyRowsFromVec(U.Row(b)); - } - KALDI_LOG << "Estimated " << num_fmllr_bases << " fMLLR basis matrices."; - } catch(const std::exception &e) { - KALDI_WARN << "Not estimating FMLLR bases because of a thrown exception:\n" - << e.what(); - fmllr_bases.resize(0); - } -} // End of EstimateSgmmFmllrSubspace - - -} // namespace kaldi - diff --git a/src/sgmm/fmllr-sgmm.h b/src/sgmm/fmllr-sgmm.h deleted file mode 100644 index 832093e39ad..00000000000 --- a/src/sgmm/fmllr-sgmm.h +++ /dev/null @@ -1,192 +0,0 @@ -// sgmm/fmllr-sgmm.h - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#ifndef KALDI_SGMM_FMLLR_SGMM_H_ -#define KALDI_SGMM_FMLLR_SGMM_H_ - -#include -#include - -#include "base/kaldi-common.h" -#include "sgmm/am-sgmm.h" -#include "transform/transform-common.h" -#include "util/kaldi-table.h" -#include "util/kaldi-holder.h" -#include "itf/options-itf.h" - -namespace kaldi { - -/** \struct SgmmFmllrConfig - * Configuration variables needed in the estimation of FMLLR for SGMMs. - */ -struct SgmmFmllrConfig { - int32 fmllr_iters; ///< Number of iterations in FMLLR estimation. - int32 step_iters; ///< Iterations to find optimal FMLLR step size. - /// Minimum occupancy count to estimate FMLLR using basis matrices. - BaseFloat fmllr_min_count_basis; - /// Minimum occupancy count to estimate FMLLR without basis matrices. - BaseFloat fmllr_min_count; - /// Minimum occupancy count to stop using FMLLR bases and switch to - /// regular FMLLR estimation. - BaseFloat fmllr_min_count_full; - /// Number of basis matrices to use for FMLLR estimation. Can only *reduce* - /// the number of bases present. Overridden by the 'bases_occ_scale' option. - int32 num_fmllr_bases; - /// Scale per-speaker count to determine number of CMLLR bases. - BaseFloat bases_occ_scale; - - SgmmFmllrConfig() { - fmllr_iters = 5; - step_iters = 10; - fmllr_min_count_basis = 100.0; - fmllr_min_count = 1000.0; - fmllr_min_count_full = 5000.0; - num_fmllr_bases = 50; - bases_occ_scale = 0.2; - } - - void Register(OptionsItf *opts); -}; - -inline void SgmmFmllrConfig::Register(OptionsItf *opts) { - std::string module = "SgmmFmllrConfig: "; - opts->Register("fmllr-iters", &fmllr_iters, module+ - "Number of iterations in FMLLR estimation."); - opts->Register("fmllr-step-iters", &step_iters, module+ - "Number of iterations to find optimal FMLLR step size."); - opts->Register("fmllr-min-count-bases", &fmllr_min_count_basis, module+ - "Minimum occupancy count to estimate FMLLR using basis matrices."); - opts->Register("fmllr-min-count", &fmllr_min_count, module+ - "Minimum occupancy count to estimate FMLLR (without bases)."); - opts->Register("fmllr-min-count-full", &fmllr_min_count_full, module+ - "Minimum occupancy count to stop using basis matrices for FMLLR."); - opts->Register("fmllr-num-bases", &num_fmllr_bases, module+ - "Number of FMLLR basis matrices."); - opts->Register("fmllr-bases-occ-scale", &bases_occ_scale, module+ - "Scale per-speaker count to determine number of CMLLR bases."); -} - - -/** \class SgmmFmllrGlobalParams - * Global adaptation parameters. - */ -class SgmmFmllrGlobalParams { - public: - void Init(const AmSgmm &sgmm, const Vector &state_occs); - void Write(std::ostream &out_stream, bool binary) const; - void Read(std::istream &in_stream, bool binary); - bool IsEmpty() const { - return (pre_xform_.NumRows() == 0 || inv_xform_.NumRows() == 0 || - mean_scatter_.Dim() == 0); - } - bool HasBasis() const { return fmllr_bases_.size() != 0; } - - /// Pre-transform matrix. Dim is [D][D+1]. - Matrix pre_xform_; - /// Inverse of pre-transform. Dim is [D][D+1]. - Matrix inv_xform_; - /// Diagonal of mean-scatter matrix. Dim is [D] - Vector mean_scatter_; - /// \tilde{W}_b. [b][d][d], dim is [B][D][D+1]. - std::vector< Matrix > fmllr_bases_; -}; - -inline void SgmmFmllrGlobalParams::Init(const AmSgmm &sgmm, - const Vector &state_occs) { - sgmm.ComputeFmllrPreXform(state_occs, &pre_xform_, &inv_xform_, - &mean_scatter_); -} - -/** \class FmllrSgmmAccs - * Class for computing the accumulators needed for the maximum-likelihood - * estimate of FMLLR transforms for a subspace GMM acoustic model. - */ -class FmllrSgmmAccs { - public: - FmllrSgmmAccs() : dim_(-1) {} - ~FmllrSgmmAccs() {} - - void Init(int32 dim, int32 num_gaussians); - void SetZero() { stats_.SetZero(); } - - void Write(std::ostream &out_stream, bool binary) const; - void Read(std::istream &in_stream, bool binary, bool add); - - /// Accumulation routine that computes the Gaussian posteriors and calls - /// the AccumulateFromPosteriors function with the computed posteriors. - /// The 'data' argument is not FMLLR-transformed and is needed in addition - /// to the the 'frame_vars' since the latter only contains a copy of the - /// transformed feature vector. - BaseFloat Accumulate(const AmSgmm &sgmm, - const SgmmPerSpkDerivedVars &spk, - const VectorBase &data, - const SgmmPerFrameDerivedVars &frame_vars, - int32 state_index, BaseFloat weight); - - void AccumulateFromPosteriors(const AmSgmm &sgmm, - const SgmmPerSpkDerivedVars &spk, - const VectorBase &data, - const std::vector &gauss_select, - const Matrix &posteriors, - int32 state_index); - - void AccumulateForFmllrSubspace(const AmSgmm &sgmm, - const SgmmFmllrGlobalParams &fmllr_globals, - SpMatrix *grad_scatter); - - BaseFloat FmllrObjGradient(const AmSgmm &sgmm, - const Matrix &xform, - Matrix *grad_out, - Matrix *G_out) const; - - /// Computes the FMLLR transform from the accumulated stats, using the - /// pre-transforms in fmllr_globals. Expects the transform matrix out_xform - /// to be initialized to the correct size. Returns true if the transform was - /// updated (i.e. had enough counts). - bool Update(const AmSgmm &model, - const SgmmFmllrGlobalParams &fmllr_globals, - const SgmmFmllrConfig &opts, Matrix *out_xform, - BaseFloat *frame_count, BaseFloat *auxf_improv) const; - - /// Accessors - int32 Dim() const { return dim_; } - const AffineXformStats &stats() const { return stats_; } - - private: - AffineXformStats stats_; ///< Accumulated stats - int32 dim_; ///< Dimension of feature vectors - - // Cannot have copy constructor and assigment operator - KALDI_DISALLOW_COPY_AND_ASSIGN(FmllrSgmmAccs); -}; - -/// Computes the fMLLR basis matrices given the scatter of the vectorized -/// gradients (eq: B.10). The result is stored in 'fmllr_globals'. -/// The actual number of bases may be less than 'num_fmllr_bases' depending -/// on the feature dimension and number of eigenvalues greater than 'min_eig'. -void EstimateSgmmFmllrSubspace(const SpMatrix &fmllr_grad_scatter, - int32 num_fmllr_bases, int32 feat_dim, - SgmmFmllrGlobalParams *fmllr_globals, - double min_eig = 0.0); - -} // namespace kaldi - -#endif // KALDI_SGMM_FMLLR_SGMM_H_ diff --git a/src/sgmm/sgmm-clusterable.cc b/src/sgmm/sgmm-clusterable.cc deleted file mode 100644 index f49f4f993f2..00000000000 --- a/src/sgmm/sgmm-clusterable.cc +++ /dev/null @@ -1,280 +0,0 @@ -// sgmm/sgmm-clusterable.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "sgmm/sgmm-clusterable.h" -#include "hmm/hmm-utils.h" - -namespace kaldi { - -void SgmmClusterable::Accumulate( - const SgmmPerFrameDerivedVars &per_frame_vars, - int32 j, // state index in original SGMM. - BaseFloat weight) { - Matrix post; - KALDI_ASSERT(weight >= 0.0); // Doesn't make sense to use negative weights here. - // Compute Gaussian-level posteriors. - // Note: "post" is indexed by Gaussian-selection index. - sgmm_.ComponentPosteriors(per_frame_vars, j, &post); - if (weight != 1.0) post.Scale(weight); - const std::vector &gselect = per_frame_vars.gselect; - for (int32 ki = 0; ki < gselect.size(); ki++) { - int32 i = gselect[ki]; - BaseFloat gamma = 0.0; // Sum the weight over all the vectors (index m) in - // the state. In sensible cases there should be just one vector per state - // at the point where we do this, though. - for (int32 m = 0; m < post.NumCols(); m++) gamma += post(ki, m); - gamma_(i) += gamma; - y_.AddVec(gamma, per_frame_vars.zti.Row(ki)); - } - // Invalidate my_H_, if present, since it's not efficient to - // keep it updated during accumulation. - if (my_H_.NumRows() != 0) - my_H_.Resize(0); -} - -BaseFloat SgmmClusterable::Objf() const { - // Objective function consists of the expected log-likelihood of - // a weight (assuming we estimate the weights directly as parameters - // instead of the whole subspace thing on the weights), plus - // the auxiliary function improvement we would get from estimating - // the state vector v_j starting from zero. Note: zero is an - // arbitrary starting point-- we could use any value as long as - // we were consistent. - KALDI_ASSERT(static_cast(H_.size()) == sgmm_.NumGauss()); - if (my_H_.NumRows() == 0.0) { - SgmmClusterable *s = static_cast(this->Copy()); // will - // set up my_H_, which we need. - BaseFloat ans = s->Objf(); - delete s; - return ans; - } - double ans = 0.0; - double tot_gamma = gamma_.Sum(), tot_gamma2 = 0.0; - if (tot_gamma == 0.0) return 0.0; - int32 I = gamma_.Dim(); - - for (int32 i = 0; i < I; i++) { - double gamma = gamma_(i); - if (gamma > 0.0) { // Note: should not be negative-- if it is, due to - double prob = gamma / tot_gamma; - if (prob > 0.0) { // Note: prob could be zero due to underflow-- this - // happened! [we can get tiny values due to floating-point roundoff - // while subtracting clusterable objects]. - ans += gamma * Log(gamma / tot_gamma); - } - } - tot_gamma2 += gamma; - } - if (tot_gamma2 == 0.0) - return 0.0; // No positive elements... maybe small negative ones were from - // round off. - - // objf improvement is y^T H^{-1} y. - // We'll try to compute this using Cholesky, first, which is more - // efficient; if this fails or appears to lead to big values, - // we'll back off to a more efficient SVD-based implementation. - try { - TpMatrix C(my_H_.NumRows()); - C.Cholesky(my_H_); - C.Invert(); - for (int32 i = 0; i < C.NumRows(); i++) - if (fabs(C(i, i)) > 100.0) { - KALDI_VLOG(3) << "Condion-number probably bad: element is " - << C(i, i); - throw std::runtime_error("Bad condition number"); // back off to SVD. - } - // Note: assuming things are well preconditioned, the elements - // C(i,i) should be of the rough magnitude 1/sqrt(count). - Vector yC(C.NumRows()); - // Note: if we decompose H = C C^T, then the line below - // does yC = C^{-1} y. Note: we are computing the inner - // product y^T H^{-1} y. H^{-1} = C^{-T} C^{-1}, so - // y^T H^{-1} y = y^T C^{-T} C^{-1} y = yC^T yC. - yC.AddTpVec(1.0, C, kNoTrans, y_, 0.0); - ans += 0.5 * VecVec(yC, yC); - } catch (...) { // Choleksy threw, or we detected bad condition. - // we'll do this using an SVD-based implementation that will - // deal with non-invertible matrices. - KALDI_VLOG(3) << "Backing off to SVD-based objective computation."; - Vector v(y_.Dim()); // Initialized automatically to zero. - ans += SolveQuadraticProblem(my_H_, y_, SolverOptions(), &v); // The objective function - // change from estimating this vector. - } - return ans; -} - -void SgmmClusterable::SetZero() { - gamma_.SetZero(); - y_.SetZero(); - my_H_.SetZero(); // Should work even if empty. -} - -void SgmmClusterable::Add(const Clusterable &other_in) { - const SgmmClusterable *other = - static_cast(&other_in); - gamma_.AddVec(1.0, other->gamma_); - y_.AddVec(1.0, other->y_); - if (!H_.empty()) { // we need to compute my_H_. - if (my_H_.NumRows() != 0 && other->my_H_.NumRows() != 0) - my_H_.AddSp(1.0, other->my_H_); - else { - my_H_.Resize(0); - ComputeH(); - } - } -} - -void SgmmClusterable::Sub(const Clusterable &other_in) { - const SgmmClusterable *other = - static_cast(&other_in); - gamma_.AddVec(-1.0, other->gamma_); - y_.AddVec(-1.0, other->y_); - if (!H_.empty()) { - if (my_H_.NumRows() != 0 && other->my_H_.NumRows() != 0) - my_H_.AddSp(-1.0, other->my_H_); - else { - my_H_.Resize(0); - ComputeH(); - } - } -} - -BaseFloat SgmmClusterable::Normalizer() const { - return gamma_.Sum(); -} - -Clusterable *SgmmClusterable::Copy() const { - SgmmClusterable *ans = new SgmmClusterable(sgmm_, H_); - ans->gamma_.CopyFromVec(gamma_); - ans->y_.CopyFromVec(y_); - if (!H_.empty()) { - if (my_H_.NumRows() == 0.0) ans->ComputeH(); - else { - ans->my_H_.Resize(my_H_.NumRows()); - ans->my_H_.CopyFromSp(my_H_); - } - } - return ans; -} - -void SgmmClusterable::Scale(BaseFloat f) { - KALDI_ASSERT(f >= 0.0); - gamma_.Scale(f); - y_.Scale(f); - if (my_H_.NumRows() != 0) my_H_.Scale(f); -} - -void SgmmClusterable::Write(std::ostream &os, bool binary) const { - gamma_.Write(os, binary); - y_.Write(os, binary); -} - -Clusterable *SgmmClusterable::ReadNew(std::istream &is, bool binary) const { - SgmmClusterable *ans = new SgmmClusterable(sgmm_, H_); - ans->gamma_.Read(is, binary); - ans->y_.Read(is, binary); - if (!H_.empty()) ans->ComputeH(); - return ans; -} - - -bool AccumulateSgmmTreeStats(const TransitionModel &trans_model, - const AmSgmm &am_sgmm, - const std::vector > &H, - int N, // context window size. - int P, // central position. - const std::vector &ci_phones, // must be sorted - const std::vector &alignment, - const std::vector > &gselect, - const SgmmPerSpkDerivedVars &per_spk_vars, - const Matrix &features, - std::map *stats) { - KALDI_ASSERT(IsSortedAndUniq(ci_phones)); - std::vector > split_alignment; - bool ans = SplitToPhones(trans_model, alignment, &split_alignment); - if (!ans) { - KALDI_WARN << "AccumulateTreeStats: bad alignment."; - return false; - } - int t = 0; - SgmmPerFrameDerivedVars per_frame_vars; - - KALDI_ASSERT(features.NumRows() == static_cast(alignment.size()) - && alignment.size() == gselect.size()); - for (int i = -N; i < static_cast(split_alignment.size()); i++) { - // consider window starting at i, only if i+P is within - // list of phones. - if (i + P >= 0 && i + P < static_cast(split_alignment.size())) { - int32 central_phone = trans_model.TransitionIdToPhone(split_alignment[i+P][0]); - bool is_ctx_dep = ! std::binary_search(ci_phones.begin(), - ci_phones.end(), - central_phone); - EventType evec; - for (int j = 0; j < N; j++) { - int phone; - if (i + j >= 0 && i + j < static_cast(split_alignment.size())) - phone = trans_model.TransitionIdToPhone(split_alignment[i+j][0]); - else - phone = 0; // ContextDependency class uses 0 to mean "out of window". - - if (is_ctx_dep || j == P) - evec.push_back(std::make_pair(static_cast(j), static_cast(phone))); - } - for (int j = 0; j < static_cast(split_alignment[i+P].size());j++) { - // for central phone of this window... - EventType evec_more(evec); - int32 pdf_id = trans_model.TransitionIdToPdf(split_alignment[i+P][j]), - pdf_class = trans_model.TransitionIdToPdfClass(split_alignment[i+P][j]); - // pdf_id represents the acoustic state in the current model. - // pdf_class will normally by 0, 1 or 2 for a 3-state HMM. - - std::pair pr(kPdfClass, pdf_class); - evec_more.push_back(pr); - std::sort(evec_more.begin(), evec_more.end()); // these must be sorted! - if (stats->count(evec_more) == 0) - (*stats)[evec_more] = new SgmmClusterable(am_sgmm, H); - - am_sgmm.ComputePerFrameVars(features.Row(t), gselect[t], per_spk_vars, 0.0, - &per_frame_vars); - BaseFloat weight = 1.0; // weight is one, since we have alignment. - (*stats)[evec_more]->Accumulate(per_frame_vars, pdf_id, weight); - t++; - } - } - } - KALDI_ASSERT(t == static_cast(alignment.size())); - return true; -} - -void SgmmClusterable::ComputeH() { - // We're computing my_H_, as a weighted sum of H_, with gamma_ as the - // weights. - KALDI_ASSERT(!H_.empty() && my_H_.NumRows() == 0); // Invalid to call this if H_ empty, - // or my_H_ already set up. - my_H_.Resize(H_[0].NumRows()); // will initialize to zero. - KALDI_ASSERT(static_cast(H_.size()) == gamma_.Dim()); - for (int32 i = 0; i < gamma_.Dim(); i++) { - double gamma = gamma_(i); - if (gamma > 0.0) my_H_.AddSp(gamma, H_[i]); - } -} - - -} // end namespace kaldi diff --git a/src/sgmm/sgmm-clusterable.h b/src/sgmm/sgmm-clusterable.h deleted file mode 100644 index 9a44fce1512..00000000000 --- a/src/sgmm/sgmm-clusterable.h +++ /dev/null @@ -1,112 +0,0 @@ -// sgmm/sgmm-clusterable.h - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_SGMM_SGMM_CLUSTERABLE_H_ -#define KALDI_SGMM_SGMM_CLUSTERABLE_H_ - -#include -#include - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "itf/clusterable-itf.h" - -namespace kaldi { - -/// This header defines an object that can be used to create decision -/// trees using a form of SGMM statistics. It is analogous to the -/// GaussClusterable object, but uses the SGMM. The auxiliary function -/// it uses is related to the normal SGMM auxiliary function, but for -/// efficiency it uses a simpler model on the weights, which is equivalent -/// to assuming the weights w_{ji} [there no index m since we assume one -/// mixture per state!] are directly estimated using ML, instead of being -/// computed from v_j and w_i as in the actual SGMM. - -class SgmmClusterable: public Clusterable { - public: - SgmmClusterable(const AmSgmm &sgmm, - const std::vector< SpMatrix > &H): // H can be empty vector - // at initialization. Used to cache something from the model. - sgmm_(sgmm), - H_(H), - gamma_(sgmm.NumGauss()), - y_(sgmm.PhoneSpaceDim()) { } - virtual std::string Type() const { return "sgmm"; } - - /// compare with the Accumulate function of MleAmSgmmAccs - /// Note: the pdf-index j, relating to the original SGMM - /// in sgmm_, is only needed to select the right vector to - /// compute Gaussian-level alignments with. - void Accumulate(const SgmmPerFrameDerivedVars &frame_vars, - int32 j, - BaseFloat weight); - - virtual BaseFloat Objf() const; - virtual void SetZero(); - virtual void Add(const Clusterable &other_in); - virtual void Sub(const Clusterable &other_in); - virtual BaseFloat Normalizer() const; - virtual Clusterable *Copy() const; - virtual void Scale(BaseFloat f); - virtual void Write(std::ostream &os, bool binary) const; - virtual Clusterable *ReadNew(std::istream &is, bool binary) const; - virtual ~SgmmClusterable() {} - - const Vector &gamma () const { return gamma_; } - const Vector &y() const { return y_; } - private: - void ComputeH(); // Compute the quantity my_H_, from gamma_ and H_. - - const AmSgmm &sgmm_; // Reference to the SGMM object, needed to compute - // objective functions. - const std::vector< SpMatrix > &H_; // Reference to a vector of SpMatrix which - // should have been computed from the model using ComputeH(). Needed for Objf() function. - Vector gamma_; // Occupation counts for each Gaussian index. Comparable - // to the gamma_{jmi} statistics in the SGMM paper. - Vector y_; // Statistics comparable to the y_{jm} statistics in the SGMM - // paper. - - SpMatrix my_H_; // This quantity is a weighted sum over the H quantities, - // weighted by gamma_(i). It's only nonempty if the H_ matrix is nonempty. - // This quantity is never written to disk; it is to be viewed as a kind of - // cache, present only for purposes of fast objective-function computation. -}; - - -/// Comparable to AccumulateTreeStats, but this version -/// accumulates stats of type SgmmClusterable. Returns -/// true on success. -bool AccumulateSgmmTreeStats(const TransitionModel &trans_model, - const AmSgmm &am_sgmm, - const std::vector > &H, // this is a ref. to temp. - // storage needed in the clusterable class... can be empty - // during accumulation as it doesn't call Objf(). - int N, // context window size. - int P, // central position. - const std::vector &ci_phones, // must be sorted - const std::vector &alignment, - const std::vector > &gselect, - const SgmmPerSpkDerivedVars &per_spk_vars, - const Matrix &features, - std::map *stats); - - -} // end namespace kaldi - -#endif // KALDI_SGMM_SGMM_CLUSTERABLE_H_ diff --git a/src/sgmmbin/Makefile b/src/sgmmbin/Makefile deleted file mode 100644 index 556001910e1..00000000000 --- a/src/sgmmbin/Makefile +++ /dev/null @@ -1,31 +0,0 @@ - -all: -EXTRA_CXXFLAGS = -Wno-sign-compare -include ../kaldi.mk - -BINFILES = init-ubm sgmm-align-compiled sgmm-acc-stats-ali \ - sgmm-sum-accs sgmm-est sgmm-decode-faster sgmm-init sgmm-gselect \ - sgmm-est-fmllr sgmm-acc-stats sgmm-est-spkvecs sgmm-post-to-gpost \ - sgmm-acc-stats-gpost sgmm-est-spkvecs-gpost sgmm-comp-prexform \ - sgmm-est-fmllr-gpost sgmm-acc-fmllrbasis-ali sgmm-est-fmllrbasis \ - sgmm-calc-distances sgmm-normalize sgmm-latgen-simple \ - sgmm-latgen-faster sgmm-rescore-lattice sgmm-copy sgmm-write-ubm \ - sgmm-mixup sgmm-info sgmm-acc-tree-stats sgmm-sum-tree-stats \ - sgmm-build-tree sgmm-cluster-phones sgmm-init-from-tree-stats \ - sgmm-est-ebw sgmm-acc-stats2 sgmm-est-multi - -OBJFILES = - - - -TESTFILES = - - -ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ - ../fstext/kaldi-fstext.a ../sgmm/kaldi-sgmm.a ../hmm/kaldi-hmm.a \ - ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ - ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a - -include ../makefiles/default_rules.mk diff --git a/src/sgmmbin/init-ubm.cc b/src/sgmmbin/init-ubm.cc deleted file mode 100644 index 3a0d398b7f6..00000000000 --- a/src/sgmmbin/init-ubm.cc +++ /dev/null @@ -1,95 +0,0 @@ -// sgmmbin/init-ubm.cc - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "util/kaldi-io.h" -#include "gmm/diag-gmm.h" -#include "gmm/full-gmm.h" -#include "gmm/am-diag-gmm.h" -#include "hmm/transition-model.h" - - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - typedef kaldi::BaseFloat BaseFloat; - - const char *usage = - "Cluster the Gaussians in a diagonal-GMM acoustic model\n" - "to a single full-covariance or diagonal-covariance GMM.\n" - "Usage: init-ubm [options] \n"; - - bool binary_write = true, fullcov_ubm = true; - kaldi::ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("fullcov-ubm", &fullcov_ubm, "Write out full covariance UBM."); - kaldi::UbmClusteringOptions ubm_opts; - ubm_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - ubm_opts.Check(); - - std::string model_in_filename = po.GetArg(1), - occs_in_filename = po.GetArg(2), - gmm_out_filename = po.GetArg(3); - - kaldi::AmDiagGmm am_gmm; - kaldi::TransitionModel trans_model; - { - bool binary_read; - kaldi::Input ki(model_in_filename, &binary_read); - trans_model.Read(ki.Stream(), binary_read); - am_gmm.Read(ki.Stream(), binary_read); - } - - kaldi::Vector state_occs; - state_occs.Resize(am_gmm.NumPdfs()); - { - bool binary_read; - kaldi::Input ki(occs_in_filename, &binary_read); - state_occs.Read(ki.Stream(), binary_read); - } - - kaldi::DiagGmm ubm; - ClusterGaussiansToUbm(am_gmm, state_occs, ubm_opts, &ubm); - if (fullcov_ubm) { - kaldi::FullGmm full_ubm; - full_ubm.CopyFromDiagGmm(ubm); - kaldi::Output ko(gmm_out_filename, binary_write); - full_ubm.Write(ko.Stream(), binary_write); - } else { - kaldi::Output ko(gmm_out_filename, binary_write); - ubm.Write(ko.Stream(), binary_write); - } - - KALDI_LOG << "Written UBM to " << gmm_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc b/src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc deleted file mode 100644 index 3c33e47dde2..00000000000 --- a/src/sgmmbin/sgmm-acc-fmllrbasis-ali.cc +++ /dev/null @@ -1,216 +0,0 @@ -// sgmmbin/sgmm-acc-fmllrbasis-ali.cc - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "util/common-utils.h" -#include "hmm/transition-model.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/fmllr-sgmm.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - const char *usage = - "Accumulate stats for FMLLR bases training.\n" - "Usage: sgmm-acc-fmllrbasis-ali [options] " - " \n" - "e.g.: sgmm-acc-fmllrbasis-ali 1.mdl scp:train.scp ark:1.ali 1.acc\n"; - - ParseOptions po(usage); - bool binary_write = true; - std::string gselect_rspecifier, spkvecs_rspecifier, silphones_str; - BaseFloat sil_weight = 0.0; - kaldi::SgmmGselectConfig sgmm_opts; - po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("gselect", &gselect_rspecifier, - "Precomputed Gaussian indices (rspecifier)"); - po.Register("spk-vecs", &spkvecs_rspecifier, - "Speaker vectors to use during aligment (rspecifier)"); - po.Register("sil-phone-list", &silphones_str, - "Colon-separated list of phones (to weigh differently)"); - po.Register("sil-weight", &sil_weight, "Weight for \"silence\" phones."); - sgmm_opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - alignments_rspecifier = po.GetArg(3), - spk2utt_rspecifier = po.GetArg(4), - accs_wxfilename = po.GetArg(5); - - typedef kaldi::int32 int32; - - AmSgmm am_sgmm; - TransitionModel trans_model; - SgmmFmllrGlobalParams fmllr_globals; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - fmllr_globals.Read(ki.Stream(), binary); - } - - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier); - - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - - RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier); - - std::vector silence_phones; - if (!SplitStringToIntegers(silphones_str, ":", false, &silence_phones)) { - KALDI_ERR << "Silence-phones string has wrong format " - << silphones_str; - } - ConstIntegerSet silence_set(silence_phones); // faster lookup. - - - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - SpMatrix fmllr_grad_scatter; - int32 dim = am_sgmm.FeatureDim(); - fmllr_grad_scatter.Resize(dim * (dim + 1), kSetZero); - FmllrSgmmAccs spk_stats; - spk_stats.Init(dim, am_sgmm.NumGauss()); - - double tot_like = 0.0, tot_t = 0.0; - int32 num_done = 0, num_no_alignment = 0, num_other_error = 0; - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - spk_stats.SetZero(); - string spk = spk2utt_reader.Key(); - const std::vector &uttlist = spk2utt_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(spk)) { - spk_vars.v_s = spkvecs_reader.Value(spk); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << spk; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!alignments_reader.HasKey(utt)) { - num_no_alignment++; - continue; - } - const std::vector &alignment = alignments_reader.Value(utt); - - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Did not find features for utterance " << utt; - num_other_error++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - - if (alignment.size() != feats.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (alignment.size()) << - " vs. "<< (feats.NumRows()); - num_other_error++; - continue; - } - - bool have_gselect = false; - if (gselect_reader.IsOpen()) { - if (gselect_reader.HasKey(utt)) { - have_gselect = (gselect_reader.Value(utt).size() == feats.NumRows()); - if (!have_gselect) - KALDI_WARN << "Gaussian-selection info available for utterance " - << utt << " has wrong size."; - } else { - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt; - } - } - - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : NULL); - double file_like = 0.0, file_t = 0.0; - - - for (size_t i = 0; i < alignment.size(); i++) { - int32 tid = alignment[i]; // transition identifier. - int32 pdf_id = trans_model.TransitionIdToPdf(tid), - phone = trans_model.TransitionIdToPhone(tid); - BaseFloat weight = 1.0; - if (silence_set.count(phone) != 0) { // is a silence. - if (sil_weight > 0.0) - weight = sil_weight; - else - continue; - } - - std::vector this_gselect; - if (gselect != NULL) - this_gselect = (*gselect)[i]; - else - am_sgmm.GaussianSelection(sgmm_opts, feats.Row(i), &this_gselect); - am_sgmm.ComputePerFrameVars(feats.Row(i), this_gselect, spk_vars, 0.0, - &per_frame_vars); - file_like += - spk_stats.Accumulate(am_sgmm, spk_vars, feats.Row(i), - per_frame_vars, pdf_id, weight); - file_t += weight; - } // end looping over all the frames in the utterance - KALDI_VLOG(1) << "Average likelihood for utterance " << utt << " is " - << (file_like/file_t) << " over " << file_t << " frames"; - tot_like += file_like; - tot_t += file_t; - num_done++; - if (num_done % 20 == 0) - KALDI_VLOG(1) << "After " << num_done << " utterances: Average " - << "likelihood per frame = " << (tot_like/tot_t) - << ", over " << tot_t << " frames"; - } // end looping over all utterance for a given speaker - spk_stats.AccumulateForFmllrSubspace(am_sgmm, fmllr_globals, &fmllr_grad_scatter); - } // end looping over all speakers - - KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment - << " with no alignments, " << num_other_error - << " with other errors."; - - KALDI_LOG << "Overall likelihood per frame frame = " << (tot_like/tot_t) - << " over " << tot_t << " frames."; - - { - Output ko(accs_wxfilename, binary_write); - fmllr_grad_scatter.Write(ko.Stream(), binary_write); - KALDI_LOG << "Written accs to: " << accs_wxfilename; - } - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-acc-stats-ali.cc b/src/sgmmbin/sgmm-acc-stats-ali.cc deleted file mode 100644 index 99371fea829..00000000000 --- a/src/sgmmbin/sgmm-acc-stats-ali.cc +++ /dev/null @@ -1,191 +0,0 @@ -// sgmmbin/sgmm-acc-stats-ali.cc - -// Copyright 2009-2012 Saarland University (author: Arnab Ghoshal); -// Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - const char *usage = - "Accumulate stats for SGMM training.\n" - "Usage: sgmm-acc-stats-ali [options] " - " \n" - "e.g.: sgmm-acc-stats-ali 1.mdl 1.ali scp:train.scp ark:1.ali 1.acc\n"; - - ParseOptions po(usage); - bool binary = true; - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - std::string update_flags_str = "vMNwcSt"; - BaseFloat rand_prune = 1.0e-05; - kaldi::SgmmGselectConfig sgmm_opts; - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("rand-prune", &rand_prune, "Randomized pruning threshold for posteriors"); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to update: subset of vMNwcS."); - sgmm_opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str); - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - alignments_rspecifier = po.GetArg(3), - accs_wxfilename = po.GetArg(4); - - using namespace kaldi; - typedef kaldi::int32 int32; - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Vector transition_accs; - if (acc_flags & kaldi::kSgmmTransitions) - trans_model.InitStats(&transition_accs); - MleAmSgmmAccs sgmm_accs(rand_prune); - sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags); - - double tot_like = 0.0; - kaldi::int64 tot_t = 0; - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier); - - RandomAccessInt32VectorVectorReader gselect_reader; - if (!gselect_rspecifier.empty() && !gselect_reader.Open(gselect_rspecifier)) - KALDI_ERR << "Unable to open stream for gaussian-selection indices"; - - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - int32 num_done = 0, num_no_alignment = 0, num_other_error = 0; - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!alignments_reader.HasKey(utt)) { - num_no_alignment++; - } else { - const Matrix &mat = feature_reader.Value(); - const std::vector &alignment = alignments_reader.Value(utt); - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == mat.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)\n"; - std::vector > empty_gselect; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - if (alignment.size() != mat.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (alignment.size()) << - " vs. "<< (mat.NumRows()); - num_other_error++; - continue; - } - - num_done++; - BaseFloat tot_like_this_file = 0.0; - - for (size_t i = 0; i < alignment.size(); i++) { - int32 tid = alignment[i], // transition identifier. - pdf_id = trans_model.TransitionIdToPdf(tid); - if (acc_flags & kaldi::kSgmmTransitions) - trans_model.Accumulate(1.0, tid, &transition_accs); - std::vector this_gselect; - if (!gselect->empty()) this_gselect = (*gselect)[i]; - else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect); - am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0, - &per_frame_vars); - tot_like_this_file += sgmm_accs.Accumulate(am_sgmm, per_frame_vars, - spk_vars.v_s, pdf_id, 1.0, - acc_flags); - } - - sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance. - - KALDI_VLOG(2) << "Average like for this file is " - << (tot_like_this_file/alignment.size()) << " over " - << alignment.size() <<" frames."; - tot_like += tot_like_this_file; - tot_t += alignment.size(); - if (num_done % 50 == 0) { - KALDI_LOG << "Processed " << num_done << " utterances; for utterance " - << utt << " avg. like is " - << (tot_like_this_file/alignment.size()) - << " over " << alignment.size() <<" frames."; - } - } - } - KALDI_LOG << "Overall like per frame (Gaussian only) = " - << (tot_like/tot_t) << " over " << tot_t << " frames."; - - KALDI_LOG << "Done " << num_done << " files, " << num_no_alignment - << " with no alignments, " << num_other_error - << " with other errors."; - - { - Output ko(accs_wxfilename, binary); - // TODO(arnab): Ideally, we shouldn't be writing transition accs if not - // asked for, but that will complicate reading later. To be fixed? - transition_accs.Write(ko.Stream(), binary); - sgmm_accs.Write(ko.Stream(), binary); - } - KALDI_LOG << "Written accs."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-acc-stats-gpost.cc b/src/sgmmbin/sgmm-acc-stats-gpost.cc deleted file mode 100644 index 844afff4360..00000000000 --- a/src/sgmmbin/sgmm-acc-stats-gpost.cc +++ /dev/null @@ -1,174 +0,0 @@ -// sgmmbin/sgmm-acc-stats-gpost.cc - -// Copyright 2009-2012 Saarland University (Author: Arnab Ghoshal) -// Microsoft Corporation; -// Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" - - - - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - const char *usage = - "Accumulate stats for SGMM training, given Gaussian-level posteriors\n" - "Usage: sgmm-acc-stats-gpost [options] " - " \n" - "e.g.: sgmm-acc-stats-gpost 1.mdl 1.ali scp:train.scp ark, s, cs:- 1.acc\n"; - - ParseOptions po(usage); - bool binary = true; - std::string spkvecs_rspecifier, utt2spk_rspecifier; - std::string update_flags_str = "vMNwcSt"; - BaseFloat rand_prune = 1.0e-05; - - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors"); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to update: subset of vMNwcS."); - po.Read(argc, argv); - - kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - gpost_rspecifier = po.GetArg(3), - accs_wxfilename = po.GetArg(4); - - using namespace kaldi; - typedef kaldi::int32 int32; - - // Initialize the readers before the model, as this can avoid - // crashes on systems with low virtual memory. - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessSgmmGauPostReader gpost_reader(gpost_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Vector transition_accs; - if (acc_flags & kaldi::kSgmmTransitions) - trans_model.InitStats(&transition_accs); - MleAmSgmmAccs sgmm_accs(rand_prune); - sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags); - - double tot_t = 0.0; - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - int32 num_done = 0, num_no_posterior = 0, num_other_error = 0; - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!gpost_reader.HasKey(utt)) { - num_no_posterior++; - } else { - const Matrix &mat = feature_reader.Value(); - const SgmmGauPost &gpost = gpost_reader.Value(utt); - - if (gpost.size() != mat.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (gpost.size()) << - " vs. "<< (mat.NumRows()); - num_other_error++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - num_done++; - BaseFloat tot_weight = 0.0; - - for (size_t i = 0; i < gpost.size(); i++) { - const std::vector &gselect = gpost[i].gselect; - am_sgmm.ComputePerFrameVars(mat.Row(i), gselect, spk_vars, 0.0, - &per_frame_vars); - - for (size_t j = 0; j < gpost[i].tids.size(); j++) { - int32 tid = gpost[i].tids[j], // transition identifier. - pdf_id = trans_model.TransitionIdToPdf(tid); - - BaseFloat weight = gpost[i].posteriors[j].Sum(); - if (acc_flags & kaldi::kSgmmTransitions) - trans_model.Accumulate(weight, tid, &transition_accs); - sgmm_accs.AccumulateFromPosteriors(am_sgmm, per_frame_vars, - gpost[i].posteriors[j], - spk_vars.v_s, - pdf_id, acc_flags); - tot_weight += weight; - } - } - - sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance. - - tot_t += tot_weight; - if (num_done % 50 == 0) - KALDI_LOG << "Processed " << num_done << " utterances"; - } - } - KALDI_LOG << "Overall number of frames is " << tot_t; - - KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior - << " with no posteriors, " << num_other_error - << " with other errors."; - - { - Output ko(accs_wxfilename, binary); - // TODO(arnab): Ideally, we shouldn't be writing transition accs if not - // asked for, but that will complicate reading later. To be fixed? - transition_accs.Write(ko.Stream(), binary); - sgmm_accs.Write(ko.Stream(), binary); - } - KALDI_LOG << "Written accs."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-acc-stats.cc b/src/sgmmbin/sgmm-acc-stats.cc deleted file mode 100644 index 7ea3a8b13be..00000000000 --- a/src/sgmmbin/sgmm-acc-stats.cc +++ /dev/null @@ -1,211 +0,0 @@ -// sgmmbin/sgmm-acc-stats.cc - -// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal), -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/posterior.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - const char *usage = - "Accumulate stats for SGMM training.\n" - "Usage: sgmm-acc-stats [options] " - " \n" - "e.g.: sgmm-acc-stats 1.mdl 1.ali scp:train.scp 'ark:ali-to-post 1.ali ark:-|' 1.acc\n"; - - ParseOptions po(usage); - bool binary = true; - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - std::string update_flags_str = "vMNwcSt"; - BaseFloat rand_prune = 1.0e-05; - SgmmGselectConfig sgmm_opts; - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors"); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to accumulate " - "stats for: subset of vMNwcS."); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - - kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - posteriors_rspecifier = po.GetArg(3), - accs_wxfilename = po.GetArg(4); - - using namespace kaldi; - typedef kaldi::int32 int32; - - // Initialize the readers before the model, as the model can - // be large, and we don't want to call fork() after reading it if - // virtual memory may be low. - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Vector transition_accs; - if (acc_flags & kaldi::kSgmmTransitions) - trans_model.InitStats(&transition_accs); - MleAmSgmmAccs sgmm_accs(rand_prune); - sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags); - - double tot_like = 0.0; - double tot_t = 0; - - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - int32 num_done = 0, num_no_posterior = 0, num_other_error = 0; - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!posteriors_reader.HasKey(utt)) { - num_no_posterior++; - } else { - const Matrix &mat = feature_reader.Value(); - const Posterior &posterior = posteriors_reader.Value(utt); - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == mat.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - std::vector > empty_gselect; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - if (posterior.size() != mat.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (posterior.size()) << - " vs. "<< (mat.NumRows()); - num_other_error++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - num_done++; - BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0; - - Posterior pdf_posterior; - ConvertPosteriorToPdfs(trans_model, posterior, &pdf_posterior); - for (size_t i = 0; i < posterior.size(); i++) { - if (posterior[i].empty()) - continue; - std::vector this_gselect; - if (!gselect->empty()) this_gselect = (*gselect)[i]; - else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect); - am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0, - &per_frame_vars); - - // Accumulates for SGMM. - for (size_t j = 0; j < pdf_posterior[i].size(); j++) { - int32 pdf_id = pdf_posterior[i][j].first; - BaseFloat weight = pdf_posterior[i][j].second; - tot_like_this_file += sgmm_accs.Accumulate(am_sgmm, per_frame_vars, - spk_vars.v_s, pdf_id, - weight, acc_flags) - * weight; - tot_weight += weight; - } - - // Accumulates for transitions. - for (size_t j = 0; j < posterior[i].size(); j++) { - if (acc_flags & kaldi::kSgmmTransitions) { - int32 tid = posterior[i][j].first; - BaseFloat weight = posterior[i][j].second; - trans_model.Accumulate(weight, tid, &transition_accs); - } - } - } - - sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance. - - KALDI_VLOG(2) << "Average like for this file is " - << (tot_like_this_file/tot_weight) << " over " - << tot_weight <<" frames."; - tot_like += tot_like_this_file; - tot_t += tot_weight; - if (num_done % 50 == 0) { - KALDI_LOG << "Processed " << num_done << " utterances; for utterance " - << utt << " avg. like is " - << (tot_like_this_file/tot_weight) - << " over " << tot_weight <<" frames."; - } - } - } - KALDI_LOG << "Overall like per frame (Gaussian only) = " - << (tot_like/tot_t) << " over " << tot_t << " frames."; - - KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior - << " with no posteriors, " << num_other_error - << " with other errors."; - - { - Output ko(accs_wxfilename, binary); - // TODO(arnab): Ideally, we shouldn't be writing transition accs if not - // asked for, but that will complicate reading later. To be fixed? - transition_accs.Write(ko.Stream(), binary); - sgmm_accs.Write(ko.Stream(), binary); - } - KALDI_LOG << "Written accs."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-acc-stats2.cc b/src/sgmmbin/sgmm-acc-stats2.cc deleted file mode 100644 index 2f835b727d1..00000000000 --- a/src/sgmmbin/sgmm-acc-stats2.cc +++ /dev/null @@ -1,217 +0,0 @@ -// sgmmbin/sgmm-acc-stats2.cc - -// Copyright 2009-2012 Saarland University (Author: Arnab Ghoshal), -// Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/posterior.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - const char *usage = - "Accumulate numerator and denominator stats for discriminative training\n" - "of SGMMs (input is posteriors of mixed sign)\n" - "Usage: sgmm-acc-stats2 [options] " - " \n" - "e.g.: sgmm-acc-stats2 1.mdl 1.ali scp:train.scp ark:1.posts num.acc den.acc\n"; - - ParseOptions po(usage); - bool binary = true; - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - std::string update_flags_str = "vMNwcSt"; - BaseFloat rand_prune = 1.0e-05; - SgmmGselectConfig sgmm_opts; - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors"); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to accumulate " - "stats for: subset of vMNwcS."); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - - kaldi::SgmmUpdateFlagsType acc_flags = StringToSgmmUpdateFlags(update_flags_str); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - posteriors_rspecifier = po.GetArg(3), - num_accs_wxfilename = po.GetArg(4), - den_accs_wxfilename = po.GetArg(5); - - - using namespace kaldi; - typedef kaldi::int32 int32; - typedef kaldi::int64 int64; - - // Initialize the readers before the model, as the model can - // be large, and we don't want to call fork() after reading it if - // virtual memory may be low. - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Vector num_transition_accs, den_transition_accs; - if (acc_flags & kaldi::kSgmmTransitions) { - trans_model.InitStats(&num_transition_accs); - trans_model.InitStats(&den_transition_accs); - } - MleAmSgmmAccs num_sgmm_accs(rand_prune), den_sgmm_accs(rand_prune); - num_sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags); - den_sgmm_accs.ResizeAccumulators(am_sgmm, acc_flags); - - double tot_like = 0.0, tot_weight = 0.0, tot_abs_weight = 0.0; - int64 tot_frames = 0; - - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - int32 num_done = 0, num_no_posterior = 0, num_other_error = 0; - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!posteriors_reader.HasKey(utt)) { - num_no_posterior++; - } else { - const Matrix &mat = feature_reader.Value(); - const Posterior &posterior = posteriors_reader.Value(utt); - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == mat.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - std::vector > empty_gselect; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - if (posterior.size() != mat.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (posterior.size()) << - " vs. "<< (mat.NumRows()); - num_other_error++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - num_done++; - BaseFloat tot_like_this_file = 0.0, tot_weight_this_file = 0.0, - tot_abs_weight_this_file = 0.0; - - for (size_t i = 0; i < posterior.size(); i++) { - std::vector this_gselect; - if (!gselect->empty()) this_gselect = (*gselect)[i]; - else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect); - am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0, - &per_frame_vars); - - for (size_t j = 0; j < posterior[i].size(); j++) { - int32 tid = posterior[i][j].first, // transition identifier. - pdf_id = trans_model.TransitionIdToPdf(tid); - BaseFloat weight = posterior[i][j].second, - abs_weight = std::abs(weight); - - if (acc_flags & kaldi::kSgmmTransitions) { - trans_model.Accumulate(abs_weight, tid, weight > 0 ? - &num_transition_accs : &den_transition_accs); - } - tot_like_this_file += - (weight > 0 ? num_sgmm_accs : den_sgmm_accs).Accumulate( - am_sgmm, per_frame_vars, spk_vars.v_s, pdf_id, - abs_weight, acc_flags) - * weight; - tot_weight_this_file += weight; - tot_abs_weight_this_file += abs_weight; - } - } - num_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); // no harm doing it per utterance. - den_sgmm_accs.CommitStatsForSpk(am_sgmm, spk_vars.v_s); - - tot_like += tot_like_this_file; - tot_weight += tot_weight_this_file; - tot_abs_weight += tot_abs_weight_this_file; - tot_frames += posterior.size(); - if (num_done % 50 == 0) - KALDI_LOG << "Processed " << num_done << " utterances."; - } - } - KALDI_LOG << "Overall weighted acoustic likelihood per frame was " - << (tot_like/tot_frames) << " over " << tot_frames << " frames; " - << "average weight per frame is " << (tot_weight/tot_frames) - << ", average abs(weight) per frame is " - << (tot_abs_weight/tot_frames); - - KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior - << " with no posteriors, " << num_other_error - << " with other errors."; - - { - Output ko(num_accs_wxfilename, binary); - // TODO(arnab): Ideally, we shouldn't be writing transition accs if not - // asked for, but that will complicate reading later. To be fixed? - num_transition_accs.Write(ko.Stream(), binary); - num_sgmm_accs.Write(ko.Stream(), binary); - } - { - Output ko(den_accs_wxfilename, binary); - den_transition_accs.Write(ko.Stream(), binary); - den_sgmm_accs.Write(ko.Stream(), binary); - } - KALDI_LOG << "Written accs."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-acc-tree-stats.cc b/src/sgmmbin/sgmm-acc-tree-stats.cc deleted file mode 100644 index a63a4ae6f5f..00000000000 --- a/src/sgmmbin/sgmm-acc-tree-stats.cc +++ /dev/null @@ -1,185 +0,0 @@ -// sgmmbin/sgmm-acc-tree-stats.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "tree/context-dep.h" -#include "tree/build-tree-utils.h" -#include "sgmm/sgmm-clusterable.h" -#include "hmm/transition-model.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - typedef kaldi::int32 int32; - try { - const char *usage = - "Accumulate statistics for decision tree training.\n" - "This version accumulates statistics in the form of state-specific " - "SGMM stats; you need to use the program sgmm-build-tree to build " - "the tree (and sgmm-sum-tree-accs to sum the stats).\n" - "Usage: sgmm-acc-tree-stats [options] sgmm-model-in features-rspecifier " - "alignments-rspecifier [tree-accs-out]\n" - "e.g.: sgmm-acc-tree-stats --ci-phones=48:49 1.mdl scp:train.scp ark:1.ali 1.tacc\n"; - - ParseOptions po(usage); - bool binary = true; - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - string ci_phones_str; - int N = 3, P = 1; - SgmmGselectConfig sgmm_opts; - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("ci-phones", &ci_phones_str, "Colon-separated list of integer " - "indices of context-independent phones."); - po.Register("context-width", &N, "Context window size."); - po.Register("central-position", &P, - "Central context-window position (zero-based)"); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() < 3 || po.NumArgs() > 4) { - po.PrintUsage(); - exit(1); - } - - std::string sgmm_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - alignment_rspecifier = po.GetArg(3), - accs_wxfilename = po.GetOptArg(4); - - std::vector ci_phones; - if (ci_phones_str != "") { - SplitStringToIntegers(ci_phones_str, ":", false, &ci_phones); - std::sort(ci_phones.begin(), ci_phones.end()); - if (!IsSortedAndUniq(ci_phones) || ci_phones[0] == 0) { - KALDI_ERR << "Invalid set of ci_phones: " << ci_phones_str; - } - } - - TransitionModel trans_model; - AmSgmm am_sgmm; - std::vector > H; // Not initialized in this program-- not needed - // as we don't call Objf() from stats. - { - bool binary; - Input ki(sgmm_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - if (gselect_rspecifier.empty()) - KALDI_ERR << "--gselect option is required."; - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessInt32VectorReader alignment_reader(alignment_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - std::map tree_stats; - - int num_done = 0, num_err = 0; - - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!alignment_reader.HasKey(utt)) { - num_err++; - } else { - const Matrix &mat = feature_reader.Value(); - const std::vector &alignment = alignment_reader.Value(utt); - - if (!gselect_reader.HasKey(utt) || - - gselect_reader.Value(utt).size() != mat.NumRows()) { - KALDI_WARN << "No gselect information for utterance " << utt - << " (or wrong size)"; - num_err++; - continue; - } - - const std::vector > &gselect = - gselect_reader.Value(utt); - - if (alignment.size() != mat.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (alignment.size())<<" vs. "<< (mat.NumRows()); - num_err++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - } - } // else spk_vars is "empty" - - - // The work gets done here. - if (!AccumulateSgmmTreeStats(trans_model, - am_sgmm, - H, - N, P, - ci_phones, - alignment, - gselect, - spk_vars, - mat, - &tree_stats)) { - num_err++; - } else { - num_done++; - if (num_done % 1000 == 0) - KALDI_LOG << "Processed " << num_done << " utterances."; - } - } - } - - BuildTreeStatsType stats; // Converting from a map to a vector of pairs. - - for (std::map::const_iterator iter = tree_stats.begin(); - iter != tree_stats.end(); - iter++ ) { - stats.push_back(std::make_pair(iter->first, static_cast(iter->second))); - } - tree_stats.clear(); - - { - Output ko(accs_wxfilename, binary); - WriteBuildTreeStats(ko.Stream(), binary, stats); - } - KALDI_LOG << "Accumulated stats for " << num_done << " files, " - << num_err << " failed."; - KALDI_LOG << "Number of separate stats (context-dependent states) is " - << stats.size(); - DeleteBuildTreeStats(&stats); - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-align-compiled.cc b/src/sgmmbin/sgmm-align-compiled.cc deleted file mode 100644 index feeffe78840..00000000000 --- a/src/sgmmbin/sgmm-align-compiled.cc +++ /dev/null @@ -1,179 +0,0 @@ -// sgmmbin/sgmm-align-compiled.cc - -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "hmm/hmm-utils.h" -#include "fstext/fstext-lib.h" -#include "decoder/decoder-wrappers.h" -#include "decoder/training-graph-compiler.h" -#include "sgmm/decodable-am-sgmm.h" -#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - using fst::SymbolTable; - using fst::VectorFst; - using fst::StdArc; - - const char *usage = - "Align features given [SGMM-based] models.\n" - "Usage: sgmm-align-compiled [options] model-in graphs-rspecifier " - "feature-rspecifier alignments-wspecifier\n" - "e.g.: sgmm-align-compiled 1.mdl ark:graphs.fsts scp:train.scp ark:1.ali\n"; - - ParseOptions po(usage); - bool binary = true; - AlignConfig align_config; - BaseFloat acoustic_scale = 1.0; - BaseFloat transition_scale = 1.0; - BaseFloat self_loop_scale = 1.0; - BaseFloat log_prune = 5.0; - - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - SgmmGselectConfig sgmm_opts; - - align_config.Register(&po); - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("log-prune", &log_prune, "Pruning beam used to reduce number " - "of exp() evaluations."); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic " - "likelihoods"); - po.Register("transition-scale", &transition_scale, "Scaling factor for " - "some transition probabilities [see also self-loop-scale]."); - po.Register("self-loop-scale", &self_loop_scale, "Scaling factor for " - "self-loop versus non-self-loop probability mass [controls " - "most transition probabilities.]"); - po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices " - "(rspecifier)"); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - fst_rspecifier = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - alignment_wspecifier = po.GetArg(4); - - TransitionModel trans_model; - AmSgmm am_sgmm; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - SequentialTableReader fst_reader(fst_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - Int32VectorWriter alignment_writer(alignment_wspecifier); - - int32 num_done = 0, num_err = 0, num_retry = 0; - double tot_like = 0.0; - kaldi::int64 frame_count = 0; - - for (; !fst_reader.Done(); fst_reader.Next()) { - std::string utt = fst_reader.Key(); - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "No features found for utterance " << utt; - num_err++; - continue; - } - VectorFst decode_fst(fst_reader.Value()); - // stops copy-on-write of the fst by deleting the fst inside the reader, - // since we're about to mutate the fst by adding transition probs. - fst_reader.FreeCurrent(); - - const Matrix &features = feature_reader.Value(utt); - if (features.NumRows() == 0) { - KALDI_WARN << "Empty features for utterance " << utt; - num_err++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_err++; - continue; - } - } // else spk_vars is "empty" - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == features.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - std::vector > empty_gselect; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - { // Add transition-probs to the FST. - std::vector disambig_syms; // empty. - AddTransitionProbs(trans_model, disambig_syms, - transition_scale, self_loop_scale, - &decode_fst); - } - - DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars, trans_model, - features, *gselect, log_prune, acoustic_scale); - - AlignUtteranceWrapper(align_config, utt, - acoustic_scale, &decode_fst, &sgmm_decodable, - &alignment_writer, NULL, - &num_done, &num_err, &num_retry, - &tot_like, &frame_count); - } - - KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) - << " over " << frame_count<< " frames."; - KALDI_LOG << "Retried " << num_retry << " out of " - << (num_done + num_err) << " utterances."; - KALDI_LOG << "Done " << num_done << ", errors on " << num_err; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-build-tree.cc b/src/sgmmbin/sgmm-build-tree.cc deleted file mode 100644 index de63e60f56f..00000000000 --- a/src/sgmmbin/sgmm-build-tree.cc +++ /dev/null @@ -1,201 +0,0 @@ -// sgmmbin/sgmm-build-tree.cc - -// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "hmm/hmm-topology.h" -#include "tree/context-dep.h" -#include "tree/build-tree.h" -#include "tree/build-tree-utils.h" -#include "sgmm/sgmm-clusterable.h" -#include "sgmm/estimate-am-sgmm.h" -#include "util/text-utils.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - using namespace kaldi; - typedef kaldi::int32 int32; - - const char *usage = - "Train decision tree\n" - "Usage: sgmm-build-tree [options] " - " []\n" - "e.g.: sgmm-build-tree 0.sgmm streeacc roots.txt 1.qst tree\n"; - - bool binary = true; - int32 P = 1, N = 3; - - BaseFloat thresh = 300.0; - BaseFloat cluster_thresh = -1.0; // negative means use smallest split in splitting phase as thresh. - int32 max_leaves = 0; - std::string occs_out_filename; - - ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("context-width", &N, "Context window size [must match " - "acc-tree-stats]"); - po.Register("central-position", &P, "Central position in context window " - "[must match acc-tree-stats]"); - po.Register("max-leaves", &max_leaves, "Maximum number of leaves to be " - "used in tree-buliding (if positive)"); - po.Register("thresh", &thresh, "Log-likelihood change threshold for " - "tree-building"); - po.Register("cluster-thresh", &cluster_thresh, "Log-likelihood change " - "threshold for clustering after tree-building"); - - po.Read(argc, argv); - - if (po.NumArgs() != 5) { - po.PrintUsage(); - exit(1); - } - - std::string sgmm_filename = po.GetArg(1), - stats_filename = po.GetArg(2), - roots_filename = po.GetArg(3), - questions_filename = po.GetArg(4), - tree_out_filename = po.GetArg(5); - - // Following 2 variables derived from roots file. - // phone_sets is sets of phones that share their roots. - // Just one phone each for normal systems. - std::vector > phone_sets; - std::vector is_shared_root; - std::vector is_split_root; - { - Input ki(roots_filename.c_str()); - ReadRootsFile(ki.Stream(), &phone_sets, &is_shared_root, &is_split_root); - } - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(sgmm_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - const HmmTopology &topo = trans_model.GetTopo(); - std::vector > H; - am_sgmm.ComputeH(&H); - - BuildTreeStatsType stats; - { - bool binary_in; - SgmmClusterable sc(am_sgmm, H); // dummy stats needed to provide - // type info, and access to am_sgmm and H. - Input ki(stats_filename, &binary_in); - ReadBuildTreeStats(ki.Stream(), binary_in, sc, &stats); - } - KALDI_LOG << "Number of separate statistics is " << stats.size(); - - Questions qo; - { - bool binary_in; - try { - Input ki(questions_filename, &binary_in); - qo.Read(ki.Stream(), binary_in); - } catch (const std::exception &e) { - KALDI_ERR << "Error reading questions file "< phone2num_pdf_classes; - topo.GetPhoneToNumPdfClasses(&phone2num_pdf_classes); - - EventMap *to_pdf = NULL; - - //////// Build the tree. //////////// - - to_pdf = BuildTree(qo, - phone_sets, - phone2num_pdf_classes, - is_shared_root, - is_split_root, - stats, - thresh, - max_leaves, - cluster_thresh, - P); - - { // This block is to warn about low counts. - std::vector split_stats; - SplitStatsByMap(stats, *to_pdf, - &split_stats); - for (size_t i = 0; i < split_stats.size(); i++) - if (SumNormalizer(split_stats[i]) < 100.0) - KALDI_VLOG(1) << "For pdf-id " << i << ", low count " - << SumNormalizer(split_stats[i]); - } - - ContextDependency ctx_dep(N, P, to_pdf); // takes ownership - // of pointer "to_pdf", so set it NULL. - to_pdf = NULL; - - WriteKaldiObject(ctx_dep, tree_out_filename, binary); - - { // This block is just doing some checks. - - std::vector all_phones; - for (size_t i = 0; i < phone_sets.size(); i++) - all_phones.insert(all_phones.end(), - phone_sets[i].begin(), phone_sets[i].end()); - SortAndUniq(&all_phones); - if (all_phones != topo.GetPhones()) { - std::ostringstream ss; - WriteIntegerVector(ss, false, all_phones); - ss << " vs. "; - WriteIntegerVector(ss, false, topo.GetPhones()); - KALDI_WARN << "Mismatch between phone sets provided in roots file, and those in topology: " << ss.str(); - } - std::vector seen_phones; - PossibleValues(P, stats, &seen_phones); // get phones seen in the data. - - std::vector unseen_phones; // diagnostic. - for (size_t i = 0; i < all_phones.size(); i++) - if (!std::binary_search(seen_phones.begin(), seen_phones.end(), all_phones[i])) - unseen_phones.push_back(all_phones[i]); - for (size_t i = 0; i < seen_phones.size(); i++) - if (!std::binary_search(all_phones.begin(), all_phones.end(), seen_phones[i])) - KALDI_ERR << "Phone " << (seen_phones[i]) - << " appears in stats but is not listed in roots file."; - if (!unseen_phones.empty()) { - std::ostringstream ss; - for (size_t i = 0; i < unseen_phones.size(); i++) - ss << unseen_phones[i] << ' '; - // Note, unseen phones is just a warning as in certain kinds of - // systems, this can be OK (e.g. where phone encodes position and - // stress information). - KALDI_WARN << "Saw no stats for following phones: " << ss.str(); - } - } - - KALDI_LOG << "Wrote tree"; - - DeleteBuildTreeStats(&stats); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/sgmmbin/sgmm-calc-distances.cc b/src/sgmmbin/sgmm-calc-distances.cc deleted file mode 100644 index a621b6217b4..00000000000 --- a/src/sgmmbin/sgmm-calc-distances.cc +++ /dev/null @@ -1,74 +0,0 @@ -// sgmmbin/sgmm-calc-distances.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - - const char *usage = - "Compute matrix of approximated K-L divergences between states\n" - "Only works properly if a single substate per state.\n" - "Usage: sgmm-calc-distances [options] model-in occs-in distances-out\n"; - - bool binary = true; - ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - occs_in_filename = po.GetArg(2), - distances_out_filename = po.GetArg(3); - - - AmSgmm am_sgmm; - { - bool binary; - Input ki(model_in_filename, &binary); - TransitionModel trans_model; - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Vector occs; - ReadKaldiObject(occs_in_filename, &occs); - - Matrix dists(am_sgmm.NumPdfs(), am_sgmm.NumPdfs()); - AmSgmmFunctions::ComputeDistances(am_sgmm, occs, &dists); - - Output ko(distances_out_filename, binary); - dists.Write(ko.Stream(), binary); - - KALDI_LOG << "Wrote distances to " << distances_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-cluster-phones.cc b/src/sgmmbin/sgmm-cluster-phones.cc deleted file mode 100644 index fce3d43e113..00000000000 --- a/src/sgmmbin/sgmm-cluster-phones.cc +++ /dev/null @@ -1,148 +0,0 @@ -// sgmmbin/sgmm-cluster-phones.cc - -// Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "tree/context-dep.h" -#include "tree/build-tree.h" -#include "tree/build-tree-utils.h" -#include "tree/context-dep.h" -#include "sgmm/sgmm-clusterable.h" -#include "hmm/transition-model.h" -#include "util/text-utils.h" - - - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - using namespace kaldi; - typedef kaldi::int32 int32; - - const char *usage = - "Cluster phones (or sets of phones) into sets for various purposes\n" - "Usage: sgmm-cluster-phones [options] \n" - "e.g.: sgmm-cluster-phones 0.sgmm 1.tacc phonesets.txt questions.txt\n"; - // Format of phonesets.txt is e.g. - // 1 - // 2 3 4 - // 5 6 - // ... - // Format of questions.txt output is similar, but with more lines (and the same phone - // may appear on multiple lines). - - // bool binary = true; - int32 P = 1, N = 3; // Note: N does not matter. - std::string pdf_class_list_str = "1"; // 1 is just the central position of 3. - std::string mode = "questions"; - int32 num_classes = -1; - - ParseOptions po(usage); - // po.Register("binary", &binary, "Write output in binary mode"); - po.Register("central-position", &P, "Central position in context window [must match acc-tree-stats]"); - po.Register("context-width", &N, "Does not have any effect-- included for scripting convenience."); - po.Register("pdf-class-list", &pdf_class_list_str, "Colon-separated list of HMM positions to consider [Default = 1: just central position for 3-state models]."); - po.Register("mode", &mode, "Mode of operation: \"questions\"->sets suitable for decision trees; \"k-means\"->k-means algorithm, output k classes (set num-classes options)\n"); - po.Register("num-classes", &num_classes, "For k-means mode, number of classes."); - - - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string sgmm_rxfilename = po.GetArg(1), - stats_rxfilename = po.GetArg(2), - phone_sets_rxfilename = po.GetArg(3), - phone_sets_wxfilename = po.GetArg(4); - - AmSgmm am_sgmm; - { - TransitionModel trans_model; - bool binary_in; - Input ki(sgmm_rxfilename, &binary_in); - trans_model.Read(ki.Stream(), binary_in); - am_sgmm.Read(ki.Stream(), binary_in); - } - std::vector > H; - am_sgmm.ComputeH(&H); - - BuildTreeStatsType stats; - { // Read tree stats. - bool binary_in; - SgmmClusterable sc(am_sgmm, H); // dummy needed to provide type and sgmm ref. - Input ki(stats_rxfilename, &binary_in); - ReadBuildTreeStats(ki.Stream(), binary_in, sc, &stats); - } - KALDI_LOG << "Number of separate states in stats is " - << stats.size(); - - std::vector pdf_class_list; - if (!SplitStringToIntegers(pdf_class_list_str, ":", false, &pdf_class_list) - || pdf_class_list.empty()) { - KALDI_ERR << "Invalid pdf-class-list string [expecting colon-separated list of integers]: " - << pdf_class_list_str; - } - - std::vector > phone_sets; - if (!ReadIntegerVectorVectorSimple(phone_sets_rxfilename, &phone_sets)) - KALDI_ERR << "Could not read phone sets from " - << PrintableRxfilename(phone_sets_rxfilename); - - if (phone_sets.size() == 0) - KALDI_ERR << "No phone sets in phone sets file "; - - std::vector > phone_sets_out; - - if (mode == "questions") { - if (num_classes != -1) - KALDI_ERR << "num-classes option is not (currently) compatible " - "with \"questions\" mode."; - AutomaticallyObtainQuestions(stats, - phone_sets, - pdf_class_list, - P, - &phone_sets_out); - } else if (mode == "k-means") { - if (num_classes <= 1 || - static_cast(num_classes) > phone_sets.size()) - KALDI_ERR << "num-classes invalid: num_classes is " << num_classes - << ", number of phone sets is " << phone_sets.size(); - KMeansClusterPhones(stats, - phone_sets, - pdf_class_list, - P, - num_classes, - &phone_sets_out); - } - - if (!WriteIntegerVectorVectorSimple(phone_sets_wxfilename, phone_sets_out)) - KALDI_ERR << "Error writing questions to " - << PrintableWxfilename(phone_sets_wxfilename); - else - KALDI_LOG << "Wrote questions to "< \n"; - - bool binary = true; - kaldi::ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Read(argc, argv); - - if (po.NumArgs() < 3) { - po.PrintUsage(); - exit(1); - } - - std::string sgmm_in_filename = po.GetArg(1), - occs_filename = po.GetArg(2), - sgmm_out_filename = po.GetArg(3); - - kaldi::AmSgmm sgmm_in; - kaldi::TransitionModel trans_model; - { - bool binary_read; - kaldi::Input ki(sgmm_in_filename, &binary_read); - trans_model.Read(ki.Stream(), binary_read); - sgmm_in.Read(ki.Stream(), binary_read); - } - - kaldi::Vector occs; - { - bool binary_read; - kaldi::Input ki(occs_filename, &binary_read); - occs.Read(ki.Stream(), binary_read); - } - - kaldi::SgmmFmllrGlobalParams fmllr_globals; - sgmm_in.ComputeFmllrPreXform(occs, &fmllr_globals.pre_xform_, - &fmllr_globals.inv_xform_, - &fmllr_globals.mean_scatter_); - - { - kaldi::Output ko(sgmm_out_filename, binary); - trans_model.Write(ko.Stream(), binary); - sgmm_in.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll); - fmllr_globals.Write(ko.Stream(), binary); - } - - KALDI_LOG << "Written model to " << sgmm_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-copy.cc b/src/sgmmbin/sgmm-copy.cc deleted file mode 100644 index df1f960ed95..00000000000 --- a/src/sgmmbin/sgmm-copy.cc +++ /dev/null @@ -1,74 +0,0 @@ -// sgmmbin/sgmm-copy.cc - -// Copyright 2009-2012 Microsoft Corporation -// Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - const char *usage = - "Copy SGMM (possibly changing binary/text format)\n" - "Usage: sgmm-copy [options] \n" - "e.g.: sgmm-copy --binary=false 1.mdl 1_text.mdl\n"; - - bool binary_write = true; - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - - po.Read(argc, argv); - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string model_in_filename = po.GetArg(1), - model_out_filename = po.GetArg(2); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - { - Output ko(model_out_filename, binary_write); - trans_model.Write(ko.Stream(), binary_write); - am_sgmm.Write(ko.Stream(), binary_write, kSgmmWriteAll); - } - - - KALDI_LOG << "Written model to " << model_out_filename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-decode-faster.cc b/src/sgmmbin/sgmm-decode-faster.cc deleted file mode 100644 index b20808e144e..00000000000 --- a/src/sgmmbin/sgmm-decode-faster.cc +++ /dev/null @@ -1,218 +0,0 @@ -// sgmmbin/sgmm-decode-faster.cc - -// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" -#include "decoder/faster-decoder.h" -#include "sgmm/decodable-am-sgmm.h" -#include "base/timer.h" -#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - using fst::SymbolTable; - using fst::VectorFst; - using fst::StdArc; - - const char *usage = - "Decode features using SGMM-based model.\n" - "Usage: sgmm-decode-faster [options] " - " [alignments-wspecifier]\n"; - ParseOptions po(usage); - bool allow_partial = true; - BaseFloat acoustic_scale = 0.1; - BaseFloat log_prune = 5.0; - string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier, - utt2spk_rspecifier; - - FasterDecoderOptions decoder_opts; - decoder_opts.Register(&po, true); // true == include obscure settings. - kaldi::SgmmGselectConfig sgmm_opts; - sgmm_opts.Register(&po); - - po.Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - po.Register("log-prune", &log_prune, - "Pruning beam used to reduce number of exp() evaluations."); - po.Register("word-symbol-table", &word_syms_filename, - "Symbol table for words [for debug output]"); - po.Register("gselect", &gselect_rspecifier, - "rspecifier for precomputed per-frame Gaussian indices."); - po.Register("spk-vecs", &spkvecs_rspecifier, - "rspecifier for speaker vectors"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("allow-partial", &allow_partial, - "Produce output even when final state was not reached"); - po.Read(argc, argv); - - if (po.NumArgs() < 4 || po.NumArgs() > 5) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - fst_in_filename = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - words_wspecifier = po.GetArg(4), - alignment_wspecifier = po.GetOptArg(5); - - TransitionModel trans_model; - kaldi::AmSgmm am_sgmm; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Int32VectorWriter words_writer(words_wspecifier); - Int32VectorWriter alignment_writer(alignment_wspecifier); - - fst::SymbolTable *word_syms = NULL; - if (word_syms_filename != "") - if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) - KALDI_ERR << "Could not read symbol table from file " - << word_syms_filename; - - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - - // It's important that we initialize decode_fst after feature_reader, as it - // can prevent crashes on systems installed without enough virtual memory. - // It has to do with what happens on UNIX systems if you call fork() on a - // large process: the page-table entries are duplicated, which requires a - // lot of virtual memory. - VectorFst *decode_fst = fst::ReadFstKaldi(fst_in_filename); - - BaseFloat tot_like = 0.0; - kaldi::int64 frame_count = 0; - int num_success = 0, num_fail = 0; - FasterDecoder decoder(*decode_fst, decoder_opts); - - Timer timer; - const std::vector > empty_gselect; - - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - Matrix features(feature_reader.Value()); - feature_reader.FreeCurrent(); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_fail++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_fail++; - continue; - } - } // else spk_vars is "empty" - - bool has_gselect = false; - if (gselect_reader.IsOpen()) { - has_gselect = gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == features.NumRows(); - if (!has_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - } - const std::vector > *gselect = - (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars, - trans_model, features, *gselect, - log_prune, acoustic_scale); - decoder.Decode(&sgmm_decodable); - - VectorFst decoded; // linear FST. - - if ( (allow_partial || decoder.ReachedFinal()) - && decoder.GetBestPath(&decoded) ) { - if (!decoder.ReachedFinal()) - KALDI_WARN << "Decoder did not reach end-state, " - << "outputting partial traceback since --allow-partial=true"; - num_success++; - std::vector alignment; - std::vector words; - LatticeWeight weight; - frame_count += features.NumRows(); - - GetLinearSymbolSequence(decoded, &alignment, &words, &weight); - - words_writer.Write(utt, words); - if (alignment_writer.IsOpen()) - alignment_writer.Write(utt, alignment); - if (word_syms != NULL) { - std::cerr << utt << ' '; - for (size_t i = 0; i < words.size(); i++) { - std::string s = word_syms->Find(words[i]); - if (s == "") - KALDI_ERR << "Word-id " << words[i] << " not in symbol table."; - std::cerr << s << ' '; - } - std::cerr << '\n'; - } - BaseFloat like = -weight.Value1() -weight.Value2(); - tot_like += like; - KALDI_LOG << "Log-like per frame for utterance " << utt << " is " - << (like / features.NumRows()) << " over " - << features.NumRows() << " frames."; - } else { - num_fail++; - KALDI_WARN << "Did not successfully decode utterance " << utt - << ", len = " << features.NumRows(); - } - } - double elapsed = timer.Elapsed(); - KALDI_LOG << "Time taken [excluding initialization] "<< elapsed - << "s: real-time factor assuming 100 frames/sec is " - << (elapsed*100.0/frame_count); - KALDI_LOG << "Done " << num_success << " utterances, failed for " - << num_fail; - KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count) - << " over " << frame_count << " frames."; - - delete word_syms; - delete decode_fst; - return (num_success != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/sgmmbin/sgmm-est-ebw.cc b/src/sgmmbin/sgmm-est-ebw.cc deleted file mode 100644 index 71c7255c238..00000000000 --- a/src/sgmmbin/sgmm-est-ebw.cc +++ /dev/null @@ -1,118 +0,0 @@ -// sgmmbin/sgmm-est-ebw.cc - -// Copyright 2012 Johns Hopkins Univerity (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "thread/kaldi-thread.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm-ebw.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - typedef kaldi::int32 int32; - using std::string; - try { - const char *usage = - "Estimate SGMM model parameters discriminatively using Extended\n" - "Baum-Welch style of update\n" - "Usage: sgmm-est-ebw [options] \n"; - - - string update_flags_str = "vMNwcSt"; - bool binary_write = true; - string write_flags_str = "gsnu"; - EbwAmSgmmOptions opts; - - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to " - "update: subset of vMNwcSt."); - po.Register("write-flags", &write_flags_str, "Which SGMM parameters to " - "write: subset of gsnu"); - po.Register("num-threads", &g_num_threads, "Number of threads to use in " - "weight update and normalizer computation"); - opts.Register(&po); - - po.Read(argc, argv); - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - string model_in_filename = po.GetArg(1), - num_stats_filename = po.GetArg(2), - den_stats_filename = po.GetArg(3), - model_out_filename = po.GetArg(4); - - SgmmUpdateFlagsType update_flags = StringToSgmmUpdateFlags(update_flags_str); - SgmmWriteFlagsType write_flags = StringToSgmmWriteFlags(write_flags_str); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - MleAmSgmmAccs sgmm_num_accs; - { - bool binary; - Vector transition_accs; // won't be used. - Input ki(num_stats_filename, &binary); - transition_accs.Read(ki.Stream(), binary); - sgmm_num_accs.Read(ki.Stream(), binary, false); // false == add; doesn't matter. - } - MleAmSgmmAccs sgmm_den_accs; - { - bool binary; - Vector transition_accs; // won't be used. - Input ki(den_stats_filename, &binary); - transition_accs.Read(ki.Stream(), binary); - sgmm_den_accs.Read(ki.Stream(), binary, false); // false == add; doesn't matter. - } - - sgmm_num_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics. - sgmm_den_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics. - - { // Update SGMM. - BaseFloat auxf_impr, count; - kaldi::EbwAmSgmmUpdater sgmm_updater(opts); - sgmm_updater.Update(sgmm_num_accs, sgmm_den_accs, &am_sgmm, - update_flags, &auxf_impr, &count); - KALDI_LOG << "Overall auxf impr/frame from SGMM update is " << (auxf_impr/count) - << " over " << count << " frames."; - } - - { - Output ko(model_out_filename, binary_write); - trans_model.Write(ko.Stream(), binary_write); - am_sgmm.Write(ko.Stream(), binary_write, write_flags); - } - - KALDI_LOG << "Wrote model to " << model_out_filename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/sgmmbin/sgmm-est-fmllr-gpost.cc b/src/sgmmbin/sgmm-est-fmllr-gpost.cc deleted file mode 100644 index 1190c6d5b73..00000000000 --- a/src/sgmmbin/sgmm-est-fmllr-gpost.cc +++ /dev/null @@ -1,261 +0,0 @@ -// sgmmbin/sgmm-est-fmllr-gpost.cc - -// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/fmllr-sgmm.h" -#include "hmm/transition-model.h" - -namespace kaldi { - -void AccumulateForUtterance(const Matrix &feats, - const SgmmGauPost &gpost, - const TransitionModel &trans_model, - const AmSgmm &am_sgmm, - const SgmmPerSpkDerivedVars &spk_vars, - BaseFloat logdet, - FmllrSgmmAccs *spk_stats) { -// kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - for (size_t i = 0; i < gpost.size(); i++) { -// am_sgmm.ComputePerFrameVars(feats.Row(i), gpost[i].gselect, spk_vars, -// logdet, &per_frame_vars); - - for (size_t j = 0; j < gpost[i].tids.size(); j++) { - int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i].tids[j]); - spk_stats->AccumulateFromPosteriors(am_sgmm, spk_vars, feats.Row(i), - gpost[i].gselect, - gpost[i].posteriors[j], pdf_id); - } - } -} - -} // end namespace kaldi - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Estimate FMLLR transform for SGMMs, either per utterance or for the " - "supplied set of speakers (with spk2utt option).\n" - "Reads Gaussian-level posteriors. Writes to a table of matrices.\n" - "Usage: sgmm-est-fmllr-gpost [options] " - " \n"; - - ParseOptions po(usage); - string spk2utt_rspecifier, spkvecs_rspecifier, fmllr_rspecifier; - BaseFloat min_count = 100; - SgmmFmllrConfig fmllr_opts; - - po.Register("spk2utt", &spk2utt_rspecifier, - "File to read speaker to utterance-list map from."); - po.Register("spkvec-min-count", &min_count, - "Minimum count needed to estimate speaker vectors"); - po.Register("spk-vecs", &spkvecs_rspecifier, - "Speaker vectors to use during aligment (rspecifier)"); - po.Register("input-fmllr", &fmllr_rspecifier, - "Initial FMLLR transform per speaker (rspecifier)"); - fmllr_opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - string model_rxfilename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - gpost_rspecifier = po.GetArg(3), - fmllr_wspecifier = po.GetArg(4); - - TransitionModel trans_model; - AmSgmm am_sgmm; - SgmmFmllrGlobalParams fmllr_globals; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - fmllr_globals.Read(ki.Stream(), binary); - } - - RandomAccessSgmmGauPostReader gpost_reader(gpost_rspecifier); - - RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier); - - RandomAccessBaseFloatMatrixReader fmllr_reader(fmllr_rspecifier); - - BaseFloatMatrixWriter fmllr_writer(fmllr_wspecifier); - - int32 dim = am_sgmm.FeatureDim(); - FmllrSgmmAccs spk_stats; - spk_stats.Init(dim, am_sgmm.NumGauss()); - Matrix fmllr_xform(dim, dim + 1); - BaseFloat logdet = 0.0; - double tot_impr = 0.0, tot_t = 0.0; - int32 num_done = 0, num_no_gpost = 0, num_other_error = 0; - std::vector > empty_gselect; - - if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - spk_stats.SetZero(); - string spk = spk2utt_reader.Key(); - const vector &uttlist = spk2utt_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(spk)) { - spk_vars.v_s = spkvecs_reader.Value(spk); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << spk; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - if (fmllr_reader.IsOpen()) { - if (fmllr_reader.HasKey(spk)) { - fmllr_xform.CopyFromMat(fmllr_reader.Value(spk)); - logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet(); - } else { - KALDI_WARN << "Cannot find FMLLR transform for " << spk; - fmllr_xform.SetUnit(); - logdet = 0.0; - } - } else { - fmllr_xform.SetUnit(); - logdet = 0.0; - } - - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Did not find features for utterance " << utt; - continue; - } - if (!gpost_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posteriors for utterance " << utt; - num_no_gpost++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - const SgmmGauPost &gpost = gpost_reader.Value(utt); - if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "gpost vector has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - - AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars, - logdet, &spk_stats); - num_done++; - } // end looping over all utterances of the current speaker - - BaseFloat impr, spk_frame_count; - // Compute the FMLLR transform and write it out. - spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform, - &spk_frame_count, &impr); - fmllr_writer.Write(spk, fmllr_xform); - tot_impr += impr; - tot_t += spk_frame_count; - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - if (!gpost_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posts for utterance " - << utt; - num_no_gpost++; - continue; - } - const Matrix &feats = feature_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - if (fmllr_reader.IsOpen()) { - if (fmllr_reader.HasKey(utt)) { - fmllr_xform.CopyFromMat(fmllr_reader.Value(utt)); - logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet(); - } else { - KALDI_WARN << "Cannot find FMLLR transform for " << utt; - fmllr_xform.SetUnit(); - logdet = 0.0; - } - } else { - fmllr_xform.SetUnit(); - logdet = 0.0; - } - - const SgmmGauPost &gpost = gpost_reader.Value(utt); - - if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "gpost has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - spk_stats.SetZero(); - AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars, - logdet, &spk_stats); - num_done++; - - BaseFloat impr, spk_frame_count; - // Compute the FMLLR transform and write it out. - spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform, - &spk_frame_count, &impr); - fmllr_writer.Write(utt, fmllr_xform); - tot_impr += impr; - tot_t += spk_frame_count; - } - } - - KALDI_LOG << "Done " << num_done << " files, " << num_no_gpost - << " with no gposts, " << num_other_error << " with other errors."; - KALDI_LOG << "Num frames " << tot_t << ", auxf impr per frame is " - << (tot_impr / tot_t); - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/sgmmbin/sgmm-est-fmllr.cc b/src/sgmmbin/sgmm-est-fmllr.cc deleted file mode 100644 index 2ad2c8d62cf..00000000000 --- a/src/sgmmbin/sgmm-est-fmllr.cc +++ /dev/null @@ -1,318 +0,0 @@ -// sgmmbin/sgmm-est-fmllr.cc - -// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/fmllr-sgmm.h" -#include "hmm/transition-model.h" -#include "hmm/posterior.h" - -namespace kaldi { - -void AccumulateForUtterance(const Matrix &feats, - const Matrix &transformed_feats, // if already fMLLR - const std::vector > &gselect, - const SgmmGselectConfig &sgmm_config, - const Posterior &post, - const TransitionModel &trans_model, - const AmSgmm &am_sgmm, - const SgmmPerSpkDerivedVars &spk_vars, - BaseFloat logdet, - FmllrSgmmAccs *spk_stats) { - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - Posterior pdf_post; - ConvertPosteriorToPdfs(trans_model, post, &pdf_post); - for (size_t t = 0; t < post.size(); t++) { - std::vector this_gselect; - if (!gselect.empty()) { - KALDI_ASSERT(t < gselect.size()); - this_gselect = gselect[t]; - } else { - am_sgmm.GaussianSelection(sgmm_config, feats.Row(t), &this_gselect); - } - // per-frame vars only used for computing posteriors... use the - // transformed feats for this, if available. - am_sgmm.ComputePerFrameVars(transformed_feats.Row(t), this_gselect, spk_vars, - 0.0 /*fMLLR logdet*/, &per_frame_vars); - - - for (size_t j = 0; j < pdf_post[t].size(); j++) { - int32 pdf_id = pdf_post[t][j].first; - Matrix posteriors; - am_sgmm.ComponentPosteriors(per_frame_vars, pdf_id, - &posteriors); - posteriors.Scale(pdf_post[t][j].second); - spk_stats->AccumulateFromPosteriors(am_sgmm, spk_vars, feats.Row(t), - this_gselect, - posteriors, pdf_id); - } - } -} - -} // end namespace kaldi - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Estimate FMLLR transform for SGMMs, either per utterance or for the " - "supplied set of speakers (with spk2utt option).\n" - "Reads state-level posteriors. Writes to a table of matrices.\n" - "Usage: sgmm-est-fmllr [options] " - " \n"; - - ParseOptions po(usage); - string spk2utt_rspecifier, spkvecs_rspecifier, fmllr_rspecifier, - gselect_rspecifier; - BaseFloat min_count = 100; - SgmmFmllrConfig fmllr_opts; - SgmmGselectConfig sgmm_opts; - - po.Register("spk2utt", &spk2utt_rspecifier, - "File to read speaker to utterance-list map from."); - po.Register("spkvec-min-count", &min_count, - "Minimum count needed to estimate speaker vectors"); - po.Register("spk-vecs", &spkvecs_rspecifier, - "Speaker vectors to use during aligment (rspecifier)"); - po.Register("input-fmllr", &fmllr_rspecifier, - "Initial FMLLR transform per speaker (rspecifier)"); - po.Register("gselect", &gselect_rspecifier, - "Precomputed Gaussian indices (rspecifier)"); - fmllr_opts.Register(&po); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - string model_rxfilename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - post_rspecifier = po.GetArg(3), - fmllr_wspecifier = po.GetArg(4); - - TransitionModel trans_model; - AmSgmm am_sgmm; - SgmmFmllrGlobalParams fmllr_globals; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - fmllr_globals.Read(ki.Stream(), binary); - } - - RandomAccessPosteriorReader post_reader(post_rspecifier); - RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatMatrixReader fmllr_reader(fmllr_rspecifier); - - BaseFloatMatrixWriter fmllr_writer(fmllr_wspecifier); - - int32 dim = am_sgmm.FeatureDim(); - FmllrSgmmAccs spk_stats; - spk_stats.Init(dim, am_sgmm.NumGauss()); - Matrix fmllr_xform(dim, dim + 1); - BaseFloat logdet = 0.0; - double tot_impr = 0.0, tot_t = 0.0; - int32 num_done = 0, num_err = 0; - std::vector > empty_gselect; - - if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - spk_stats.SetZero(); - string spk = spk2utt_reader.Key(); - const vector &uttlist = spk2utt_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(spk)) { - spk_vars.v_s = spkvecs_reader.Value(spk); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << spk; - num_err++; - continue; - } - } // else spk_vars is "empty" - - if (fmllr_reader.IsOpen()) { - if (fmllr_reader.HasKey(spk)) { - fmllr_xform.CopyFromMat(fmllr_reader.Value(spk)); - logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet(); - } else { - KALDI_WARN << "Cannot find FMLLR transform for " << spk; - fmllr_xform.SetUnit(); - logdet = 0.0; - } - } else { - fmllr_xform.SetUnit(); - logdet = 0.0; - } - - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Did not find features for utterance " << utt; - num_err++; - continue; - } - if (!post_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posteriors for utterance " << utt; - num_err++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - const Posterior &post = post_reader.Value(utt); - if (static_cast(post.size()) != feats.NumRows()) { - KALDI_WARN << "posterior vector has wrong size " << (post.size()) - << " vs. " << (feats.NumRows()); - num_err++; - continue; - } - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == feats.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - Matrix transformed_feats(feats); - for (int32 r = 0; r < transformed_feats.NumRows(); r++) { - SubVector row(transformed_feats, r); - ApplyAffineTransform(fmllr_xform, &row); - } - AccumulateForUtterance(feats, transformed_feats, *gselect, sgmm_opts, - post, trans_model, am_sgmm, spk_vars, - logdet, &spk_stats); - num_done++; - } // end looping over all utterances of the current speaker - - BaseFloat impr, spk_frame_count; - // Compute the FMLLR transform and write it out. - spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform, - &spk_frame_count, &impr); - fmllr_writer.Write(spk, fmllr_xform); - tot_impr += impr; - tot_t += spk_frame_count; - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - if (!post_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posts for utterance " - << utt; - num_err++; - continue; - } - const Matrix &feats = feature_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_err++; - continue; - } - } // else spk_vars is "empty" - - if (fmllr_reader.IsOpen()) { - if (fmllr_reader.HasKey(utt)) { - fmllr_xform.CopyFromMat(fmllr_reader.Value(utt)); - logdet = fmllr_xform.Range(0, dim, 0, dim).LogDet(); - } else { - KALDI_WARN << "Cannot find FMLLR transform for " << utt; - fmllr_xform.SetUnit(); - logdet = 0.0; - } - } else { - fmllr_xform.SetUnit(); - logdet = 0.0; - } - - const Posterior &post = post_reader.Value(utt); - - if (static_cast(post.size()) != feats.NumRows()) { - KALDI_WARN << "post has wrong size " << (post.size()) - << " vs. " << (feats.NumRows()); - num_err++; - continue; - } - spk_stats.SetZero(); - - Matrix transformed_feats(feats); - for (int32 r = 0; r < transformed_feats.NumRows(); r++) { - SubVector row(transformed_feats, r); - ApplyAffineTransform(fmllr_xform, &row); - } - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == feats.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - AccumulateForUtterance(feats, transformed_feats, *gselect, sgmm_opts, - post, trans_model, am_sgmm, spk_vars, - logdet, &spk_stats); - num_done++; - - BaseFloat impr, spk_frame_count; - // Compute the FMLLR transform and write it out. - spk_stats.Update(am_sgmm, fmllr_globals, fmllr_opts, &fmllr_xform, - &spk_frame_count, &impr); - fmllr_writer.Write(utt, fmllr_xform); - tot_impr += impr; - tot_t += spk_frame_count; - } - } - - KALDI_LOG << "Done " << num_done << " files, " << num_err << " with errors."; - KALDI_LOG << "Overall auxf impr per frame is " << (tot_impr / tot_t) - << " per frame, over " << tot_t << " frames."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/sgmmbin/sgmm-est-fmllrbasis.cc b/src/sgmmbin/sgmm-est-fmllrbasis.cc deleted file mode 100644 index 155d4ed7a1b..00000000000 --- a/src/sgmmbin/sgmm-est-fmllrbasis.cc +++ /dev/null @@ -1,93 +0,0 @@ -// sgmmbin/sgmm-est-fmllrbasis.cc - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/common-utils.h" -#include "matrix/matrix-lib.h" -#include "hmm/transition-model.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/fmllr-sgmm.h" - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - - const char *usage = - "Sum multiple accumulated stats files for SGMM training.\n" - "Usage: sgmm-est-fmllrbasis [options] " - " [stats-in2 ...]\n"; - - bool binary = true; - int32 num_bases = 50; - kaldi::ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode."); - po.Register("num-bases", &num_bases, - "Number of fMLLR basis matrices to estimate."); - po.Read(argc, argv); - - if (po.NumArgs() < 3) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - model_out_filename = po.GetArg(2); - - kaldi::AmSgmm am_sgmm; - kaldi::TransitionModel trans_model; - kaldi::SgmmFmllrGlobalParams fmllr_globals; - { - bool binary_read; - kaldi::Input ki(model_in_filename, &binary_read); - trans_model.Read(ki.Stream(), binary_read); - am_sgmm.Read(ki.Stream(), binary_read); - fmllr_globals.Read(ki.Stream(), binary_read); - } - - kaldi::SpMatrix fmllr_grad_scatter; - int32 dim = am_sgmm.FeatureDim(); - fmllr_grad_scatter.Resize(dim * (dim + 1), kaldi::kSetZero); - - for (int i = 3, max = po.NumArgs(); i <= max; i++) { - std::string stats_in_filename = po.GetArg(i); - bool binary_read; - kaldi::Input ki(stats_in_filename, &binary_read); - fmllr_grad_scatter.Read(ki.Stream(), binary_read, - true /* add read values */); - } - - kaldi::EstimateSgmmFmllrSubspace(fmllr_grad_scatter, num_bases, dim, - &fmllr_globals); - - // Write out the accs - { - kaldi::Output ko(model_out_filename, binary); - trans_model.Write(ko.Stream(), binary); - am_sgmm.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll); - fmllr_globals.Write(ko.Stream(), binary); - } - - KALDI_LOG << "Written model to " << model_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-est-multi.cc b/src/sgmmbin/sgmm-est-multi.cc deleted file mode 100644 index 845714b7708..00000000000 --- a/src/sgmmbin/sgmm-est-multi.cc +++ /dev/null @@ -1,233 +0,0 @@ -// sgmmbin/sgmm-est-multi.cc - -// Copyright 2009-2012 Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" -#include "sgmm/estimate-am-sgmm-multi.h" - -int main(int argc, char *argv[]) { - using namespace kaldi; - // Memory for these will be freed in the catch block in case of exceptions. - std::vector sgmms_in; - std::vector sgmm_accs_in; - std::vector trans_models_in; - - try { - typedef kaldi::int32 int32; - const char *usage = - "Estimate multiple SGMM models from corresponding stats, such that the" - " global parameters\n(phone-, speaker-, and weight-projections and " - "covariances) are tied across models.\n" - "Usage: sgmm-est-multi [options] [ " - " ...]\n"; - - bool binary_write = true; - std::string update_flags_str = "vMNwcSt"; - std::string write_flags_str = "gsnu"; - kaldi::MleTransitionUpdateConfig tcfg; - kaldi::MleAmSgmmOptions sgmm_opts; - std::string split_substates = ""; // Space-seperated list of #substates - std::vector split_substates_int; // The above string split on space - int32 increase_phn_dim = 0; - int32 increase_spk_dim = 0; - bool remove_speaker_space = false; - BaseFloat perturb_factor = 0.01; - BaseFloat power = 0.2; - BaseFloat max_cond = 100; - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - // The split-substates option also takes a single integer: the same number - // of substates for all models. - po.Register("split-substates", &split_substates, "Space-separated string " - "with target number of substates for each model."); - po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space " - "dimension as far as allowed towards this target."); - po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space " - "dimension as far as allowed towards this target."); - po.Register("remove-speaker-space", &remove_speaker_space, - "Remove speaker-specific projections N"); - po.Register("power", &power, "Exponent for substate occupancies used while " - "splitting substates."); - po.Register("perturb-factor", &perturb_factor, "Perturbation factor for " - "state vectors while splitting substates."); - po.Register("max-cond-split", &max_cond, "Max condition number of smoothing " - "matrix used in substate splitting."); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to " - "update: subset of vMNwcSt."); - po.Register("write-flags", &write_flags_str, "Which SGMM parameters to " - "write: subset of gsnu"); - tcfg.Register(&po); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - if (po.NumArgs() <= 0 || (po.NumArgs() % 4 != 0)) { - po.PrintUsage(); - exit(1); - } - // How many 4-tuples of model, stats, output model, output occs - int32 num_models = po.NumArgs()/4; - sgmms_in.resize(num_models, NULL); - sgmm_accs_in.resize(num_models, NULL); - trans_models_in.resize(num_models, NULL); - - if (!split_substates.empty()) { - SplitStringToIntegers(split_substates, " ", true /*omit empty strings*/, - &split_substates_int); - if (split_substates_int.size() == 1) { // Same #substates for all models - int32 tmp_int = split_substates_int[0]; - split_substates_int.resize(num_models, tmp_int); - } - if (split_substates_int.size() != num_models) { - KALDI_ERR << "Found " << split_substates_int.size() << " splitting " - << "targets; expecting 1 or " << num_models; - } - } - - SgmmUpdateFlagsType update_flags = StringToSgmmUpdateFlags(update_flags_str); - SgmmWriteFlagsType write_flags = StringToSgmmWriteFlags(write_flags_str); - - std::vector model_out_filenames(num_models); - std::vector occs_out_filenames(num_models); - int32 phn_dim, spk_dim, num_gauss, feat_dim; - - for (int i = 0; i < num_models; ++i) { - std::string model_in_filename = po.GetArg(i*4+1), - stats_filename = po.GetArg(i*4+2); - model_out_filenames[i] = po.GetArg(i*4+3); - occs_out_filenames[i] = po.GetArg(i*4+4); - - AmSgmm *am_sgmm = new AmSgmm(); - TransitionModel *trans_model = new TransitionModel(); - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model->Read(ki.Stream(), binary); - am_sgmm->Read(ki.Stream(), binary); - } - if (i == 0) { - phn_dim = am_sgmm->PhoneSpaceDim(); - spk_dim = am_sgmm->SpkSpaceDim(); - num_gauss = am_sgmm->NumGauss(); - feat_dim = am_sgmm->FeatureDim(); - } else { - if (am_sgmm->PhoneSpaceDim() != phn_dim) { - KALDI_ERR << "File '" << model_in_filename << "': mismatched " - << "phone-space dim: expecting " << phn_dim << ", found " - << am_sgmm->PhoneSpaceDim(); - } - if (am_sgmm->SpkSpaceDim() != spk_dim) { - KALDI_ERR << "File '" << model_in_filename << "': mismatched " - << "speaker-space dim: expecting " << spk_dim << ", found " - << am_sgmm->SpkSpaceDim(); - } - if (am_sgmm->NumGauss() != num_gauss) { - KALDI_ERR << "File '" << model_in_filename << "': mismatched UBM " - << "size: expecting " << num_gauss << ", found " - << am_sgmm->NumGauss(); - } - if (am_sgmm->FeatureDim() != feat_dim) { - KALDI_ERR << "File '" << model_in_filename << "': mismatched feature " - << "dim: expecting " << feat_dim << ", found " - << am_sgmm->FeatureDim(); - } - } - sgmms_in[i] = am_sgmm; - trans_models_in[i] = trans_model; - - Vector transition_accs; - MleAmSgmmAccs *sgmm_accs = new MleAmSgmmAccs(); - { - bool binary; - Input ki(stats_filename, &binary); - transition_accs.Read(ki.Stream(), binary); - sgmm_accs->Read(ki.Stream(), binary, false); - } - // Check consistency and print some diagnostics. - sgmm_accs->Check(*am_sgmm, true); - sgmm_accs_in[i] = sgmm_accs; - - if (update_flags & kSgmmTransitions) { // Update transition model. - BaseFloat objf_impr, count; - KALDI_LOG << "Updating transitions for model: " << model_in_filename; - trans_model->MleUpdate(transition_accs, tcfg, &objf_impr, &count); - KALDI_LOG << "Transition model update: average " << (objf_impr/count) - << " log-like improvement per frame over " << (count) - << " frames"; - } - } - - { // Update all the SGMMs together. - kaldi::MleAmSgmmUpdaterMulti multi_sgmm_updater(*sgmms_in[0], sgmm_opts); - multi_sgmm_updater.Update(sgmm_accs_in, sgmms_in, update_flags); - } - - for (int i = 0; i < num_models; ++i) { - Vector state_occs; - sgmm_accs_in[i]->GetStateOccupancies(&state_occs); - - if (!split_substates.empty()) { - sgmms_in[i]->SplitSubstates(state_occs, split_substates_int[i], perturb_factor, - power, max_cond); - sgmms_in[i]->ComputeDerivedVars(); // recompute normalizers... - } - - { - kaldi::Output ko(occs_out_filenames[i], false /* no binary write */); - state_occs.Write(ko.Stream(), false /* no binary write */); - } - - if (increase_phn_dim != 0 || increase_spk_dim != 0) { - // Feature normalizing transform matrix used to initialize the new columns - // of the phonetic- or speaker-space projection matrices. - kaldi::Matrix norm_xform; - ComputeFeatureNormalizer(sgmms_in[i]->full_ubm(), &norm_xform); - if (increase_phn_dim != 0) - sgmms_in[i]->IncreasePhoneSpaceDim(increase_phn_dim, norm_xform); - if (increase_spk_dim != 0) - sgmms_in[i]->IncreaseSpkSpaceDim(increase_spk_dim, norm_xform); - } - if (remove_speaker_space) { - KALDI_LOG << "Removing speaker space (projections N_)"; - sgmms_in[i]->RemoveSpeakerSpace(); - } - - { - Output ko(model_out_filenames[i], binary_write); - trans_models_in[i]->Write(ko.Stream(), binary_write); - sgmms_in[i]->Write(ko.Stream(), binary_write, write_flags); - KALDI_LOG << "Written model to " << model_out_filenames[i]; - } - } - return 0; - } catch(const std::exception& e) { - kaldi::DeletePointers(&sgmms_in); - kaldi::DeletePointers(&sgmm_accs_in); - kaldi::DeletePointers(&trans_models_in); - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-est-spkvecs-gpost.cc b/src/sgmmbin/sgmm-est-spkvecs-gpost.cc deleted file mode 100644 index 5f4e9078673..00000000000 --- a/src/sgmmbin/sgmm-est-spkvecs-gpost.cc +++ /dev/null @@ -1,223 +0,0 @@ -// sgmmbin/sgmm-est-spkvecs-gpost.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/transition-model.h" - -namespace kaldi { - -void AccumulateForUtterance(const Matrix &feats, - const SgmmGauPost &gpost, - const TransitionModel &trans_model, - const AmSgmm &am_sgmm, - const SgmmPerSpkDerivedVars &spk_vars, - MleSgmmSpeakerAccs *spk_stats) { - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - for (size_t i = 0; i < gpost.size(); i++) { - am_sgmm.ComputePerFrameVars(feats.Row(i), - gpost[i].gselect, spk_vars, 0.0, - &per_frame_vars); - - for (size_t j = 0; j < gpost[i].tids.size(); j++) { - int32 pdf_id = trans_model.TransitionIdToPdf(gpost[i].tids[j]); - spk_stats->AccumulateFromPosteriors(am_sgmm, per_frame_vars, - gpost[i].posteriors[j], pdf_id); - } - } -} - -} // end namespace kaldi - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Estimate SGMM speaker vectors, either per utterance or for the " - "supplied set of speakers (with spk2utt option).\n" - "Reads Gaussian-level posteriors. Writes to a table of vectors.\n" - "Usage: sgmm-est-spkvecs-gpost [options] " - " \n"; - - ParseOptions po(usage); - string spk2utt_rspecifier, spkvecs_rspecifier; - BaseFloat min_count = 100; - BaseFloat rand_prune = 1.0e-05; - - po.Register("spk2utt", &spk2utt_rspecifier, - "File to read speaker to utterance-list map from."); - po.Register("spkvec-min-count", &min_count, - "Minimum count needed to estimate speaker vectors"); - po.Register("rand-prune", &rand_prune, "Randomized pruning parameter for posteriors (more->faster)."); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors to use during aligment (rspecifier)"); - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - string model_rxfilename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - gpost_rspecifier = po.GetArg(3), - vecs_wspecifier = po.GetArg(4); - - TransitionModel trans_model; - AmSgmm am_sgmm; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - MleSgmmSpeakerAccs spk_stats(am_sgmm, rand_prune); - - RandomAccessSgmmGauPostReader gpost_reader(gpost_rspecifier); - - RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier); - - BaseFloatVectorWriter vecs_writer(vecs_wspecifier); - - double tot_impr = 0.0, tot_t = 0.0; - int32 num_done = 0, num_no_gpost = 0, num_other_error = 0; - - if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - spk_stats.Clear(); - string spk = spk2utt_reader.Key(); - const vector &uttlist = spk2utt_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(spk)) { - spk_vars.v_s = spkvecs_reader.Value(spk); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << spk; - } - } // else spk_vars is "empty" - - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Did not find features for utterance " << utt; - continue; - } - if (!gpost_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posteriors for utterance " << utt; - num_no_gpost++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - const SgmmGauPost &gpost = gpost_reader.Value(utt); - if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "gpost vector has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - - AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars, &spk_stats); - num_done++; - } // end looping over all utterances of the current speaker - - BaseFloat impr, spk_tot_t; - { // Compute the spk_vec and write it out. - Vector spk_vec(am_sgmm.SpkSpaceDim(), kSetZero); - if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s); - spk_stats.Update(min_count, &spk_vec, &impr, &spk_tot_t); - vecs_writer.Write(spk, spk_vec); - } - KALDI_LOG << "For speaker " << spk << ", auxf-impr from speaker vector is " - << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames.\n"; - tot_impr += impr; - tot_t += spk_tot_t; - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - if (!gpost_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posts for utterance " - << utt; - num_no_gpost++; - continue; - } - const Matrix &feats = feature_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - } - } // else spk_vars is "empty" - const SgmmGauPost &gpost = gpost_reader.Value(utt); - - if (static_cast(gpost.size()) != feats.NumRows()) { - KALDI_WARN << "gpost has wrong size " << (gpost.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - num_done++; - - spk_stats.Clear(); - - AccumulateForUtterance(feats, gpost, trans_model, am_sgmm, spk_vars, &spk_stats); - - BaseFloat impr, utt_tot_t; - { // Compute the spk_vec and write it out. - Vector spk_vec(am_sgmm.SpkSpaceDim(), kSetZero); - if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s); - spk_stats.Update(min_count, &spk_vec, &impr, &utt_tot_t); - vecs_writer.Write(utt, spk_vec); - } - KALDI_LOG << "For utterance " << utt << ", auxf-impr from speaker vectors is " - << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames."; - tot_impr += impr; - tot_t += utt_tot_t; - } - } - - KALDI_LOG << "Done " << num_done << " files, " << num_no_gpost - << " with no gposts, " << num_other_error << " with other errors."; - KALDI_LOG << "Overall auxf impr per frame is " << (tot_impr / tot_t) - << " over " << tot_t << " frames."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/sgmmbin/sgmm-est-spkvecs.cc b/src/sgmmbin/sgmm-est-spkvecs.cc deleted file mode 100644 index c71897d13c5..00000000000 --- a/src/sgmmbin/sgmm-est-spkvecs.cc +++ /dev/null @@ -1,257 +0,0 @@ -// sgmmbin/sgmm-est-spkvecs.cc - -// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; -#include -using std::vector; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/transition-model.h" -#include "hmm/posterior.h" - -namespace kaldi { - -void AccumulateForUtterance(const Matrix &feats, - const Posterior &post, - const TransitionModel &trans_model, - const AmSgmm &am_sgmm, - const SgmmGselectConfig &gselect_opts, - const vector< vector > &gselect, - const SgmmPerSpkDerivedVars &spk_vars, - MleSgmmSpeakerAccs *spk_stats) { - kaldi::SgmmPerFrameDerivedVars per_frame_vars; - - Posterior pdf_post; - ConvertPosteriorToPdfs(trans_model, post, &pdf_post); - for (size_t i = 0; i < post.size(); i++) { - std::vector this_gselect; - if (!gselect.empty()) - this_gselect = gselect[i]; - else - am_sgmm.GaussianSelection(gselect_opts, feats.Row(i), &this_gselect); - am_sgmm.ComputePerFrameVars(feats.Row(i), this_gselect, spk_vars, 0.0, &per_frame_vars); - - for (size_t j = 0; j < pdf_post[i].size(); j++) { - int32 pdf_id = pdf_post[i][j].first; - spk_stats->Accumulate(am_sgmm, per_frame_vars, pdf_id, pdf_post[i][j].second); - } - } -} - -} // end namespace kaldi - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - using namespace kaldi; - const char *usage = - "Estimate SGMM speaker vectors, either per utterance or for the " - "supplied set of speakers (with spk2utt option).\n" - "Reads Gaussian-level posteriors. Writes to a table of vectors.\n" - "Usage: sgmm-est-spkvecs [options] " - " \n"; - - ParseOptions po(usage); - string gselect_rspecifier, spk2utt_rspecifier, spkvecs_rspecifier; - BaseFloat min_count = 100; - BaseFloat rand_prune = 1.0e-05; - SgmmGselectConfig gselect_opts; - - gselect_opts.Register(&po); - po.Register("gselect", &gselect_rspecifier, - "File to read precomputed per-frame Gaussian indices from."); - po.Register("spk2utt", &spk2utt_rspecifier, - "File to read speaker to utterance-list map from."); - po.Register("spkvec-min-count", &min_count, - "Minimum count needed to estimate speaker vectors"); - po.Register("rand-prune", &rand_prune, "Pruning threshold for posteriors"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors to use during aligment (rspecifier)"); - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - string model_rxfilename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - post_rspecifier = po.GetArg(3), - vecs_wspecifier = po.GetArg(4); - - TransitionModel trans_model; - AmSgmm am_sgmm; - { - bool binary; - Input ki(model_rxfilename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - MleSgmmSpeakerAccs spk_stats(am_sgmm, rand_prune); - - RandomAccessPosteriorReader post_reader(post_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - - RandomAccessBaseFloatVectorReader spkvecs_reader(spkvecs_rspecifier); - - BaseFloatVectorWriter vecs_writer(vecs_wspecifier); - - double tot_impr = 0.0, tot_t = 0.0; - int32 num_done = 0, num_no_post = 0, num_other_error = 0; - std::vector > empty_gselect; - - if (!spk2utt_rspecifier.empty()) { // per-speaker adaptation - SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - - for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { - spk_stats.Clear(); - string spk = spk2utt_reader.Key(); - const vector &uttlist = spk2utt_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(spk)) { - spk_vars.v_s = spkvecs_reader.Value(spk); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << spk; - } - } // else spk_vars is "empty" - - for (size_t i = 0; i < uttlist.size(); i++) { - std::string utt = uttlist[i]; - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Did not find features for utterance " << utt; - continue; - } - if (!post_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posteriors for utterance " << utt; - num_no_post++; - continue; - } - const Matrix &feats = feature_reader.Value(utt); - const Posterior &post = post_reader.Value(utt); - if (static_cast(post.size()) != feats.NumRows()) { - KALDI_WARN << "Posterior vector has wrong size " << (post.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - bool has_gselect = false; - if (gselect_reader.IsOpen()) { - has_gselect = gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == feats.NumRows(); - if (!has_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - } - const std::vector > *gselect = - (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - AccumulateForUtterance(feats, post, trans_model, am_sgmm, gselect_opts, *gselect, spk_vars, &spk_stats); - num_done++; - } // end looping over all utterances of the current speaker - - BaseFloat impr, spk_tot_t; - { // Compute the spk_vec and write it out. - Vector spk_vec(am_sgmm.SpkSpaceDim(), kSetZero); - if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s); - spk_stats.Update(min_count, &spk_vec, &impr, &spk_tot_t); - vecs_writer.Write(spk, spk_vec); - } - KALDI_LOG << "For speaker " << spk << ", auxf-impr from speaker vector is " - << (impr/spk_tot_t) << ", over " << spk_tot_t << " frames."; - tot_impr += impr; - tot_t += spk_tot_t; - } // end looping over speakers - } else { // per-utterance adaptation - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - if (!post_reader.HasKey(utt)) { - KALDI_WARN << "Did not find posts for utterance " - << utt; - num_no_post++; - continue; - } - const Matrix &feats = feature_reader.Value(); - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - } - } // else spk_vars is "empty" - const Posterior &post = post_reader.Value(utt); - - if (static_cast(post.size()) != feats.NumRows()) { - KALDI_WARN << "Posterior has wrong size " << (post.size()) - << " vs. " << (feats.NumRows()); - num_other_error++; - continue; - } - num_done++; - - spk_stats.Clear(); - bool has_gselect = false; - if (gselect_reader.IsOpen()) { - has_gselect = gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == feats.NumRows(); - if (!has_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - } - const std::vector > *gselect = - (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - AccumulateForUtterance(feats, post, trans_model, am_sgmm, gselect_opts, *gselect, spk_vars, &spk_stats); - - BaseFloat impr, utt_tot_t; - { // Compute the spk_vec and write it out. - Vector spk_vec(am_sgmm.SpkSpaceDim(), kSetZero); - if (spk_vars.v_s.Dim() != 0) spk_vec.CopyFromVec(spk_vars.v_s); - spk_stats.Update(min_count, &spk_vec, &impr, &utt_tot_t); - vecs_writer.Write(utt, spk_vec); - } - KALDI_LOG << "For utterance " << utt << ", auxf-impr from speaker vectors is " - << (impr/utt_tot_t) << ", over " << utt_tot_t << " frames."; - tot_impr += impr; - tot_t += utt_tot_t; - } - } - - KALDI_LOG << "Overall auxf impr per frame is " - << (tot_impr / tot_t) << " over " << tot_t << " frames."; - KALDI_LOG << "Done " << num_done << " files, " << num_no_post - << " with no posts, " << num_other_error << " with other errors."; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - diff --git a/src/sgmmbin/sgmm-est.cc b/src/sgmmbin/sgmm-est.cc deleted file mode 100644 index fdb0bc36125..00000000000 --- a/src/sgmmbin/sgmm-est.cc +++ /dev/null @@ -1,172 +0,0 @@ -// sgmmbin/sgmm-est.cc - -// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "thread/kaldi-thread.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - const char *usage = - "Estimate SGMM model parameters from accumulated stats.\n" - "Usage: sgmm-est [options] \n"; - - bool binary_write = true; - std::string update_flags_str = "vMNwcSt"; - std::string write_flags_str = "gsnu"; - kaldi::MleTransitionUpdateConfig tcfg; - kaldi::MleAmSgmmOptions sgmm_opts; - int32 split_substates = 0; - int32 increase_phn_dim = 0; - int32 increase_spk_dim = 0; - bool remove_speaker_space = false; - BaseFloat perturb_factor = 0.01; - BaseFloat power = 0.2; - BaseFloat max_cond = 100; - std::string occs_out_filename; - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("split-substates", &split_substates, "Increase number of " - "substates to this overall target."); - po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space " - "dimension as far as allowed towards this target."); - po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space " - "dimension as far as allowed towards this target."); - po.Register("remove-speaker-space", &remove_speaker_space, "Remove speaker-specific " - "projections N"); - po.Register("power", &power, "Exponent for substate occupancies used while " - "splitting substates."); - po.Register("perturb-factor", &perturb_factor, "Perturbation factor for " - "state vectors while splitting substates."); - po.Register("max-cond-split", &max_cond, "Max condition number of smoothing " - "matrix used in substate splitting."); - po.Register("write-occs", &occs_out_filename, "File to write pdf " - "occupantion counts to."); - po.Register("update-flags", &update_flags_str, "Which SGMM parameters to " - "update: subset of vMNwcSt."); - po.Register("write-flags", &write_flags_str, "Which SGMM parameters to " - "write: subset of gsnu"); - po.Register("num-threads", &g_num_threads, "Number of threads to use in " - "weight update and normalizer computation"); - tcfg.Register(&po); - sgmm_opts.Register(&po); - - po.Read(argc, argv); - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - std::string model_in_filename = po.GetArg(1), - stats_filename = po.GetArg(2), - model_out_filename = po.GetArg(3); - - kaldi::SgmmUpdateFlagsType update_flags = - StringToSgmmUpdateFlags(update_flags_str); - kaldi::SgmmWriteFlagsType write_flags = - StringToSgmmWriteFlags(write_flags_str); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - Vector transition_accs; - MleAmSgmmAccs sgmm_accs; - { - bool binary; - Input ki(stats_filename, &binary); - transition_accs.Read(ki.Stream(), binary); - sgmm_accs.Read(ki.Stream(), binary, true); // true == add; doesn't matter here. - } - - if (update_flags & kSgmmTransitions) { // Update transition model. - BaseFloat objf_impr, count; - trans_model.MleUpdate(transition_accs, tcfg, &objf_impr, &count); - KALDI_LOG << "Transition model update: Overall " << (objf_impr/count) - << " log-like improvement per frame over " << (count) - << " frames."; - } - - sgmm_accs.Check(am_sgmm, true); // Will check consistency and print some diagnostics. - - { // Do the update. - kaldi::MleAmSgmmUpdater updater(sgmm_opts); - updater.Update(sgmm_accs, &am_sgmm, update_flags); - } - - if (split_substates != 0 || !occs_out_filename.empty()) { // get state occs - Vector pdf_occs; - sgmm_accs.GetStateOccupancies(&pdf_occs); - - if (split_substates != 0) { - am_sgmm.SplitSubstates(pdf_occs, split_substates, perturb_factor, - power, max_cond); - am_sgmm.ComputeDerivedVars(); // recompute normalizers... - } - - if (!occs_out_filename.empty()) { - bool binary_write = false; - kaldi::Output ko(occs_out_filename, binary_write); - pdf_occs.Write(ko.Stream(), binary_write); - } - } - - if (increase_phn_dim != 0 || increase_spk_dim != 0) { - // Feature normalizing transform matrix used to initialize the new columns - // of the phonetic- or speaker-space projection matrices. - kaldi::Matrix norm_xform; - ComputeFeatureNormalizer(am_sgmm.full_ubm(), &norm_xform); - if (increase_phn_dim != 0) - am_sgmm.IncreasePhoneSpaceDim(increase_phn_dim, norm_xform); - if (increase_spk_dim != 0) - am_sgmm.IncreaseSpkSpaceDim(increase_spk_dim, norm_xform); - } - if (remove_speaker_space) { - KALDI_LOG << "Removing speaker space (projections N_)"; - am_sgmm.RemoveSpeakerSpace(); - } - - { - Output ko(model_out_filename, binary_write); - trans_model.Write(ko.Stream(), binary_write); - am_sgmm.Write(ko.Stream(), binary_write, write_flags); - } - - - KALDI_LOG << "Written model to " << model_out_filename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-gselect.cc b/src/sgmmbin/sgmm-gselect.cc deleted file mode 100644 index 7234406f9ce..00000000000 --- a/src/sgmmbin/sgmm-gselect.cc +++ /dev/null @@ -1,125 +0,0 @@ -// sgmmbin/sgmm-gselect.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - const char *usage = - "Precompute Gaussian indices for SGMM training " - "Usage: sgmm-gselect [options] \n" - "e.g.: sgmm-gselect 1.sgmm \"ark:feature-command |\" ark:1.gs\n" - "Note: you can do the same thing by combining the programs sgmm-write-ubm, fgmm-global-to-gmm,\n" - "gmm-gselect and fgmm-gselect\n"; - - ParseOptions po(usage); - kaldi::SgmmGselectConfig sgmm_opts; - std::string preselect_rspecifier; - std::string likelihood_wspecifier; - po.Register("preselect", &preselect_rspecifier, "Rspecifier for sets of Gaussians to " - "limit gselect to (e.g. for gender dependent systems)"); - po.Register("write-likes", &likelihood_wspecifier, "Wspecifier for likelihoods per " - "utterance"); - sgmm_opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - gselect_wspecifier = po.GetArg(3); - - using namespace kaldi; - typedef kaldi::int32 int32; - - AmSgmm am_sgmm; - { - bool binary; - Input ki(model_filename, &binary); - TransitionModel trans_model; - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - double tot_like = 0.0; - kaldi::int64 tot_t = 0; - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - Int32VectorVectorWriter gselect_writer(gselect_wspecifier); - BaseFloatWriter likelihood_writer(likelihood_wspecifier); - RandomAccessInt32VectorReader preselect_reader(preselect_rspecifier); - - int32 num_done = 0, num_err = 0; - for (; !feature_reader.Done(); feature_reader.Next()) { - int32 tot_t_this_file = 0; double tot_like_this_file = 0; - std::string utt = feature_reader.Key(); - const Matrix &mat = feature_reader.Value(); - std::vector > gselect_vec(mat.NumRows()); - tot_t_this_file += mat.NumRows(); - if(preselect_rspecifier != "") { // e.g. gender dependent. - if (!preselect_reader.HasKey(utt)) { - KALDI_WARN << "No preselect information for utterance " << utt; - num_err++; - continue; - } - const std::vector &preselect = preselect_reader.Value(utt); - KALDI_ASSERT(!preselect.empty()); - for (int32 i = 0; i < mat.NumRows(); i++) - tot_like_this_file += - am_sgmm.GaussianSelectionPreselect(sgmm_opts, mat.Row(i), - preselect, &(gselect_vec[i])); - } else { - for (int32 i = 0; i < mat.NumRows(); i++) - tot_like_this_file += am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &(gselect_vec[i])); - } - gselect_writer.Write(utt, gselect_vec); - if (num_done % 10 == 0) - KALDI_LOG << "For " << num_done << "'th file, average UBM likelihood over " - << tot_t_this_file << " frames is " - << (tot_like_this_file/tot_t_this_file); - tot_t += tot_t_this_file; - tot_like += tot_like_this_file; - - if(likelihood_wspecifier != "") - likelihood_writer.Write(utt, tot_like_this_file); - num_done++; - } - - KALDI_LOG << "Done " << num_done << " files, " << num_err - << " with errors, average UBM log-likelihood is " - << (tot_like/tot_t) << " over " << tot_t << " frames."; - - - if (num_done != 0) return 0; - else return 1; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-info.cc b/src/sgmmbin/sgmm-info.cc deleted file mode 100644 index c5e5dc70686..00000000000 --- a/src/sgmmbin/sgmm-info.cc +++ /dev/null @@ -1,110 +0,0 @@ -// sgmmbin/sgmm-info.cc - -// Copyright 2012 Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "base/kaldi-common.h" -#include "util/common-utils.h" - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - const char *usage = - "Print various information about an SGMM.\n" - "Usage: sgmm-info [options] [model-in2 ... ]\n"; - - bool sgmm_detailed = false; - bool trans_detailed = false; - - ParseOptions po(usage); - po.Register("sgmm-detailed", &sgmm_detailed, - "Print detailed information about substates."); - po.Register("trans-detailed", &trans_detailed, - "Print detailed information about transition model."); - - po.Read(argc, argv); - if (po.NumArgs() < 1) { - po.PrintUsage(); - exit(1); - } - - for (int i = 1, max = po.NumArgs(); i <= max; ++i) { - std::string model_in_filename = po.GetArg(i); - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - { - using namespace std; - cout.setf(ios::left); - cout << "\nModel file: " << model_in_filename << endl; - cout << " SGMM information:\n" - << setw(40) << " # of HMM states" << am_sgmm.NumPdfs() << endl - << setw(40) << " # of Gaussians per state" << am_sgmm.NumGauss() << endl - << setw(40) << " Dimension of phone vector space" - << am_sgmm.PhoneSpaceDim() << endl - << setw(40) << " Dimension of speaker vector space" - << am_sgmm.SpkSpaceDim() << endl - << setw(40) << " Dimension of feature vectors" - << am_sgmm.FeatureDim() << endl; - int32 total_substates = 0; - for (int32 j = 0; j < am_sgmm.NumPdfs(); j++) { - total_substates += am_sgmm.NumSubstates(j); - if (sgmm_detailed) { - cout << " # of substates for state " << setw(13) << j - << am_sgmm.NumSubstates(j) << endl; - } - } - cout << setw(40) << " Total # of substates " << total_substates << endl; - - cout << "\nTransition model information:\n" - << setw(40) << " # of HMM states" << trans_model.NumPdfs() << endl - << setw(40) << " # of transition states" - << trans_model.NumTransitionStates() << endl; - int32 total_indices = 0; - for (int32 s = 0; s < trans_model.NumTransitionStates(); s++) { - total_indices += trans_model.NumTransitionIndices(s); - if (trans_detailed) { - cout << " # of transition ids for state " << setw(8) << s - << trans_model.NumTransitionIndices(s) << endl; - } - } - cout << setw(40) << " Total # of transition ids " << total_indices - << endl; - } - } - - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-init-from-tree-stats.cc b/src/sgmmbin/sgmm-init-from-tree-stats.cc deleted file mode 100644 index 0802507c126..00000000000 --- a/src/sgmmbin/sgmm-init-from-tree-stats.cc +++ /dev/null @@ -1,147 +0,0 @@ -// sgmmbin/sgmm-init-from-tree-stats.cc - -// Copyright 2012 Arnab Ghoshal Johns Hopkins University (Author: Daniel Povey) -// Copyright 2009-2011 Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "sgmm/am-sgmm.h" -#include "sgmm/sgmm-clusterable.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/transition-model.h" -#include "tree/context-dep.h" -#include "tree/build-tree-utils.h" - - - -namespace kaldi { -void InitAndOutputSgmm(const HmmTopology &topo, - const AmSgmm &am_sgmm, - const ContextDependency &ctx_dep, - const std::vector > &H, - const BuildTreeStatsType &stats, - const std::string &sgmm_wxfilename, - bool binary) { - int32 num_pdfs = ctx_dep.NumPdfs(); - AmSgmm am_sgmm_out; - am_sgmm_out.CopyGlobalsInitVecs(am_sgmm, am_sgmm.PhoneSpaceDim(), - am_sgmm.SpkSpaceDim(), num_pdfs); - MleAmSgmmOptions opts; // Use default options; we can change this later - // if we need to use any non-default options. - MleAmSgmmUpdater updater(opts); - - std::vector split_stats; - SplitStatsByMap(stats, ctx_dep.ToPdfMap(), &split_stats); - // Make sure each leaf has stats. - for (size_t i = 0; i < split_stats.size(); i++) - KALDI_ASSERT(! split_stats[i].empty() && "Tree has leaves with no stats." - " Modify your roots file as necessary to fix this."); - std::vector summed_stats; - SumStatsVec(split_stats, &summed_stats); - - std::vector &summed_sgmm_stats = - *(reinterpret_cast*> (&summed_stats)); - - for (int32 iter = 0; iter < 5; iter++) { // Update for - // several iterations; we're starting from zero so we won't - // converge exactly on the first iteration. - updater.UpdatePhoneVectorsCheckedFromClusterable(summed_sgmm_stats, - H, - &am_sgmm_out); - } - DeletePointers(&summed_stats); - - TransitionModel trans_model_out(ctx_dep, topo); - { - Output ko(sgmm_wxfilename, binary); - am_sgmm_out.ComputeNormalizers(); - trans_model_out.Write(ko.Stream(), binary); - am_sgmm_out.Write(ko.Stream(), binary, kSgmmWriteAll); - } -} - -} - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - - const char *usage = - "Initialize an SGMM from a previously built SGMM, a tree, \n" - "and SGMM-type tree stats\n" - "Usage: sgmm-init-from-tree-stats [options] \n"; - - bool binary = true; - kaldi::ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string sgmm_in_filename = po.GetArg(1), - tree_in_filename = po.GetArg(2), - tree_stats_filename = po.GetArg(3), - sgmm_out_filename = po.GetArg(4); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(sgmm_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - const HmmTopology &topo = trans_model.GetTopo(); - std::vector > H; - am_sgmm.ComputeH(&H); - - ContextDependency ctx_dep; - { - bool binary_in; - Input ki(tree_in_filename.c_str(), &binary_in); - ctx_dep.Read(ki.Stream(), binary_in); - } - - BuildTreeStatsType stats; - { - bool binary_in; - SgmmClusterable sc(am_sgmm, H); // dummy stats needed to provide - // type info, and access to am_sgmm and H. - Input ki(tree_stats_filename, &binary_in); - ReadBuildTreeStats(ki.Stream(), binary_in, sc, &stats); - } - KALDI_LOG << "Number of separate statistics is " << stats.size(); - - InitAndOutputSgmm(topo, am_sgmm, ctx_dep, H, stats, - sgmm_out_filename, binary); - - KALDI_LOG << "Written model to " << sgmm_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-init.cc b/src/sgmmbin/sgmm-init.cc deleted file mode 100644 index f90ca3e5650..00000000000 --- a/src/sgmmbin/sgmm-init.cc +++ /dev/null @@ -1,111 +0,0 @@ -// sgmmbin/sgmm-init.cc - -// Copyright 2012 Arnab Ghoshal -// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/common-utils.h" -#include "gmm/am-diag-gmm.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "tree/context-dep.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - - const char *usage = - "Initialize an SGMM from a trained full-covariance UBM and a specified" - " model topology.\n" - "Usage: sgmm-init [options] \n" - "The argument can be a UBM (the default case) or another\n" - "SGMM (if the --init-from-sgmm flag is used).\n"; - - bool binary = true, init_from_sgmm = false; - int32 phn_space_dim = 0, spk_space_dim = 0; - kaldi::ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("phn-space-dim", &phn_space_dim, "Phonetic space dimension."); - po.Register("spk-space-dim", &spk_space_dim, "Speaker space dimension."); - po.Register("init-from-sgmm", &init_from_sgmm, - "Initialize from another SGMM (instead of a UBM)."); - - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string topo_in_filename = po.GetArg(1), - tree_in_filename = po.GetArg(2), - init_model_filename = po.GetArg(3), - sgmm_out_filename = po.GetArg(4); - - ContextDependency ctx_dep; - { - bool binary_in; - Input ki(tree_in_filename.c_str(), &binary_in); - ctx_dep.Read(ki.Stream(), binary_in); - } - - - HmmTopology topo; - ReadKaldiObject(topo_in_filename, &topo); - - TransitionModel trans_model(ctx_dep, topo); - - kaldi::AmSgmm sgmm; - if (init_from_sgmm) { - kaldi::AmSgmm init_sgmm; - { - bool binary_read; - TransitionModel tmp_trans; - kaldi::Input ki(init_model_filename, &binary_read); - tmp_trans.Read(ki.Stream(), binary_read); - init_sgmm.Read(ki.Stream(), binary_read); - } - sgmm.CopyGlobalsInitVecs(init_sgmm, phn_space_dim, spk_space_dim, - trans_model.NumPdfs()); - } else { - kaldi::FullGmm ubm; - { - bool binary_read; - kaldi::Input ki(init_model_filename, &binary_read); - ubm.Read(ki.Stream(), binary_read); - } - sgmm.InitializeFromFullGmm(ubm, trans_model.NumPdfs(), phn_space_dim, - spk_space_dim); - } - sgmm.ComputeNormalizers(); - - { - kaldi::Output ko(sgmm_out_filename, binary); - trans_model.Write(ko.Stream(), binary); - sgmm.Write(ko.Stream(), binary, kaldi::kSgmmWriteAll); - } - - KALDI_LOG << "Written model to " << sgmm_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-latgen-faster.cc b/src/sgmmbin/sgmm-latgen-faster.cc deleted file mode 100644 index 3162b1f72f9..00000000000 --- a/src/sgmmbin/sgmm-latgen-faster.cc +++ /dev/null @@ -1,271 +0,0 @@ -// sgmmbin/sgmm-latgen-faster.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation; -// Johns Hopkins University (author: Daniel Povey) -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" -#include "decoder/decoder-wrappers.h" -#include "sgmm/decodable-am-sgmm.h" -#include "base/timer.h" - -namespace kaldi { - -// the reference arguments at the beginning are not const as the style guide -// requires, but are best viewed as inputs. -bool ProcessUtterance(LatticeFasterDecoder &decoder, - const AmSgmm &am_sgmm, - const TransitionModel &trans_model, - const SgmmGselectConfig &sgmm_opts, - double log_prune, - double acoustic_scale, - const Matrix &features, - RandomAccessInt32VectorVectorReader &gselect_reader, - RandomAccessBaseFloatVectorReaderMapped &spkvecs_reader, - const fst::SymbolTable *word_syms, - const std::string &utt, - bool determinize, - bool allow_partial, - Int32VectorWriter *alignments_writer, - Int32VectorWriter *words_writer, - CompactLatticeWriter *compact_lattice_writer, - LatticeWriter *lattice_writer, - double *like_ptr) { // puts utterance's like in like_ptr on success. - using fst::VectorFst; - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt << ", not decoding this utterance"; - return false; // We could use zero, but probably the user would want to know about this - // (this would normally be a script error or some kind of failure). - } - } - bool has_gselect = false; - if (gselect_reader.IsOpen()) { - has_gselect = gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == features.NumRows(); - if (!has_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - } - std::vector > empty_gselect; - const std::vector > *gselect = - (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars, - trans_model, features, *gselect, - log_prune, acoustic_scale); - - return DecodeUtteranceLatticeFaster( - decoder, sgmm_decodable, trans_model, word_syms, utt, acoustic_scale, - determinize, allow_partial, alignments_writer, words_writer, - compact_lattice_writer, lattice_writer, like_ptr); -} - -} // end namespace kaldi - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - using fst::SymbolTable; - using fst::VectorFst; - using fst::StdArc; - - const char *usage = - "Decode features using SGMM-based model.\n" - "Usage: sgmm-latgen-faster [options] (|) " - " [ [] ]\n"; - ParseOptions po(usage); - BaseFloat acoustic_scale = 0.1; - bool allow_partial = false; - BaseFloat log_prune = 5.0; - string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier, - utt2spk_rspecifier; - - LatticeFasterDecoderConfig decoder_opts; - SgmmGselectConfig sgmm_opts; - decoder_opts.Register(&po); - sgmm_opts.Register(&po); - - po.Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - po.Register("log-prune", &log_prune, - "Pruning beam used to reduce number of exp() evaluations."); - po.Register("word-symbol-table", &word_syms_filename, - "Symbol table for words [for debug output]"); - po.Register("allow-partial", &allow_partial, - "Produce output even when final state was not reached"); - po.Register("gselect", &gselect_rspecifier, - "rspecifier for precomputed per-frame Gaussian indices."); - po.Register("spk-vecs", &spkvecs_rspecifier, - "rspecifier for speaker vectors"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Read(argc, argv); - - if (po.NumArgs() < 4 || po.NumArgs() > 6) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - fst_in_str = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - lattice_wspecifier = po.GetArg(4), - words_wspecifier = po.GetOptArg(5), - alignment_wspecifier = po.GetOptArg(6); - - TransitionModel trans_model; - kaldi::AmSgmm am_sgmm; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - CompactLatticeWriter compact_lattice_writer; - LatticeWriter lattice_writer; - bool determinize = decoder_opts.determinize_lattice; - if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) - : lattice_writer.Open(lattice_wspecifier))) - KALDI_ERR << "Could not open table for writing lattices: " - << lattice_wspecifier; - - Int32VectorWriter words_writer(words_wspecifier); - - Int32VectorWriter alignment_writer(alignment_wspecifier); - - fst::SymbolTable *word_syms = NULL; - if (word_syms_filename != "") - if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) - KALDI_ERR << "Could not read symbol table from file " - << word_syms_filename; - - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - - BaseFloat tot_like = 0.0; - kaldi::int64 frame_count = 0; - int num_success = 0, num_fail = 0; - - Timer timer; - - if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { // a single FST. - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - // It's important that we initialize decode_fst after feature_reader, as it - // can prevent crashes on systems installed without enough virtual memory. - // It has to do with what happens on UNIX systems if you call fork() on a - // large process: the page-table entries are duplicated, which requires a - // lot of virtual memory. - VectorFst *decode_fst = fst::ReadFstKaldi(fst_in_str); - timer.Reset(); // exclude graph loading time. - - { - LatticeFasterDecoder decoder(*decode_fst, decoder_opts); - - const std::vector > empty_gselect; - - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - const Matrix &features(feature_reader.Value()); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_fail++; - continue; - } - double like; - if (ProcessUtterance(decoder, am_sgmm, trans_model, sgmm_opts, log_prune, acoustic_scale, - features, gselect_reader, spkvecs_reader, word_syms, - utt, determinize, allow_partial, - &alignment_writer, &words_writer, &compact_lattice_writer, - &lattice_writer, &like)) { - tot_like += like; - frame_count += features.NumRows(); - KALDI_LOG << "Log-like per frame for utterance " << utt << " is " - << (like / features.NumRows()) << " over " - << features.NumRows() << " frames."; - num_success++; - } else { num_fail++; } - } - } - delete decode_fst; // only safe to do this after decoder goes out of scope. - } else { // We have different FSTs for different utterances. - SequentialTableReader fst_reader(fst_in_str); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - for (; !fst_reader.Done(); fst_reader.Next()) { - std::string utt = fst_reader.Key(); - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "Not decoding utterance " << utt - << " because no features available."; - num_fail++; - continue; - } - const Matrix &features = feature_reader.Value(utt); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_fail++; - continue; - } - LatticeFasterDecoder decoder(fst_reader.Value(), decoder_opts); - double like; - if (ProcessUtterance(decoder, am_sgmm, trans_model, sgmm_opts, log_prune, acoustic_scale, - features, gselect_reader, spkvecs_reader, word_syms, - utt, determinize, allow_partial, - &alignment_writer, &words_writer, &compact_lattice_writer, - &lattice_writer, &like)) { - tot_like += like; - frame_count += features.NumRows(); - KALDI_LOG << "Log-like per frame for utterance " << utt << " is " - << (like / features.NumRows()) << " over " - << features.NumRows() << " frames."; - num_success++; - } else { num_fail++; } - } - } - double elapsed = timer.Elapsed(); - KALDI_LOG << "Time taken [excluding initialization] "<< elapsed - << "s: real-time factor assuming 100 frames/sec is " - << (elapsed*100.0/frame_count); - KALDI_LOG << "Done " << num_success << " utterances, failed for " - << num_fail; - KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count) - << " over " << frame_count << " frames."; - - delete word_syms; - return (num_success != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-latgen-simple.cc b/src/sgmmbin/sgmm-latgen-simple.cc deleted file mode 100644 index 69e272ba9c6..00000000000 --- a/src/sgmmbin/sgmm-latgen-simple.cc +++ /dev/null @@ -1,232 +0,0 @@ -// sgmmbin/sgmm-latgen-simple.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation -// 2013 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include -using std::string; - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" -#include "decoder/decoder-wrappers.h" -#include "sgmm/decodable-am-sgmm.h" -#include "base/timer.h" - -namespace kaldi { - -// the reference arguments at the beginning are not const as the style guide -// requires, but are best viewed as inputs. -bool ProcessUtterance(LatticeSimpleDecoder &decoder, - const AmSgmm &am_sgmm, - const TransitionModel &trans_model, - const SgmmGselectConfig &sgmm_opts, - double log_prune, - double acoustic_scale, - const Matrix &features, - RandomAccessInt32VectorVectorReader &gselect_reader, - RandomAccessBaseFloatVectorReaderMapped &spkvecs_reader, - const fst::SymbolTable *word_syms, - const std::string &utt, - bool determinize, - bool allow_partial, - Int32VectorWriter *alignments_writer, - Int32VectorWriter *words_writer, - CompactLatticeWriter *compact_lattice_writer, - LatticeWriter *lattice_writer, - double *like_ptr) { // puts utterance's like in like_ptr on success. - using fst::VectorFst; - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt << ", not decoding this utterance"; - return false; // We could use zero, but probably the user would want to know about this - // (this would normally be a script error or some kind of failure). - } - } - bool has_gselect = false; - if (gselect_reader.IsOpen()) { - has_gselect = gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == features.NumRows(); - if (!has_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - } - std::vector > empty_gselect; - const std::vector > *gselect = - (has_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - DecodableAmSgmmScaled sgmm_decodable(sgmm_opts, am_sgmm, spk_vars, - trans_model, features, *gselect, - log_prune, acoustic_scale); - - return DecodeUtteranceLatticeSimple( - decoder, sgmm_decodable, trans_model, word_syms, utt, acoustic_scale, - determinize, allow_partial, alignments_writer, words_writer, - compact_lattice_writer, lattice_writer, like_ptr); -} - -} // end namespace kaldi - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - using fst::SymbolTable; - using fst::VectorFst; - using fst::StdArc; - - const char *usage = - "Decode features using SGMM-based model.\n" - "Usage: sgmm-latgen-simple [options] " - " [ [] ]\n"; - ParseOptions po(usage); - BaseFloat acoustic_scale = 0.1; - bool allow_partial = false; - BaseFloat log_prune = 5.0; - string word_syms_filename, gselect_rspecifier, spkvecs_rspecifier, - utt2spk_rspecifier; - - LatticeSimpleDecoderConfig decoder_opts; - SgmmGselectConfig sgmm_opts; - decoder_opts.Register(&po); - sgmm_opts.Register(&po); - - po.Register("acoustic-scale", &acoustic_scale, - "Scaling factor for acoustic likelihoods"); - po.Register("log-prune", &log_prune, - "Pruning beam used to reduce number of exp() evaluations."); - po.Register("word-symbol-table", &word_syms_filename, - "Symbol table for words [for debug output]"); - po.Register("allow-partial", &allow_partial, - "Produce output even when final state was not reached"); - po.Register("gselect", &gselect_rspecifier, - "rspecifier for precomputed per-frame Gaussian indices."); - po.Register("spk-vecs", &spkvecs_rspecifier, - "rspecifier for speaker vectors"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Read(argc, argv); - - if (po.NumArgs() < 4 || po.NumArgs() > 6) { - po.PrintUsage(); - exit(1); - } - - std::string model_in_filename = po.GetArg(1), - fst_in_filename = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - lattice_wspecifier = po.GetArg(4), - words_wspecifier = po.GetOptArg(5), - alignment_wspecifier = po.GetOptArg(6); - - TransitionModel trans_model; - kaldi::AmSgmm am_sgmm; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - CompactLatticeWriter compact_lattice_writer; - LatticeWriter lattice_writer; - bool determinize = decoder_opts.determinize_lattice; - if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) - : lattice_writer.Open(lattice_wspecifier))) - KALDI_ERR << "Could not open table for writing lattices: " - << lattice_wspecifier; - - Int32VectorWriter words_writer(words_wspecifier); - - Int32VectorWriter alignment_writer(alignment_wspecifier); - - fst::SymbolTable *word_syms = NULL; - if (word_syms_filename != "") - if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) - KALDI_ERR << "Could not read symbol table from file " - << word_syms_filename; - - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - - // It's important that we initialize decode_fst after feature_reader, as it - // can prevent crashes on systems installed without enough virtual memory. - // It has to do with what happens on UNIX systems if you call fork() on a - // large process: the page-table entries are duplicated, which requires a - // lot of virtual memory. - VectorFst *decode_fst = fst::ReadFstKaldi(fst_in_filename); - - BaseFloat tot_like = 0.0; - kaldi::int64 frame_count = 0; - int num_success = 0, num_fail = 0; - LatticeSimpleDecoder decoder(*decode_fst, decoder_opts); - - Timer timer; - - for (; !feature_reader.Done(); feature_reader.Next()) { - string utt = feature_reader.Key(); - Matrix features(feature_reader.Value()); - feature_reader.FreeCurrent(); - if (features.NumRows() == 0) { - KALDI_WARN << "Zero-length utterance: " << utt; - num_fail++; - continue; - } - double like; - if (ProcessUtterance(decoder, am_sgmm, trans_model, sgmm_opts, log_prune, - acoustic_scale, features, gselect_reader, - spkvecs_reader, word_syms, utt, determinize, - allow_partial, &alignment_writer, &words_writer, - &compact_lattice_writer, &lattice_writer, &like)) { - tot_like += like; - frame_count += features.NumRows(); - KALDI_LOG << "Log-like per frame for utterance " << utt << " is " - << (like / features.NumRows()) << " over " - << features.NumRows() << " frames."; - num_success++; - } else num_fail++; - } - double elapsed = timer.Elapsed(); - KALDI_LOG << "Time taken [excluding initialization] "<< elapsed - << "s: real-time factor assuming 100 frames/sec is " - << (elapsed*100.0/frame_count); - KALDI_LOG << "Done " << num_success << " utterances, failed for " - << num_fail; - KALDI_LOG << "Overall log-likelihood per frame = " << (tot_like/frame_count) - << " over " << frame_count << " frames."; - - delete word_syms; - delete decode_fst; - return (num_success != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-mixup.cc b/src/sgmmbin/sgmm-mixup.cc deleted file mode 100644 index 36731587317..00000000000 --- a/src/sgmmbin/sgmm-mixup.cc +++ /dev/null @@ -1,145 +0,0 @@ -// sgmmbin/sgmm-mixup.cc - -// Copyright 2009-2011 Saarland University -// Author: Arnab Ghoshal - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - const char *usage = - "Increase number of sub-states or dimensions in SGMM\n" - "Usage: sgmm-mixup [options] \n" - "E.g. of mixing up:\n" - " sgmm-mixup --read-occs=1.occs --num-substates=10000 1.mdl 2.mdl\n" - "E.g. of increasing phonetic dim:\n" - " sgmm-mixup --increase-phn-dim=50 1.mdl 2.mdl\n" - "E.g. of increasing speaker dim:\n" - " sgmm-mixup --increase-spk-dim=50 1.mdl 2.mdl\n" - "E.g. of removing speaker space:\n" - " sgmm-mixup --remove-speaker-space 1.mdl 2.mdl\n" - "These modes may be combined.\n"; - - bool binary_write = true; - std::string write_flags_str = "gsnu"; - int32 split_substates = 0; - int32 increase_phn_dim = 0; - int32 increase_spk_dim = 0; - bool remove_speaker_space = false; - BaseFloat perturb_factor = 0.01; - BaseFloat power = 0.2; - BaseFloat max_cond = 100; - std::string occs_in_filename; - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("split-substates", &split_substates, "Increase number of " - "substates to this overall target."); - po.Register("increase-phn-dim", &increase_phn_dim, "Increase phone-space " - "dimension as far as allowed towards this target."); - po.Register("increase-spk-dim", &increase_spk_dim, "Increase speaker-space " - "dimension as far as allowed towards this target."); - po.Register("remove-speaker-space", &remove_speaker_space, "Remove speaker-specific " - "projections N"); - po.Register("power", &power, "Exponent for substate occupancies used while " - "splitting substates."); - po.Register("perturb-factor", &perturb_factor, "Perturbation factor for " - "state vectors while splitting substates."); - po.Register("max-cond-split", &max_cond, "Max condition number of smoothing " - "matrix used in substate splitting."); - po.Register("write-flags", &write_flags_str, "Which SGMM parameters to " - "write: subset of gsnu"); - po.Register("read-occs", &occs_in_filename, "Read occupancies from this file " - "(required for mixing up)"); - - po.Read(argc, argv); - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string model_in_filename = po.GetArg(1), - model_out_filename = po.GetArg(2); - - kaldi::SgmmWriteFlagsType write_flags = - StringToSgmmWriteFlags(write_flags_str); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - if (split_substates != 0) { - if (occs_in_filename.empty()) - KALDI_ERR << "The --split-substates option requires the --read-occs option"; - - Vector state_occs; - { - bool binary_in; - kaldi::Input ki(occs_in_filename, &binary_in); - state_occs.Read(ki.Stream(), binary_in); - } - - am_sgmm.SplitSubstates(state_occs, split_substates, perturb_factor, - power, max_cond); - am_sgmm.ComputeDerivedVars(); // recompute normalizers... - } - - if (increase_phn_dim != 0 || increase_spk_dim != 0) { - // Feature normalizing transform matrix used to initialize the new columns - // of the phonetic- or speaker-space projection matrices. - kaldi::Matrix norm_xform; - ComputeFeatureNormalizer(am_sgmm.full_ubm(), &norm_xform); - if (increase_phn_dim != 0) - am_sgmm.IncreasePhoneSpaceDim(increase_phn_dim, norm_xform); - if (increase_spk_dim != 0) - am_sgmm.IncreaseSpkSpaceDim(increase_spk_dim, norm_xform); - } - - if (remove_speaker_space) { - KALDI_LOG << "Removing speaker space (projections N_)"; - am_sgmm.RemoveSpeakerSpace(); - } - - { - Output ko(model_out_filename, binary_write); - trans_model.Write(ko.Stream(), binary_write); - am_sgmm.Write(ko.Stream(), binary_write, write_flags); - } - - KALDI_LOG << "Written model to " << model_out_filename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-normalize.cc b/src/sgmmbin/sgmm-normalize.cc deleted file mode 100644 index c41141207dc..00000000000 --- a/src/sgmmbin/sgmm-normalize.cc +++ /dev/null @@ -1,85 +0,0 @@ -// sgmmbin/sgmm-normalize.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - const char *usage = - "Renormalize SGMM so that within certain subsets of UBM Gaussians (typically \n" - "corresponding to gender), probabilities sum to one; write it out, including\n" - "normalizers." - "Note: gaussians-rspecifier will normally be \"ark:foo\" where foo looks like\n" - " m 0 1 2 3 4 5\n" - " f 6 7 8 9 10\n" - "Usage: sgmm-normalize [options] \n"; - - bool binary_write = true; - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - - po.Read(argc, argv); - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - std::string model_in_filename = po.GetArg(1), - gaussians_rspecifier = po.GetArg(2), - model_out_filename = po.GetArg(3); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - std::vector > norm_sets; - SequentialInt32VectorReader vec_reader(gaussians_rspecifier); - for (;!vec_reader.Done(); vec_reader.Next()) - norm_sets.push_back(vec_reader.Value()); - - am_sgmm.ComputeNormalizersNormalized(norm_sets); - - { - Output ko(model_out_filename, binary_write); - trans_model.Write(ko.Stream(), binary_write); - am_sgmm.Write(ko.Stream(), binary_write, kSgmmWriteAll); - } - - - KALDI_LOG << "Written model to " << model_out_filename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-post-to-gpost.cc b/src/sgmmbin/sgmm-post-to-gpost.cc deleted file mode 100644 index 9395b04fe74..00000000000 --- a/src/sgmmbin/sgmm-post-to-gpost.cc +++ /dev/null @@ -1,190 +0,0 @@ -// sgmmbin/sgmm-post-to-gpost.cc - -// Copyright 2009-2012 Saarland University Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -// 2014 Guoguo Chen - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/posterior.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - try { - const char *usage = - "Convert posteriors to Gaussian-level posteriors for SGMM training.\n" - "Usage: sgmm-post-to-gpost [options] " - " \n" - "e.g.: sgmm-post-to-gpost 1.mdl 1.ali scp:train.scp 'ark:ali-to-post ark:1.ali ark:-|' ark:-"; - - ParseOptions po(usage); - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - SgmmGselectConfig sgmm_opts; - po.Register("gselect", &gselect_rspecifier, "Precomputed Gaussian indices (rspecifier)"); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - sgmm_opts.Register(&po); - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - feature_rspecifier = po.GetArg(2), - posteriors_rspecifier = po.GetArg(3), - gpost_wspecifier = po.GetArg(4); - - using namespace kaldi; - typedef kaldi::int32 int32; - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - double tot_like = 0.0; - kaldi::int64 tot_t = 0; - - SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); - RandomAccessPosteriorReader posteriors_reader(posteriors_rspecifier); - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - - SgmmPerFrameDerivedVars per_frame_vars; - - SgmmGauPostWriter gpost_writer(gpost_wspecifier); - - int32 num_done = 0, num_no_posterior = 0, num_other_error = 0; - for (; !feature_reader.Done(); feature_reader.Next()) { - std::string utt = feature_reader.Key(); - if (!posteriors_reader.HasKey(utt)) { - num_no_posterior++; - } else { - const Matrix &mat = feature_reader.Value(); - Posterior posterior = posteriors_reader.Value(utt); - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == mat.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - std::vector > empty_gselect; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - if (posterior.size() != mat.NumRows()) { - KALDI_WARN << "Alignments has wrong size "<< (posterior.size()) << - " vs. "<< (mat.NumRows()); - num_other_error++; - continue; - } - - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_other_error++; - continue; - } - } // else spk_vars is "empty" - - num_done++; - BaseFloat tot_like_this_file = 0.0, tot_weight = 0.0; - - SgmmGauPost gpost(posterior.size()); // posterior.size() == T. - - SortPosteriorByPdfs(trans_model, &posterior); - int32 prev_pdf_id = -1; - BaseFloat prev_like = 0; - Matrix prev_posterior; - for (size_t i = 0; i < posterior.size(); i++) { - - std::vector this_gselect; - if (!gselect->empty()) this_gselect = (*gselect)[i]; - else am_sgmm.GaussianSelection(sgmm_opts, mat.Row(i), &this_gselect); - am_sgmm.ComputePerFrameVars(mat.Row(i), this_gselect, spk_vars, 0.0, &per_frame_vars); - - gpost[i].gselect = this_gselect; - gpost[i].tids.resize(posterior[i].size()); - gpost[i].posteriors.resize(posterior[i].size()); - - prev_pdf_id = -1; // Only cache for the same frame. - for (size_t j = 0; j < posterior[i].size(); j++) { - int32 tid = posterior[i][j].first, // transition identifier. - pdf_id = trans_model.TransitionIdToPdf(tid); - BaseFloat weight = posterior[i][j].second; - gpost[i].tids[j] = tid; - - if (pdf_id != prev_pdf_id) { - // First time see this pdf-id for this frame, update the cached - // variables. - prev_pdf_id = pdf_id; - prev_like = am_sgmm.ComponentPosteriors(per_frame_vars, pdf_id, - &prev_posterior); - } - - gpost[i].posteriors[j] = prev_posterior; - tot_like_this_file += prev_like * weight; - tot_weight += weight; - gpost[i].posteriors[j].Scale(weight); - } - } - - KALDI_LOG << "Average like for this file is " - << (tot_like_this_file/posterior.size()) << " over " - << posterior.size() <<" frames."; - tot_like += tot_like_this_file; - tot_t += posterior.size(); - if (num_done % 10 == 0) - KALDI_LOG << "Avg like per frame so far is " - << (tot_like/tot_t); - gpost_writer.Write(utt, gpost); - } - } - - KALDI_LOG << "Overall like per frame (Gaussian only) = " - << (tot_like/tot_t) << " over " << tot_t << " frames."; - - KALDI_LOG << "Done " << num_done << " files, " << num_no_posterior - << " with no posteriors, " << num_other_error - << " with other errors."; - - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-rescore-lattice.cc b/src/sgmmbin/sgmm-rescore-lattice.cc deleted file mode 100644 index 6ad50415182..00000000000 --- a/src/sgmmbin/sgmm-rescore-lattice.cc +++ /dev/null @@ -1,165 +0,0 @@ -// sgmmbin/sgmm-rescore-lattice.cc - -// Copyright 2009-2011 Saarland University (Author: Arnab Ghoshal) -// Cisco Systems (Author: Neha Agrawal) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "util/stl-utils.h" -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" -#include "fstext/fstext-lib.h" -#include "lat/kaldi-lattice.h" -#include "lat/lattice-functions.h" -#include "sgmm/decodable-am-sgmm.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - typedef kaldi::int64 int64; - using fst::SymbolTable; - using fst::VectorFst; - using fst::StdArc; - - const char *usage = - "Replace the acoustic scores on a lattice using a new model.\n" - "Usage: sgmm-rescore-lattice [options] " - " \n" - " e.g.: sgmm-rescore-lattice 1.mdl ark:1.lats scp:trn.scp ark:2.lats\n"; - - kaldi::BaseFloat old_acoustic_scale = 0.0; - bool speedup = false; - BaseFloat log_prune = 5.0; - std::string gselect_rspecifier, spkvecs_rspecifier, utt2spk_rspecifier; - SgmmGselectConfig sgmm_opts; - kaldi::ParseOptions po(usage); - po.Register("old-acoustic-scale", &old_acoustic_scale, - "Add the current acoustic scores with some scale."); - po.Register("log-prune", &log_prune, - "Pruning beam used to reduce number of exp() evaluations."); - po.Register("spk-vecs", &spkvecs_rspecifier, "Speaker vectors (rspecifier)"); - po.Register("utt2spk", &utt2spk_rspecifier, - "rspecifier for utterance to speaker map"); - po.Register("gselect", &gselect_rspecifier, - "Precomputed Gaussian indices (rspecifier)"); - po.Register("speedup", &speedup, - "If true, enable a faster version of the computation that " - "saves times when there is only one pdf-id on a single frame " - "by only sometimes (randomly) computing the probabilities, and " - "then scaling them up to preserve corpus-level diagnostics."); - - sgmm_opts.Register(&po); - - po.Read(argc, argv); - - if (po.NumArgs() != 4) { - po.PrintUsage(); - exit(1); - } - - std::string model_filename = po.GetArg(1), - lats_rspecifier = po.GetArg(2), - feature_rspecifier = po.GetArg(3), - lats_wspecifier = po.GetArg(4); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - RandomAccessInt32VectorVectorReader gselect_reader(gselect_rspecifier); - RandomAccessBaseFloatVectorReaderMapped spkvecs_reader(spkvecs_rspecifier, - utt2spk_rspecifier); - RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); - // Read as regular lattice - SequentialCompactLatticeReader compact_lattice_reader(lats_rspecifier); - // Write as compact lattice. - CompactLatticeWriter compact_lattice_writer(lats_wspecifier); - - int32 num_done = 0, num_err = 0; - for (; !compact_lattice_reader.Done(); compact_lattice_reader.Next()) { - std::string utt = compact_lattice_reader.Key(); - if (!feature_reader.HasKey(utt)) { - KALDI_WARN << "No feature found for utterance " << utt << ". Skipping"; - num_err++; - continue; - } - - CompactLattice clat = compact_lattice_reader.Value(); - compact_lattice_reader.FreeCurrent(); - if (old_acoustic_scale != 1.0) - fst::ScaleLattice(fst::AcousticLatticeScale(old_acoustic_scale), &clat); - - const Matrix &feats = feature_reader.Value(utt); - - // Get speaker vectors - SgmmPerSpkDerivedVars spk_vars; - if (spkvecs_reader.IsOpen()) { - if (spkvecs_reader.HasKey(utt)) { - spk_vars.v_s = spkvecs_reader.Value(utt); - am_sgmm.ComputePerSpkDerivedVars(&spk_vars); - } else { - KALDI_WARN << "Cannot find speaker vector for " << utt; - num_err++; - continue; - } - } // else spk_vars is "empty" - - bool have_gselect = !gselect_rspecifier.empty() - && gselect_reader.HasKey(utt) - && gselect_reader.Value(utt).size() == feats.NumRows(); - if (!gselect_rspecifier.empty() && !have_gselect) - KALDI_WARN << "No Gaussian-selection info available for utterance " - << utt << " (or wrong size)"; - std::vector > empty_gselect; - const std::vector > *gselect = - (have_gselect ? &gselect_reader.Value(utt) : &empty_gselect); - - DecodableAmSgmm sgmm_decodable(sgmm_opts, am_sgmm, spk_vars, - trans_model, feats, *gselect, - log_prune); - - if (!speedup) { - if (kaldi::RescoreCompactLattice(&sgmm_decodable, &clat)) { - compact_lattice_writer.Write(utt, clat); - num_done++; - } else num_err++; - } else { - BaseFloat speedup_factor = 100.0; - if (kaldi::RescoreCompactLatticeSpeedup(trans_model, speedup_factor, - &sgmm_decodable, - &clat)) { - compact_lattice_writer.Write(utt, clat); - num_done++; - } else num_err++; - } - } - - KALDI_LOG << "Done " << num_done << " lattices, errors on " - << num_err; - return (num_done != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/sgmmbin/sgmm-sum-accs.cc b/src/sgmmbin/sgmm-sum-accs.cc deleted file mode 100644 index 8562536d9cf..00000000000 --- a/src/sgmmbin/sgmm-sum-accs.cc +++ /dev/null @@ -1,69 +0,0 @@ -// sgmmbin/sgmm-sum-accs.cc - -// Copyright 2009-2011 Saarland University; Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "util/common-utils.h" -#include "sgmm/estimate-am-sgmm.h" -#include "hmm/transition-model.h" - - -int main(int argc, char *argv[]) { - try { - typedef kaldi::int32 int32; - - const char *usage = - "Sum multiple accumulated stats files for SGMM training.\n" - "Usage: sgmm-sum-accs [options] stats-out stats-in1 stats-in2 ...\n"; - - bool binary = true; - kaldi::ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Read(argc, argv); - - if (po.NumArgs() < 2) { - po.PrintUsage(); - exit(1); - } - - std::string stats_out_filename = po.GetArg(1); - kaldi::Vector transition_accs; - kaldi::MleAmSgmmAccs sgmm_accs; - - for (int i = 2, max = po.NumArgs(); i <= max; i++) { - std::string stats_in_filename = po.GetArg(i); - bool binary_read; - kaldi::Input ki(stats_in_filename, &binary_read); - transition_accs.Read(ki.Stream(), binary_read, true /* add values */); - sgmm_accs.Read(ki.Stream(), binary_read, true /* add values */); - } - - // Write out the accs - { - kaldi::Output ko(stats_out_filename, binary); - transition_accs.Write(ko.Stream(), binary); - sgmm_accs.Write(ko.Stream(), binary); - } - - KALDI_LOG << "Written stats to " << stats_out_filename; - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-sum-tree-stats.cc b/src/sgmmbin/sgmm-sum-tree-stats.cc deleted file mode 100644 index a1eae2a0bcf..00000000000 --- a/src/sgmmbin/sgmm-sum-tree-stats.cc +++ /dev/null @@ -1,100 +0,0 @@ -// sgmmbin/sgmm-sum-tree-stats.cc - -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "tree/context-dep.h" -#include "tree/build-tree-utils.h" -#include "sgmm/sgmm-clusterable.h" - - -int main(int argc, char *argv[]) { - using namespace kaldi; - typedef kaldi::int32 int32; - try { - const char *usage = - "Sum SGMM-type statistics used for phonetic decision tree building.\n" - "Usage: sgmm-sum-tree-stats [options] tree-accs-out trea-accs-in1 tree-accs-in2 ...\n" - "e.g.: sgmm-sum-tree-stats treeacc 1.streeacc 2.streeacc 3.streeacc\n"; - - ParseOptions po(usage); - bool binary = true; - - po.Register("binary", &binary, "Write output in binary mode"); - po.Read(argc, argv); - - if (po.NumArgs() < 2) { - po.PrintUsage(); - exit(1); - } - - std::string treeacc_wxfilename = po.GetArg(1); - - std::map tree_stats; - - AmSgmm am_sgmm; // dummy variable needed to initialize stats. - std::vector > H; // also needed to initialize stats, - // but never accessed in this program. - - // typedef std::vector > BuildTreeStatsType; - for (int32 arg = 2; arg <= po.NumArgs(); arg++) { - std::string treeacc_rxfilename = po.GetArg(arg); - bool binary_in; - Input ki(treeacc_rxfilename, &binary_in); - BuildTreeStatsType stats_array; - SgmmClusterable example(am_sgmm, H); // Needed for its type information. - ReadBuildTreeStats(ki.Stream(), binary_in, example, &stats_array); - for (BuildTreeStatsType::iterator iter = stats_array.begin(); - iter != stats_array.end(); ++iter) { - EventType e = iter->first; - Clusterable *c = iter->second; - std::map::iterator map_iter = tree_stats.find(e); - if (map_iter == tree_stats.end()) { // Not already present. - tree_stats[e] = c; - } else { - map_iter->second->Add(*c); - delete c; - } - } - } - - BuildTreeStatsType stats; // all the stats, in vectorized form. - - for (std::map::const_iterator iter = tree_stats.begin(); - iter != tree_stats.end(); - iter++ ) { - stats.push_back(std::make_pair(iter->first, iter->second)); - } - tree_stats.clear(); - - { - Output ko(treeacc_wxfilename, binary); - WriteBuildTreeStats(ko.Stream(), binary, stats); - } - KALDI_LOG << "Wrote summed sgmm-treeaaccs: number of separate objects was " - << stats.size(); - DeleteBuildTreeStats(&stats); - return (stats.size() != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/sgmmbin/sgmm-write-ubm.cc b/src/sgmmbin/sgmm-write-ubm.cc deleted file mode 100644 index 3f994f11a03..00000000000 --- a/src/sgmmbin/sgmm-write-ubm.cc +++ /dev/null @@ -1,71 +0,0 @@ -// sgmmbin/sgmm-write-ubm.cc - -// Copyright 2009-2011 Microsoft Corporation - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#include "base/kaldi-common.h" -#include "util/common-utils.h" - -#include "sgmm/am-sgmm.h" -#include "hmm/transition-model.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - typedef kaldi::int32 int32; - const char *usage = - "Write out the full-covariance UBM of the SGMM\n" - "Usage: sgmm-write-ubm [options] \n" - "e.g.: sgmm-write-ubm 1.mdl 1.ubm\n"; - - bool binary_write = true; - - ParseOptions po(usage); - po.Register("binary", &binary_write, "Write output in binary mode"); - - po.Read(argc, argv); - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string model_in_filename = po.GetArg(1), - ubm_out_filename = po.GetArg(2); - - AmSgmm am_sgmm; - TransitionModel trans_model; - { - bool binary; - Input ki(model_in_filename, &binary); - trans_model.Read(ki.Stream(), binary); - am_sgmm.Read(ki.Stream(), binary); - } - - { - Output ko(ubm_out_filename, binary_write); - am_sgmm.full_ubm().Write(ko.Stream(), binary_write); - } - - KALDI_LOG << "Written UBM to " << ubm_out_filename; - return 0; - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - - diff --git a/src/tree/clusterable-classes.h b/src/tree/clusterable-classes.h index 817d0c65bc3..d19e17f6b68 100644 --- a/src/tree/clusterable-classes.h +++ b/src/tree/clusterable-classes.h @@ -27,10 +27,6 @@ namespace kaldi { -// Note: see sgmm/sgmm-clusterable.h for an SGMM-based clusterable -// class. We didn't include it here, to avoid adding an extra -// dependency to this directory. - /// \addtogroup clustering_group /// @{