diff --git a/egs/reverb/s5/RESULTS b/egs/reverb/s5/RESULTS index 3537852a827..2c72a482914 100644 --- a/egs/reverb/s5/RESULTS +++ b/egs/reverb/s5/RESULTS @@ -1,150 +1,299 @@ -#################### -exp/tri2a/decode_bg_5k_REVERB_*dt* -RealData_dt_for_1ch_far_room1_A 89.13 -RealData_dt_for_1ch_near_room1_A 90.27 -SimData_dt_for_1ch_far_room1_A 22.44 -SimData_dt_for_1ch_far_room2_A 88.44 -SimData_dt_for_1ch_far_room3_A 91.27 -SimData_dt_for_1ch_near_room1_A 12.19 -SimData_dt_for_1ch_near_room2_A 42.74 -SimData_dt_for_1ch_near_room3_A 49.31 -Avg_Real(2) 89.70 -Avg_Sim(6) 51.06 - -exp/tri2a/decode_bg_5k_REVERB_*et* -RealData_et_for_1ch_far_room1_A 88.45 -RealData_et_for_1ch_near_room1_A 88.66 -SimData_et_for_1ch_far_room1_A 22.72 -SimData_et_for_1ch_far_room2_A 81.53 -SimData_et_for_1ch_far_room3_A 89.25 -SimData_et_for_1ch_near_room1_A 14.37 -SimData_et_for_1ch_near_room2_A 40.46 -SimData_et_for_1ch_near_room3_A 51.50 -Avg_Real(2) 88.56 -Avg_Sim(6) 49.97 - -#################### -exp/tri2a_mc/decode_bg_5k_REVERB_*dt* -RealData_dt_for_1ch_far_room1_A 53.38 -RealData_dt_for_1ch_near_room1_A 56.27 -SimData_dt_for_1ch_far_room1_A 16.96 -SimData_dt_for_1ch_far_room2_A 44.15 -SimData_dt_for_1ch_far_room3_A 49.88 -SimData_dt_for_1ch_near_room1_A 15.00 -SimData_dt_for_1ch_near_room2_A 21.81 -SimData_dt_for_1ch_near_room3_A 25.10 -Avg_Real(2) 54.83 -Avg_Sim(6) 28.82 - -exp/tri2a_mc/decode_bg_5k_REVERB_*et* -RealData_et_for_1ch_far_room1_A 52.94 -RealData_et_for_1ch_near_room1_A 55.35 -SimData_et_for_1ch_far_room1_A 18.91 -SimData_et_for_1ch_far_room2_A 37.33 -SimData_et_for_1ch_far_room3_A 46.69 -SimData_et_for_1ch_near_room1_A 17.77 -SimData_et_for_1ch_near_room2_A 21.23 -SimData_et_for_1ch_near_room3_A 26.17 -Avg_Real(2) 54.14 -Avg_Sim(6) 28.02 - -#################### -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*dt* -RealData_dt_for_1ch_far_room1_A 46.27 -RealData_dt_for_1ch_near_room1_A 48.85 -SimData_dt_for_1ch_far_room1_A 15.59 -SimData_dt_for_1ch_far_room2_A 35.86 
-SimData_dt_for_1ch_far_room3_A 39.54 -SimData_dt_for_1ch_near_room1_A 12.78 -SimData_dt_for_1ch_near_room2_A 17.75 -SimData_dt_for_1ch_near_room3_A 20.23 -Avg_Real(2) 47.56 -Avg_Sim(6) 23.62 - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*et* -RealData_et_for_1ch_far_room1_A 48.11 -RealData_et_for_1ch_near_room1_A 48.42 -SimData_et_for_1ch_far_room1_A 16.57 -SimData_et_for_1ch_far_room2_A 31.54 -SimData_et_for_1ch_far_room3_A 39.32 -SimData_et_for_1ch_near_room1_A 14.31 -SimData_et_for_1ch_near_room2_A 18.42 -SimData_et_for_1ch_near_room3_A 21.03 -Avg_Real(2) 48.27 -Avg_Sim(6) 23.53 - -#################### -exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*dt* -RealData_dt_for_1ch_far_room1_A 34.04 -RealData_dt_for_1ch_near_room1_A 33.37 -SimData_dt_for_1ch_far_room1_A 10.57 -SimData_dt_for_1ch_far_room2_A 22.63 -SimData_dt_for_1ch_far_room3_A 25.00 -SimData_dt_for_1ch_near_room1_A 7.57 -SimData_dt_for_1ch_near_room2_A 10.97 -SimData_dt_for_1ch_near_room3_A 12.59 -Avg_Real(2) 33.70 -Avg_Sim(6) 14.89 - -exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*et* -RealData_et_for_1ch_far_room1_A 33.49 -RealData_et_for_1ch_near_room1_A 34.72 -SimData_et_for_1ch_far_room1_A 10.03 -SimData_et_for_1ch_far_room2_A 20.16 -SimData_et_for_1ch_far_room3_A 25.08 -SimData_et_for_1ch_near_room1_A 8.45 -SimData_et_for_1ch_near_room2_A 11.16 -SimData_et_for_1ch_near_room3_A 12.88 -Avg_Real(2) 34.11 -Avg_Sim(6) 14.63 - -#################### -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*dt* -RealData_dt_for_1ch_far_room1_A 31.17 -RealData_dt_for_1ch_near_room1_A 31.82 -SimData_dt_for_1ch_far_room1_A 8.53 -SimData_dt_for_1ch_far_room2_A 17.43 -SimData_dt_for_1ch_far_room3_A 21.04 -SimData_dt_for_1ch_near_room1_A 6.78 -SimData_dt_for_1ch_near_room2_A 8.97 -SimData_dt_for_1ch_near_room3_A 10.01 -Avg_Real(2) 31.50 -Avg_Sim(6) 12.13 - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*et* -RealData_et_for_1ch_far_room1_A 31.20 -RealData_et_for_1ch_near_room1_A 30.98 
-SimData_et_for_1ch_far_room1_A 8.42 -SimData_et_for_1ch_far_room2_A 17.63 -SimData_et_for_1ch_far_room3_A 20.71 -SimData_et_for_1ch_near_room1_A 7.03 -SimData_et_for_1ch_near_room2_A 9.50 -SimData_et_for_1ch_near_room3_A 11.11 -Avg_Real(2) 31.09 -Avg_Sim(6) 12.40 - -#################### -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*dt* -RealData_dt_for_1ch_far_room1_A 30.42 -RealData_dt_for_1ch_near_room1_A 31.50 -SimData_dt_for_1ch_far_room1_A 8.24 -SimData_dt_for_1ch_far_room2_A 17.25 -SimData_dt_for_1ch_far_room3_A 20.72 -SimData_dt_for_1ch_near_room1_A 6.76 -SimData_dt_for_1ch_near_room2_A 8.87 -SimData_dt_for_1ch_near_room3_A 9.92 -Avg_Real(2) 30.96 -Avg_Sim(6) 11.96 - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*et* -RealData_et_for_1ch_far_room1_A 30.89 -RealData_et_for_1ch_near_room1_A 31.01 -SimData_et_for_1ch_far_room1_A 8.20 -SimData_et_for_1ch_far_room2_A 17.34 -SimData_et_for_1ch_far_room3_A 20.56 -SimData_et_for_1ch_near_room1_A 6.91 -SimData_et_for_1ch_near_room2_A 9.50 -SimData_et_for_1ch_near_room3_A 10.93 -Avg_Real(2) 30.95 -Avg_Sim(6) 12.24 +######################################## +GMM RESULTs: +######################################## +No Front-End +######################################## +exp/tri3/decode_dt_real_1ch +%WER 34.18 [ 500 / 1463, 24 ins, 125 del, 351 sub ] exp/tri3/decode_dt_real_1ch/wer_17_1.0_far_room1 +%WER 29.63 [ 475 / 1603, 24 ins, 127 del, 324 sub ] exp/tri3/decode_dt_real_1ch/wer_15_0.5_near_room1 + +exp/tri3/decode_dt_simu_1ch +%WER 6.85 [ 279 / 4071, 38 ins, 40 del, 201 sub ] exp/tri3/decode_dt_simu_1ch/wer_12_1.0_far_room1 +%WER 18.31 [ 743 / 4058, 65 ins, 156 del, 522 sub ] exp/tri3/decode_dt_simu_1ch/wer_14_0.5_far_room2 +%WER 19.78 [ 800 / 4045, 76 ins, 147 del, 577 sub ] exp/tri3/decode_dt_simu_1ch/wer_13_0.0_far_room3 +%WER 5.58 [ 227 / 4071, 33 ins, 34 del, 160 sub ] exp/tri3/decode_dt_simu_1ch/wer_13_1.0_near_room1 +%WER 7.49 [ 304 / 4058, 51 ins, 33 del, 220 sub ] 
exp/tri3/decode_dt_simu_1ch/wer_12_0.0_near_room2 +%WER 7.96 [ 322 / 4045, 32 ins, 64 del, 226 sub ] exp/tri3/decode_dt_simu_1ch/wer_12_1.0_near_room3 + +exp/tri3/decode_et_real_1ch +%WER 33.09 [ 980 / 2962, 103 ins, 157 del, 720 sub ] exp/tri3/decode_et_real_1ch/wer_13_0.0_far_room1 +%WER 33.18 [ 1039 / 3131, 104 ins, 194 del, 741 sub ] exp/tri3/decode_et_real_1ch/wer_16_0.0_near_room1 + +exp/tri3/decode_et_simu_1ch +%WER 7.43 [ 439 / 5907, 72 ins, 48 del, 319 sub ] exp/tri3/decode_et_simu_1ch/wer_16_0.5_far_room1 +%WER 18.34 [ 1142 / 6226, 120 ins, 208 del, 814 sub ] exp/tri3/decode_et_simu_1ch/wer_12_0.5_far_room2 +%WER 21.85 [ 1282 / 5868, 110 ins, 278 del, 894 sub ] exp/tri3/decode_et_simu_1ch/wer_14_0.5_far_room3 +%WER 7.35 [ 434 / 5907, 76 ins, 46 del, 312 sub ] exp/tri3/decode_et_simu_1ch/wer_17_1.0_near_room1 +%WER 9.35 [ 582 / 6226, 86 ins, 69 del, 427 sub ] exp/tri3/decode_et_simu_1ch/wer_14_0.0_near_room2 +%WER 10.24 [ 601 / 5868, 93 ins, 87 del, 421 sub ] exp/tri3/decode_et_simu_1ch/wer_13_0.0_near_room3 + +1ch - WPE +######################################## +exp/tri3/decode_dt_real_1ch_wpe +%WER 33.01 [ 483 / 1463, 41 ins, 85 del, 357 sub ] exp/tri3/decode_dt_real_1ch_wpe/wer_17_0.0_far_room1 +%WER 27.32 [ 438 / 1603, 31 ins, 98 del, 309 sub ] exp/tri3/decode_dt_real_1ch_wpe/wer_16_0.0_near_room1 + +exp/tri3/decode_dt_simu_1ch_wpe +%WER 6.53 [ 266 / 4071, 38 ins, 36 del, 192 sub ] exp/tri3/decode_dt_simu_1ch_wpe/wer_13_1.0_far_room1 +%WER 17.62 [ 715 / 4058, 40 ins, 186 del, 489 sub ] exp/tri3/decode_dt_simu_1ch_wpe/wer_15_1.0_far_room2 +%WER 19.04 [ 770 / 4045, 70 ins, 146 del, 554 sub ] exp/tri3/decode_dt_simu_1ch_wpe/wer_15_0.0_far_room3 +%WER 5.50 [ 224 / 4071, 31 ins, 33 del, 160 sub ] exp/tri3/decode_dt_simu_1ch_wpe/wer_14_1.0_near_room1 +%WER 7.76 [ 315 / 4058, 60 ins, 36 del, 219 sub ] exp/tri3/decode_dt_simu_1ch_wpe/wer_11_0.5_near_room2 +%WER 7.89 [ 319 / 4045, 30 ins, 64 del, 225 sub ] exp/tri3/decode_dt_simu_1ch_wpe/wer_14_1.0_near_room3 + 
+exp/tri3/decode_et_real_1ch_wpe +%WER 30.08 [ 891 / 2962, 89 ins, 164 del, 638 sub ] exp/tri3/decode_et_real_1ch_wpe/wer_17_0.0_far_room1 +%WER 30.57 [ 957 / 3131, 105 ins, 162 del, 690 sub ] exp/tri3/decode_et_real_1ch_wpe/wer_17_0.0_near_room1 + +exp/tri3/decode_et_simu_1ch_wpe +%WER 6.97 [ 412 / 5907, 71 ins, 52 del, 289 sub ] exp/tri3/decode_et_simu_1ch_wpe/wer_15_1.0_far_room1 +%WER 16.59 [ 1033 / 6226, 91 ins, 217 del, 725 sub ] exp/tri3/decode_et_simu_1ch_wpe/wer_13_1.0_far_room2 +%WER 20.60 [ 1209 / 5868, 92 ins, 285 del, 832 sub ] exp/tri3/decode_et_simu_1ch_wpe/wer_16_0.5_far_room3 +%WER 7.48 [ 442 / 5907, 93 ins, 41 del, 308 sub ] exp/tri3/decode_et_simu_1ch_wpe/wer_15_1.0_near_room1 +%WER 8.77 [ 546 / 6226, 76 ins, 59 del, 411 sub ] exp/tri3/decode_et_simu_1ch_wpe/wer_14_0.0_near_room2 +%WER 9.20 [ 540 / 5868, 63 ins, 113 del, 364 sub ] exp/tri3/decode_et_simu_1ch_wpe/wer_15_1.0_near_room3 + +2ch - WPE+BeamformIt +######################################## +exp/tri3/decode_dt_real_2ch_beamformit +%WER 29.67 [ 434 / 1463, 45 ins, 70 del, 319 sub ] exp/tri3/decode_dt_real_2ch_beamformit/wer_17_0.5_far_room1 +%WER 24.08 [ 386 / 1603, 38 ins, 87 del, 261 sub ] exp/tri3/decode_dt_real_2ch_beamformit/wer_13_1.0_near_room1 + +exp/tri3/decode_dt_simu_2ch_beamformit +%WER 6.76 [ 275 / 4071, 60 ins, 43 del, 172 sub ] exp/tri3/decode_dt_simu_2ch_beamformit/wer_16_0.5_far_room1 +%WER 11.93 [ 484 / 4058, 68 ins, 67 del, 349 sub ] exp/tri3/decode_dt_simu_2ch_beamformit/wer_14_0.0_far_room2 +%WER 14.36 [ 581 / 4045, 77 ins, 105 del, 399 sub ] exp/tri3/decode_dt_simu_2ch_beamformit/wer_13_0.5_far_room3 +%WER 6.24 [ 254 / 4071, 41 ins, 40 del, 173 sub ] exp/tri3/decode_dt_simu_2ch_beamformit/wer_16_1.0_near_room1 +%WER 7.00 [ 284 / 4058, 54 ins, 33 del, 197 sub ] exp/tri3/decode_dt_simu_2ch_beamformit/wer_14_0.5_near_room2 +%WER 7.17 [ 290 / 4045, 44 ins, 50 del, 196 sub ] exp/tri3/decode_dt_simu_2ch_beamformit/wer_15_1.0_near_room3 + 
+exp/tri3/decode_et_real_2ch_beamformit +%WER 23.94 [ 709 / 2962, 92 ins, 108 del, 509 sub ] exp/tri3/decode_et_real_2ch_beamformit/wer_16_0.0_far_room1 +%WER 23.09 [ 723 / 3131, 78 ins, 144 del, 501 sub ] exp/tri3/decode_et_real_2ch_beamformit/wer_16_1.0_near_room1 + +exp/tri3/decode_et_simu_2ch_beamformit +%WER 7.18 [ 424 / 5907, 74 ins, 47 del, 303 sub ] exp/tri3/decode_et_simu_2ch_beamformit/wer_15_1.0_far_room1 +%WER 12.14 [ 756 / 6226, 92 ins, 122 del, 542 sub ] exp/tri3/decode_et_simu_2ch_beamformit/wer_11_1.0_far_room2 +%WER 15.20 [ 892 / 5868, 123 ins, 161 del, 608 sub ] exp/tri3/decode_et_simu_2ch_beamformit/wer_14_0.0_far_room3 +%WER 7.62 [ 450 / 5907, 87 ins, 51 del, 312 sub ] exp/tri3/decode_et_simu_2ch_beamformit/wer_17_1.0_near_room1 +%WER 7.53 [ 469 / 6226, 52 ins, 69 del, 348 sub ] exp/tri3/decode_et_simu_2ch_beamformit/wer_17_1.0_near_room2 +%WER 8.08 [ 474 / 5868, 62 ins, 87 del, 325 sub ] exp/tri3/decode_et_simu_2ch_beamformit/wer_15_1.0_near_room3 + +8ch - WPE+BeamformIt +######################################## +exp/tri3/decode_dt_real_8ch_beamformit +%WER 20.92 [ 306 / 1463, 44 ins, 43 del, 219 sub ] exp/tri3/decode_dt_real_8ch_beamformit/wer_13_1.0_far_room1 +%WER 17.53 [ 281 / 1603, 29 ins, 46 del, 206 sub ] exp/tri3/decode_dt_real_8ch_beamformit/wer_16_1.0_near_room1 + +exp/tri3/decode_dt_simu_8ch_beamformit +%WER 6.07 [ 247 / 4071, 39 ins, 40 del, 168 sub ] exp/tri3/decode_dt_simu_8ch_beamformit/wer_16_1.0_far_room1 +%WER 6.68 [ 271 / 4058, 45 ins, 44 del, 182 sub ] exp/tri3/decode_dt_simu_8ch_beamformit/wer_15_1.0_far_room2 +%WER 5.91 [ 239 / 4045, 35 ins, 39 del, 165 sub ] exp/tri3/decode_dt_simu_8ch_beamformit/wer_14_1.0_far_room3 +%WER 6.76 [ 275 / 4071, 56 ins, 39 del, 180 sub ] exp/tri3/decode_dt_simu_8ch_beamformit/wer_15_1.0_near_room1 +%WER 6.83 [ 277 / 4058, 81 ins, 31 del, 165 sub ] exp/tri3/decode_dt_simu_8ch_beamformit/wer_14_1.0_near_room2 +%WER 5.91 [ 239 / 4045, 43 ins, 36 del, 160 sub ] 
exp/tri3/decode_dt_simu_8ch_beamformit/wer_17_1.0_near_room3 + +exp/tri3/decode_et_real_8ch_beamformit +%WER 15.87 [ 470 / 2962, 66 ins, 81 del, 323 sub ] exp/tri3/decode_et_real_8ch_beamformit/wer_15_1.0_far_room1 +%WER 15.08 [ 472 / 3131, 81 ins, 69 del, 322 sub ] exp/tri3/decode_et_real_8ch_beamformit/wer_16_1.0_near_room1 + +exp/tri3/decode_et_simu_8ch_beamformit +%WER 7.03 [ 415 / 5907, 66 ins, 47 del, 302 sub ] exp/tri3/decode_et_simu_8ch_beamformit/wer_15_1.0_far_room1 +%WER 7.31 [ 455 / 6226, 67 ins, 62 del, 326 sub ] exp/tri3/decode_et_simu_8ch_beamformit/wer_16_0.5_far_room2 +%WER 7.29 [ 428 / 5868, 71 ins, 63 del, 294 sub ] exp/tri3/decode_et_simu_8ch_beamformit/wer_14_1.0_far_room3 +%WER 7.43 [ 439 / 5907, 80 ins, 47 del, 312 sub ] exp/tri3/decode_et_simu_8ch_beamformit/wer_17_1.0_near_room1 +%WER 7.00 [ 436 / 6226, 75 ins, 64 del, 297 sub ] exp/tri3/decode_et_simu_8ch_beamformit/wer_17_1.0_near_room2 +%WER 6.99 [ 410 / 5868, 62 ins, 62 del, 286 sub ] exp/tri3/decode_et_simu_8ch_beamformit/wer_16_1.0_near_room3 + +######################################## +TDNN RESULTs: +######################################## + +exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt* +######################################## + +No Front-End +######################################## +%WER 20.51 [ 300 / 1463, 20 ins, 80 del, 200 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_1ch/wer_9_1.0_far_room1 +%WER 17.90 [ 287 / 1603, 13 ins, 85 del, 189 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_1ch/wer_11_0.5_near_room1 +%WER 3.24 [ 132 / 4071, 16 ins, 29 del, 87 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch/wer_8_1.0_far_room1 +%WER 7.20 [ 292 / 4058, 30 ins, 56 del, 206 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch/wer_8_0.5_far_room2 +%WER 6.67 [ 270 / 4045, 21 ins, 56 del, 193 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch/wer_9_0.5_far_room3 +%WER 2.85 [ 116 / 4071, 17 ins, 
16 del, 83 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch/wer_7_0.0_near_room1 +%WER 3.52 [ 143 / 4058, 18 ins, 22 del, 103 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch/wer_8_0.5_near_room2 +%WER 4.23 [ 171 / 4045, 22 ins, 29 del, 120 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch/wer_9_0.0_near_room3 + +1ch - WPE +######################################## +%WER 18.66 [ 273 / 1463, 17 ins, 72 del, 184 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_1ch_wpe/wer_10_1.0_far_room1 +%WER 15.41 [ 247 / 1603, 17 ins, 68 del, 162 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_1ch_wpe/wer_12_0.0_near_room1 +%WER 3.14 [ 128 / 4071, 20 ins, 19 del, 89 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch_wpe/wer_8_0.0_far_room1 +%WER 6.73 [ 273 / 4058, 34 ins, 46 del, 193 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch_wpe/wer_8_0.5_far_room2 +%WER 6.33 [ 256 / 4045, 23 ins, 52 del, 181 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch_wpe/wer_10_0.5_far_room3 +%WER 2.60 [ 106 / 4071, 16 ins, 15 del, 75 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch_wpe/wer_7_0.0_near_room1 +%WER 3.18 [ 129 / 4058, 13 ins, 23 del, 93 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch_wpe/wer_8_1.0_near_room2 +%WER 3.98 [ 161 / 4045, 21 ins, 27 del, 113 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_1ch_wpe/wer_9_0.0_near_room3 + +2ch - WPE+BeamformIt +######################################## +%WER 14.90 [ 218 / 1463, 15 ins, 58 del, 145 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_2ch_beamformit/wer_10_1.0_far_room1 +%WER 12.23 [ 196 / 1603, 13 ins, 41 del, 142 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_2ch_beamformit/wer_12_0.0_near_room1 +%WER 3.24 [ 132 / 4071, 24 ins, 18 del, 90 sub ] 
exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_2ch_beamformit/wer_7_0.0_far_room1 +%WER 4.21 [ 171 / 4058, 17 ins, 33 del, 121 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_2ch_beamformit/wer_9_0.5_far_room2 +%WER 4.65 [ 188 / 4045, 20 ins, 33 del, 135 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_2ch_beamformit/wer_9_0.5_far_room3 +%WER 2.65 [ 108 / 4071, 11 ins, 23 del, 74 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_2ch_beamformit/wer_8_0.5_near_room1 +%WER 2.98 [ 121 / 4058, 7 ins, 26 del, 88 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_2ch_beamformit/wer_8_1.0_near_room2 +%WER 3.44 [ 139 / 4045, 25 ins, 21 del, 93 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_2ch_beamformit/wer_8_0.0_near_room3 + +8ch - WPE+BeamformIt +######################################## +%WER 11.07 [ 162 / 1463, 17 ins, 38 del, 107 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_8ch_beamformit/wer_10_0.5_far_room1 +%WER 9.86 [ 158 / 1603, 12 ins, 46 del, 100 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_real_8ch_beamformit/wer_12_1.0_near_room1 +%WER 3.05 [ 124 / 4071, 17 ins, 22 del, 85 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_8ch_beamformit/wer_8_0.0_far_room1 +%WER 3.01 [ 122 / 4058, 12 ins, 23 del, 87 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_8ch_beamformit/wer_10_0.5_far_room2 +%WER 3.19 [ 129 / 4045, 19 ins, 21 del, 89 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_8ch_beamformit/wer_7_1.0_far_room3 +%WER 2.65 [ 108 / 4071, 15 ins, 20 del, 73 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_8ch_beamformit/wer_7_0.5_near_room1 +%WER 2.51 [ 102 / 4058, 9 ins, 21 del, 72 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_8ch_beamformit/wer_8_1.0_near_room2 +%WER 2.79 [ 113 / 4045, 17 ins, 21 del, 75 sub ] 
exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt_simu_8ch_beamformit/wer_8_0.5_near_room3 + +exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et* +######################################## + +No Front-End +######################################## +%WER 20.90 [ 619 / 2962, 36 ins, 147 del, 436 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_1ch/wer_11_1.0_far_room1 +%WER 18.65 [ 584 / 3131, 45 ins, 136 del, 403 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_1ch/wer_11_0.5_near_room1 +%WER 3.79 [ 224 / 5907, 20 ins, 49 del, 155 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch/wer_11_1.0_far_room1 +%WER 7.68 [ 478 / 6226, 60 ins, 94 del, 324 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch/wer_10_0.0_far_room2 +%WER 7.40 [ 434 / 5868, 46 ins, 93 del, 295 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch/wer_9_0.5_far_room3 +%WER 3.28 [ 194 / 5907, 36 ins, 29 del, 129 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch/wer_9_0.0_near_room1 +%WER 4.63 [ 288 / 6226, 33 ins, 57 del, 198 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch/wer_8_1.0_near_room2 +%WER 4.75 [ 279 / 5868, 26 ins, 60 del, 193 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch/wer_10_1.0_near_room3 + +1ch - WPE +######################################## +%WER 17.69 [ 524 / 2962, 39 ins, 100 del, 385 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_1ch_wpe/wer_13_0.0_far_room1 +%WER 16.00 [ 501 / 3131, 39 ins, 115 del, 347 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_1ch_wpe/wer_11_0.5_near_room1 +%WER 3.67 [ 217 / 5907, 31 ins, 34 del, 152 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch_wpe/wer_8_0.5_far_room1 +%WER 7.15 [ 445 / 6226, 39 ins, 91 del, 315 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch_wpe/wer_10_0.5_far_room2 +%WER 7.11 [ 417 / 5868, 39 ins, 100 del, 278 sub ] 
exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch_wpe/wer_9_1.0_far_room3 +%WER 3.03 [ 179 / 5907, 37 ins, 24 del, 118 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch_wpe/wer_8_0.0_near_room1 +%WER 4.74 [ 295 / 6226, 34 ins, 57 del, 204 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch_wpe/wer_8_1.0_near_room2 +%WER 4.31 [ 253 / 5868, 27 ins, 51 del, 175 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_1ch_wpe/wer_9_1.0_near_room3 + +2ch - WPE+BeamformIt +######################################## +%WER 14.35 [ 425 / 2962, 32 ins, 90 del, 303 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_2ch_beamformit/wer_11_1.0_far_room1 +%WER 12.17 [ 381 / 3131, 44 ins, 76 del, 261 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_2ch_beamformit/wer_10_0.5_near_room1 +%WER 3.23 [ 191 / 5907, 18 ins, 40 del, 133 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_2ch_beamformit/wer_10_1.0_far_room1 +%WER 5.35 [ 333 / 6226, 31 ins, 75 del, 227 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_2ch_beamformit/wer_10_1.0_far_room2 +%WER 5.81 [ 341 / 5868, 43 ins, 57 del, 241 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_2ch_beamformit/wer_10_0.5_far_room3 +%WER 3.15 [ 186 / 5907, 24 ins, 33 del, 129 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_2ch_beamformit/wer_8_1.0_near_room1 +%WER 4.42 [ 275 / 6226, 28 ins, 57 del, 190 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_2ch_beamformit/wer_8_1.0_near_room2 +%WER 4.12 [ 242 / 5868, 21 ins, 43 del, 178 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_2ch_beamformit/wer_10_1.0_near_room3 + +8ch - WPE+BeamformIt +######################################## +%WER 11.01 [ 326 / 2962, 30 ins, 58 del, 238 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_8ch_beamformit/wer_11_1.0_far_room1 +%WER 9.49 [ 297 / 3131, 27 ins, 78 del, 192 sub ] 
exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_real_8ch_beamformit/wer_12_1.0_near_room1 +%WER 3.50 [ 207 / 5907, 29 ins, 33 del, 145 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_8ch_beamformit/wer_8_1.0_far_room1 +%WER 4.42 [ 275 / 6226, 32 ins, 61 del, 182 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_8ch_beamformit/wer_9_1.0_far_room2 +%WER 3.83 [ 225 / 5868, 34 ins, 37 del, 154 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_8ch_beamformit/wer_9_0.5_far_room3 +%WER 3.15 [ 186 / 5907, 26 ins, 31 del, 129 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_8ch_beamformit/wer_8_1.0_near_room1 +%WER 4.00 [ 249 / 6226, 27 ins, 57 del, 165 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_8ch_beamformit/wer_9_1.0_near_room2 +%WER 3.54 [ 208 / 5868, 16 ins, 41 del, 151 sub ] exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et_simu_8ch_beamformit/wer_10_1.0_near_room3 +######################################## + +######################################## +SE Scores - 8ch - WPE+BeamformIt +######################################## + +Data type : SimData +######################################## + +============================================== + Cepstral distance in dB +---------------------------------------------- + mean median +---------------------------------------------- + org enh org enh +---------------------------------------------- + dt_far_room1 2.65 1.97 2.36 1.74 + dt_far_room2 5.08 4.66 4.94 4.30 + dt_far_room3 4.82 4.03 4.60 3.63 + dt_near_room1 1.96 1.67 1.67 1.37 + dt_near_room2 4.58 4.33 4.30 3.88 + dt_near_room3 4.20 3.71 3.91 3.26 +---------------------------------------------- + average 3.88 3.39 3.63 3.03 +============================================== + + +============================================== + SRMR (only mean used) +---------------------------------------------- + mean median +---------------------------------------------- + org enh org enh 
+---------------------------------------------- + dt_far_room1 4.63 4.91 - - + dt_far_room2 2.94 5.13 - - + dt_far_room3 2.76 4.87 - - + dt_near_room1 4.37 4.62 - - + dt_near_room2 3.67 4.39 - - + dt_near_room3 3.66 4.54 - - +---------------------------------------------- + average 3.67 4.74 - - +============================================== + + +============================================== + Log likelihood ratio +---------------------------------------------- + mean median +---------------------------------------------- + org enh org enh +---------------------------------------------- + dt_far_room1 0.38 0.33 0.35 0.30 + dt_far_room2 0.77 0.56 0.64 0.43 + dt_far_room3 0.85 0.52 0.77 0.45 + dt_near_room1 0.34 0.34 0.33 0.32 + dt_near_room2 0.51 0.50 0.43 0.33 + dt_near_room3 0.65 0.50 0.59 0.43 +---------------------------------------------- + average 0.58 0.46 0.52 0.38 +============================================== + + +============================================== + Frequency-weighted segmental SNR in dB +---------------------------------------------- + mean median +---------------------------------------------- + org enh org enh +---------------------------------------------- + dt_far_room1 6.75 8.99 8.93 11.06 + dt_far_room2 0.53 3.84 0.37 5.91 + dt_far_room3 0.14 3.76 0.39 6.57 + dt_near_room1 8.10 9.50 10.47 11.32 + dt_near_room2 3.07 5.10 4.58 8.12 + dt_near_room3 2.32 4.54 4.41 8.15 +---------------------------------------------- + average 3.48 5.96 4.86 8.52 +============================================== + +Data type : RealData +######################################## + +============================== + SRMR +------------------------------ + org enh +------------------------------ + dt_far_room1 3.51 6.03 + dt_near_room1 4.05 6.68 +------------------------------ + average 3.78 6.36 +============================== + diff --git a/egs/reverb/s5/conf/decode_dnn.config
b/egs/reverb/s5/conf/decode_dnn.config deleted file mode 100644 index bfaae86702e..00000000000 --- a/egs/reverb/s5/conf/decode_dnn.config +++ /dev/null @@ -1,2 +0,0 @@ -beam=18.0 # beam for decoding. Was 13.0 in the scripts. -latbeam=10.0 # this has most effect on size of the lattices. diff --git a/egs/reverb/s5/conf/fbank.conf b/egs/reverb/s5/conf/fbank.conf deleted file mode 100644 index c4b73674cab..00000000000 --- a/egs/reverb/s5/conf/fbank.conf +++ /dev/null @@ -1,2 +0,0 @@ -# No non-default options for now. - diff --git a/egs/reverb/s5/conf/mfcc_hires.conf b/egs/reverb/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/reverb/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/reverb/s5/conf/online_cmvn.conf b/egs/reverb/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/reverb/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/reverb/s5/conf/reverb_beamformit.cfg b/egs/reverb/s5/conf/reverb_beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/reverb/s5/conf/reverb_beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process all the file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in
case +#show_id = Ttmp + diff --git a/egs/reverb/s5/local/Generate_mcTrainData_cut.m b/egs/reverb/s5/local/Generate_mcTrainData_cut.m index cc01ff89b7d..831ff6a5226 100755 --- a/egs/reverb/s5/local/Generate_mcTrainData_cut.m +++ b/egs/reverb/s5/local/Generate_mcTrainData_cut.m @@ -1,13 +1,13 @@ function Generate_mcTrainData_cut(WSJ_dir_name, save_dir) % % Input variables: -% WSJ_dir_name: string name of user's clean wsjcam0 corpus directory -% (*Directory structure for wsjcam0 corpushas to be kept as it is after obtaining it from LDC. +% WSJ_dir_name: string name of WAV file directory converted from original wsjcam0 SPHERE files +% (*Directory structure for wsjcam0 corpus has to be kept as it is after obtaining it from LDC. % Otherwise this script does not work.) % % This function generates multi-condition traiing data % based on the following items: -% 1. wsjcam0 corpus (distributed from the LDC) +% 1. wsjcam0 corpus (WAV files) % 2. room impulse responses (ones under ./RIR/) % 3. noise (ones under ./NOISE/). % Generated data has the same directory structure as original wsjcam0 corpus. 
@@ -26,8 +26,6 @@ function Generate_mcTrainData_cut(WSJ_dir_name, save_dir) display(['Name of directory for original wsjcam0: ',WSJ_dir_name]) display(['Name of directory to save generated multi-condition training data: ',save_dir]) -unix(['chmod u+x sphere_to_wave.csh']); -unix(['chmod u+x bin/*']); % Parameters related to acoustic conditions SNRdB=20; @@ -89,7 +87,6 @@ function Generate_mcTrainData_cut(WSJ_dir_name, save_dir) save_dir_tr=[save_dir,'/data/mc_train/']; end mkdir([save_dir_tr]); -%mkdir([save_dir,'/taskfiles/']) mic_idx=['A';'B';'C';'D';'E';'F';'G';'H']; prev_fname='dummy'; @@ -114,13 +111,12 @@ function Generate_mcTrainData_cut(WSJ_dir_name, save_dir) end prev_fname=fname(1:idx1(end)); - % load (sphere format) speech signal - x=read_sphere([WSJ_dir_name,'/data/', fname]); - x=x/(2^15); % conversion from short-int to float + % load speech signal + x=audioread([WSJ_dir_name, '/data/', fname, '.wav'])'; % load RIR and noise for "THIS" utterance - eval(['RIR=wavread(RIR_sim',num2str(rcount),');']); - eval(['NOISE=wavread([noise_sim',num2str(ceil(rcount/4)),',''_',num2str(ncount),'.wav'']);']); + eval(['RIR=audioread(RIR_sim',num2str(rcount),');']); + eval(['NOISE=audioread([noise_sim',num2str(ceil(rcount/4)),',''_',num2str(ncount),'.wav'']);']); % Generate 8ch noisy reverberant data y=gen_obs(x,RIR,NOISE,SNRdB); @@ -138,8 +134,9 @@ function Generate_mcTrainData_cut(WSJ_dir_name, save_dir) y=y/4; % common normalization to all the data to prevent clipping % denominator was decided experimentally - for ch=1:8 - eval(['wavwrite(y(:,',num2str(ch),'),16000,''',save_dir_tr fname,'_ch',num2str(ch),'.wav'');']); + for ch=1:8 + outfilename = [save_dir_tr, fname, '_ch', num2str(ch), '.wav']; + eval(['audiowrite(outfilename, y(:,',num2str(ch),'), 16000);']); end display(['sentence ',num2str(fcount),' (out of 7861) finished! 
(Multi-condition training data)']) diff --git a/egs/reverb/s5/local/REVERB_create_mcdata.sh b/egs/reverb/s5/local/REVERB_create_mcdata.sh deleted file mode 100755 index 4cc776aa159..00000000000 --- a/egs/reverb/s5/local/REVERB_create_mcdata.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# Copyright 2013 MERL (author: Shinji Watanabe) -# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "e.g.,:" - echo " `basename $0` /archive/speech-db/processed/public/REVERB/wsjcam0 data_mc_tr" - exit 1; -fi - -wsjcam0_dir=$1 -reverb_tr_dir=$2 - -dir=`pwd`/data/local/reverb_tools -mkdir -p $dir $reverb_tr_dir -lmdir=`pwd`/data/local/nist_lm - -# Download tools -URL1="http://reverb2014.dereverberation.com/tools/reverb_tools_for_Generate_mcTrainData.tgz" -URL2="http://reverb2014.dereverberation.com/tools/REVERB_TOOLS_FOR_ASR_ver2.0.tgz" -for f in $URL1 $URL2; do - x=`basename $f` - if [ ! -e $dir/$x ]; then - wget $f -O $dir/$x || exit 1; - tar zxvf $dir/$x -C $dir || exit 1; - fi -done -URL3="http://reverb2014.dereverberation.com/tools/taskFiles_et.tgz" -x=`basename $URL3` -if [ ! 
-e $dir/$x ]; then - wget $URL3 -O $dir/$x || exit 1; - tar zxvf $dir/$x -C $dir || exit 1; - cp -fr $dir/`basename $x .tgz`/* $dir/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/ -fi - -# Download and install nist tools -pushd $dir/ReleasePackage/reverb_tools_for_asr_ver2.0 -perl -ape "s|^main$|targetSPHEREDir\=tools/SPHERE\ninstall_nist|;" installTools > installnist -chmod u+x installnist -./installnist -popd - -# Make mcTrainData -cp local/Generate_mcTrainData_cut.m $dir/reverb_tools_for_Generate_mcTrainData/ -pushd $dir/reverb_tools_for_Generate_mcTrainData/ -# copied nist tools required for the following matlab command -cp $dir/ReleasePackage/reverb_tools_for_asr_ver2.0/tools/SPHERE/nist/bin/{h_strip,w_decode} ./bin/ - -tmpdir=`mktemp -d tempXXXXX ` -tmpmfile=$tmpdir/run_mat.m -cat < $tmpmfile -addpath(genpath('.')) -Generate_mcTrainData_cut('$wsjcam0_dir', '$reverb_tr_dir'); -EOF -cat $tmpmfile | matlab -nodisplay -rm -rf $tmpdir -popd - -echo "Successfully generated multi-condition training data and stored it in $reverb_tr_dir." && exit 0; diff --git a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh deleted file mode 100755 index a4599f97702..00000000000 --- a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh +++ /dev/null @@ -1,165 +0,0 @@ -#!/bin/bash - -# Copyright 2013 MERL (author: Felix Weninger) -# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# for REVERB challenge: - -dir=`pwd`/data/local/data -lmdir=`pwd`/data/local/nist_lm -mkdir -p $dir $lmdir -local=`pwd`/local -utils=`pwd`/utils -root=`pwd` - -. ./path.sh # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi - -cd $dir - -MIC=primary - -# input corpus (original or processed, tr or dt, etc.) -RWSJ=$1 -if [ ! -d "$RWSJ" ]; then - echo Could not find directory $RWSJ! Check pathnames in corpus.sh! - exit 1 -fi - -mcwsjav_mlf=$RWSJ/mlf/WSJ.mlf -if [ ! -z "$4" ]; then - mcwsjav_mlf=$4 -fi - -# the name of the dataset to be created -dataset=REVERB_Real_dt - -# the WSJCAM0 set that the set is based on (tr, dt, ...) -# this will be used to find the correct transcriptions etc. -dt_or_x=dt - -if [ ! -z "$2" ]; then - dataset=$2 -fi -# dt or et -if [ ! -z "$3" ]; then - dt_or_x=$3 -fi - -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions -# for the REVERB Challenge - -taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch -#taskFiles=`ls $taskFileDir/*Data_dt_for_*` -taskFiles=`ls $taskFileDir/RealData_${dt_or_x}_for_1ch_{far,near}*` - -dir2=$dir/$dataset -mkdir -p $dir2 - -for taskFile in $taskFiles; do - -set=`basename $taskFile` - - -echo $mcwsjav_mlf - -# MLF transcription correction -# taken from HTK baseline script -sed -e ' -# dos to unix line feed conversion -s/\x0D$//' \ --e " - s/\x60//g # remove unicode character grave accent. - " \ --e " - # fix the single quote for the word yield - # and the quoted ROOTS - # e.g. 
yield' --> yield - # reason: YIELD' is not in dict, while YIELD is - s/YIELD'/YIELD/g - s/'ROOTS'/ROOTS/g - s/'WHERE/WHERE/g - s/PEOPLE'/PEOPLE/g - s/SIT'/SIT/g - s/'DOMINEE/DOMINEE/g - s/CHURCH'/CHURCH/g" \ --e ' - # fix the single missing double full stop issue at the end of an utterance - # e.g. I. C. N should be I. C. N. - # reason: N is not in dict, while N. is - /^[A-Z]$/ { - # append a line - N - # search for single dot on the second line - /\n\./ { - # found it - now replace the - s/\([A-Z]\)\n\./\1\.\n\./ - } - }' \ -$mcwsjav_mlf |\ -perl $local/mlf2text.pl > $dir2/$set.txt1 - -#exit - -#taskFile=$taskFileDir/$set -# contains pointer to wav files with relative path --> add absolute path -echo taskFile = $taskFile -awk '{print "'$RWSJ'"$1}' < $taskFile > $dir2/${set}.flist || exit 1; - -# this is like flist2scp.pl but it can take wav file list as input -(perl -e 'while(<>){ - m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_"; - $id = lc $1; - print "$id $_"; -}' < $dir2/$set.flist || exit 1) | sort > $dir2/${set}_wav.scp - - -# Make the utt2spk and spk2utt files. 
-cat $dir2/${set}_wav.scp | awk '{print $1, $1}' > $dir2/$set.utt2spk || exit 1; -cat $dir2/$set.utt2spk | $utils/utt2spk_to_spk2utt.pl > $dir2/$set.spk2utt || exit 1; - -awk '{print $1}' < $dir2/$set.utt2spk |\ -$local/find_transcripts_txt.pl $dir2/$set.txt1 | sort | uniq > $dir2/$set.txt -#rm $dir2/$set.txt1 - -# Create directory structure required by decoding scripts - -cd $root -mkdir -p data/$dataset/$set -cp $dir2/${set}_wav.scp data/$dataset/$set/wav.scp || exit 1; -cp $dir2/$set.txt data/$dataset/$set/text || exit 1; -cp $dir2/$set.spk2utt data/$dataset/$set/spk2utt || exit 1; -cp $dir2/$set.utt2spk data/$dataset/$set/utt2spk || exit 1; - -echo "Data preparation for $set succeeded" -#echo "Put files into $dir2/$set.*" - - -mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ - data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; -steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; - -done diff --git a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh deleted file mode 100755 index 6ab2f2f4b73..00000000000 --- a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# Copyright 2013 MERL (author: Felix Weninger) -# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -dir=$PWD/data/local/data -lmdir=$PWD/data/local/nist_lm -mkdir -p $dir $lmdir -local=$PWD/local -utils=$PWD/utils -root=$PWD - -. ./path.sh # Needed for KALDI_ROOT -export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -if [ ! -x $sph2pipe ]; then - echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; - exit 1; -fi - -RWSJ=$1 # input corpus (original or processed, tr or dt, etc.) -dataset=REVERB_dt # the name of the dataset to be created -if [ ! -z "$2" ]; then - dataset=$2 -fi -dt_or_x=dt # the WSJCAM0 set that the set is based on (tr, dt, ...) -# this will be used to find the correct transcriptions etc. -if [ ! -z "$3" ]; then - dt_or_x=$3 -fi - -if [ ! -d "$RWSJ" ]; then - echo Could not find directory $RWSJ! Check pathnames in corpus.sh! 
- exit 1 -fi - -cd $dir -MIC=primary - -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions -# for the REVERB Challenge -taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch -#taskFiles=`ls $taskFileDir/*Data_dt_for_*` -nch=1 -if [ "$dt_or_x" = "tr" ]; then - taskFiles=`ls $taskFileDir/SimData_tr_for_${nch}ch*` || exit 1 -else - taskFiles=`ls $taskFileDir/SimData_${dt_or_x}_for_${nch}ch_{far,near}*` || exit 1 -fi -for taskFile in $taskFiles; do - -set=`basename $taskFile` - -#taskFile=$taskFileDir/$set -dir2=$dir/$dataset -mkdir -p $dir2 -# contains pointer to wav files with relative path --> add absolute path -echo taskFile = $taskFile -awk '{print "'$RWSJ/data'"$1}' < $taskFile > $dir2/${set}.flist || exit 1; - -# this is like flist2scp.pl but it can take wav file list as input -perl -e 'while(<>){ - m:^\S+/(\w{8})\w*\.wav$: || die "Bad line $_"; - $id = lc $1; - print "$id $_"; -}' < $dir2/$set.flist | sort > $dir2/${set}_wav.scp || exit 1; - -# find transcriptions of given utterances in si_dt.dot -# create a trans1 file for each set, convert to txt (kaldi "MLF") -dot=$dir/si_${dt_or_x}.dot -perl -e 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, "\n"; } }' $taskFile |\ -perl $local/find_transcripts_singledot.pl $dot \ -> $dir2/$set.trans1 || exit 1; - -noiseword=""; -cat $dir2/$set.trans1 | $local/normalize_transcript.pl $noiseword | sort | uniq > $dir2/$set.txt || exit 1; -#exit - - -# Make the utt2spk and spk2utt files. 
-cat $dir2/${set}_wav.scp | awk '{print $1, $1}' > $dir2/$set.utt2spk || exit 1; -cat $dir2/$set.utt2spk | $utils/utt2spk_to_spk2utt.pl > $dir2/$set.spk2utt || exit 1; - -# Create directory structure required by decoding scripts -cd $root -mkdir -p data/$dataset/$set -cp $dir2/${set}_wav.scp data/$dataset/$set/wav.scp || exit 1; -cp $dir2/$set.txt data/$dataset/$set/text || exit 1; -cp $dir2/$set.spk2utt data/$dataset/$set/spk2utt || exit 1; -cp $dir2/$set.utt2spk data/$dataset/$set/utt2spk || exit 1; - -echo "Data preparation for $set succeeded" -#echo "Put files into $dir2/$set.*" - - -mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ - data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; -steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; - -done diff --git a/egs/reverb/s5/local/calc_wer.sh b/egs/reverb/s5/local/calc_wer.sh deleted file mode 100755 index c4b5eeb87f3..00000000000 --- a/egs/reverb/s5/local/calc_wer.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -# Copyright 2016 MERL (author: Shinji Watanabe) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -. ./cmd.sh -. ./path.sh - -lmw=15 -am="tri2a" -lm="bg_5k" -decode="" - -. utils/parse_options.sh - -if [ ! 
-z $decode ]; then - decode="_$decode" -fi - -dir="exp/$am/decode${decode}_${lm}_REVERB_" -echo "####################" -echo "${dir}*dt*" -for a in `echo ${dir}*dt* | tr " " "\n" | grep -v "A\.si"`; do - echo $a | awk -F '_' '{for(i=NF-6;i [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat 
$dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/reverb/s5/local/chain/run_tdnn.sh b/egs/reverb/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/reverb/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/reverb/s5/local/chain/run_tdnn_lstm.sh b/egs/reverb/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/reverb/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 
00000000000..61cc8b97d41 --- /dev/null +++ b/egs/reverb/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,281 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=tr_simu_8ch +test_sets="dt_real_1ch dt_simu_1ch et_real_1ch et_simu_1ch" +gmm=tri3 +nnet3_affix=_tr_simu_8ch +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate 
output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + 
--egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + 
relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + 
--egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l " + echo "options" + echo " --cmd # Command to run in parallel with" + echo " --nch # nch of WPE to use for computing SE scores" + echo " --enable_pesq # Boolean flag to enable PESQ" + exit 1; +fi + +reverb_data=$1 +enhancement_directory=$2 +pesqdir=$3 +enhancement_directory_sim=$enhancement_directory/WPE/${nch}ch/REVERB_WSJCAM0_dt/data/ +enhancement_directory_real=$enhancement_directory/WPE/${nch}ch/MC_WSJ_AV_Dev/ +expdir=${PWD}/exp/compute_se_${nch}ch +if $enable_pesq; then + compute_pesq=1 +else + compute_pesq=0 +fi + +pushd local/REVERB_scores_source/REVERB-SPEENHA.Release04Oct/evaltools +$cmd $expdir/compute_se_real.log matlab -nodisplay -nosplash -r "addpath('SRMRToolbox'); score_RealData('$reverb_data','$enhancement_directory_real');exit" +$cmd $expdir/compute_se_sim.log matlab -nodisplay -nosplash -r "addpath('SRMRToolbox'); score_SimData('$reverb_data','$enhancement_directory_sim','$pesqdir',$compute_pesq);exit" +popd +rm -rf $expdir/scores +mv local/REVERB_scores_source/REVERB-SPEENHA.Release04Oct/scores $expdir/ diff --git a/egs/reverb/s5/local/download_se_eval_tool.sh b/egs/reverb/s5/local/download_se_eval_tool.sh new file mode 100755 index 00000000000..c7b272907b6 --- /dev/null +++ b/egs/reverb/s5/local/download_se_eval_tool.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# This script downloads the official REVERB challenge SE scripts and SRMR toolbox +# This script also downloads and compiles PESQ +# please make sure that you or your institution have the license to report PESQ +# Apache 2.0 + +wget 'https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-P.862-200102-I!!SOFT-ZST-E&type=items' -O PESQ.zip +unzip PESQ.zip -d local/PESQ_sources +rm 
PESQ.zip +cd local/PESQ_sources/P862/Software/source +gcc *.c -lm -o PESQ +cd ../../../../../ +mv local/PESQ_sources/P862/Software/source/PESQ local/ + +wget 'https://reverb2014.dereverberation.com/tools/REVERB-SPEENHA.Release04Oct.zip' -O REVERB_scores.zip +unzip REVERB_scores.zip -d local/REVERB_scores_source +rm REVERB_scores.zip + +pushd local/REVERB_scores_source/REVERB-SPEENHA.Release04Oct/evaltools +sed -i 's/wavread/audioread/g' prog/score_sim.m +git clone https://github.com/MuSAELab/SRMRToolbox.git +sed -i 's/wavread/audioread/g' SRMRToolbox/libs/preprocess.m +sed -i 's/SRMR_main/SRMR/g' prog/score_real.m +sed -i 's/SRMR_main/SRMR/g' prog/score_sim.m +sed -i 's/+wb\ //g' prog/calcpesq.m +sed -i 's/pesq_/_pesq_/g' prog/calcpesq.m +sed -ie '30d;31d' prog/calcpesq.m +patch score_RealData.m -i ../../../score_RealData.patch -o score_RealData_new.m +mv score_RealData_new.m score_RealData.m +patch score_SimData.m -i ../../../score_SimData.patch -o score_SimData_new.m +mv score_SimData_new.m score_SimData.m +popd diff --git a/egs/reverb/s5/local/generate_data.sh b/egs/reverb/s5/local/generate_data.sh new file mode 100755 index 00000000000..3228f0e1b3c --- /dev/null +++ b/egs/reverb/s5/local/generate_data.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Copyright 2018 Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# This script is adapted from data preparation scripts in the Kaldi reverb recipe +# https://github.com/kaldi-asr/kaldi/tree/master/egs/reverb/s5/local + +# Begin configuration section. +wavdir=${PWD}/wav +# End configuration section + +. ./utils/parse_options.sh # accept options.. you can run this run.sh with the + +. 
./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 1 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora3/LDC/LDC95S24/wsjcam0" + exit 1 +fi + +set -e -o pipefail + +wsjcam0=$1 +mkdir -p ${wavdir} + +# tool directory +dir=${PWD}/data/local/reverb_tools +mkdir -p ${dir} + +# Download tools +URL1="http://reverb2014.dereverberation.com/tools/reverb_tools_for_Generate_mcTrainData.tgz" +URL2="http://reverb2014.dereverberation.com/tools/REVERB_TOOLS_FOR_ASR_ver2.0.tgz" +for f in $URL1 $URL2; do + x=`basename $f` + if [ ! -e $dir/$x ]; then + wget $f -O $dir/$x || exit 1; + tar zxvf $dir/$x -C $dir || exit 1; + fi +done +URL3="http://reverb2014.dereverberation.com/tools/taskFiles_et.tgz" +x=`basename $URL3` +if [ ! -e $dir/$x ]; then + wget $URL3 -O $dir/$x || exit 1; + tar zxvf $dir/$x -C $dir || exit 1; + cp -fr $dir/`basename $x .tgz`/* $dir/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/ +fi + +# generate WAV files for matlab +echo "generating WAV files" +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at ${sph2pipe}"; + exit 1; +fi +for sph in `cat ${dir}/reverb_tools_for_Generate_mcTrainData/etc/audio_si_tr.lst`; do + d=`dirname ${wavdir}/WSJCAM0/data/${sph}` + if [ ! 
-d "${d}" ]; then + mkdir -p ${d} + fi + ${sph2pipe} -f wav ${wsjcam0}/data/${sph}.wv1 > ${wavdir}/WSJCAM0/data/${sph}.wav +done +nwav=`find ${wavdir}/WSJCAM0/data/primary_microphone/si_tr | grep .wav | wc -l` +echo "generated ${nwav} WAV files (it must be 7861)" +[ "$nwav" -eq 7861 ] || echo "Warning: expected 7861 WAV files, got $nwav" + +# generate training data +reverb_tr_dir=${wavdir}/REVERB_WSJCAM0_tr +cp local/Generate_mcTrainData_cut.m $dir/reverb_tools_for_Generate_mcTrainData/ +pushd $dir/reverb_tools_for_Generate_mcTrainData/ +tmpdir=`mktemp -d tempXXXXX ` +tmpmfile=$tmpdir/run_mat.m +cat <<EOF > $tmpmfile +addpath(genpath('.')) +Generate_mcTrainData_cut('$wavdir/WSJCAM0', '$reverb_tr_dir'); +EOF +cat $tmpmfile | matlab -nodisplay +rm -rf $tmpdir +popd + +echo "Successfully generated multi-condition training data and stored it in $reverb_tr_dir." && exit 0; diff --git a/egs/reverb/s5/local/get_results.sh b/egs/reverb/s5/local/get_results.sh index 7c74736e5d1..8867961dcdd 100755 --- a/egs/reverb/s5/local/get_results.sh +++ b/egs/reverb/s5/local/get_results.sh @@ -1,18 +1,86 @@ #!/bin/bash -# Reproduce selected results in Table 1 from Weninger et al. 
(2014) # "Our baselines" - -# LDA-STC fMLLR MCT DT LM MBR -# No No No No BG No -local/calc_wer.sh -# No No Yes No BG No -local/calc_wer.sh --am tri2a_mc -# No Yes Yes No BG No -local/calc_wer.sh --am tri2a_mc --decode basis_fmllr -# Yes Yes Yes No TG No -local/calc_wer.sh --am tri2b_mc --lm tg_5k --decode basis_fmllr -# Yes Yes Yes Yes TG No -local/calc_wer.sh --am tri2b_mc_mmi_b0.1 --lm tg_5k --decode basis_fmllr -# Yes Yes Yes Yes TG Yes -local/calc_wer.sh --am tri2b_mc_mmi_b0.1 --lm tg_5k --decode mbr_basis_fmllr +echo "########################################" +echo "GMM RESULTs:" +echo "exp/tri3/decode_dt_real_1ch" +cat exp/tri3/decode_dt_real_1ch/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_simu_1ch" +cat exp/tri3/decode_dt_simu_1ch/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_real_1ch" +cat exp/tri3/decode_et_real_1ch/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_simu_1ch" +cat exp/tri3/decode_et_simu_1ch/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_real_1ch_wpe" +cat exp/tri3/decode_dt_real_1ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_simu_1ch_wpe" +cat exp/tri3/decode_dt_simu_1ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_real_1ch_wpe" +cat exp/tri3/decode_et_real_1ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_simu_1ch_wpe" +cat exp/tri3/decode_et_simu_1ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_real_2ch_wpe" +cat exp/tri3/decode_dt_real_2ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_simu_2ch_wpe" +cat exp/tri3/decode_dt_simu_2ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_real_2ch_wpe" +cat exp/tri3/decode_et_real_2ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_simu_2ch_wpe" +cat exp/tri3/decode_et_simu_2ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_real_8ch_wpe" +cat exp/tri3/decode_dt_real_8ch_wpe/scoring_kaldi/best_wer* +echo "" +echo 
"exp/tri3/decode_dt_simu_8ch_wpe" +cat exp/tri3/decode_dt_simu_8ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_real_8ch_wpe" +cat exp/tri3/decode_et_real_8ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_simu_8ch_wpe" +cat exp/tri3/decode_et_simu_8ch_wpe/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_real_2ch_beamformit" +cat exp/tri3/decode_dt_real_2ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_simu_2ch_beamformit" +cat exp/tri3/decode_dt_simu_2ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_real_2ch_beamformit" +cat exp/tri3/decode_et_real_2ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_simu_2ch_beamformit" +cat exp/tri3/decode_et_simu_2ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_real_8ch_beamformit" +cat exp/tri3/decode_dt_real_8ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_simu_8ch_beamformit" +cat exp/tri3/decode_dt_simu_8ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_real_8ch_beamformit" +cat exp/tri3/decode_et_real_8ch_beamformit/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_dt_cln" +cat exp/tri3/decode_dt_cln/scoring_kaldi/best_wer* +echo "" +echo "exp/tri3/decode_et_cln" +cat exp/tri3/decode_et_cln/scoring_kaldi/best_wer* +echo "########################################" +echo "TDNN RESULTs:" +echo "exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt*" +cat exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_dt*/scoring_kaldi/best_wer_* +echo "" +echo "exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et*" +cat exp/chain_tr_simu_8ch/tdnn1a_sp/decode_test_tg_5k_et*/scoring_kaldi/best_wer_* diff --git a/egs/reverb/s5/local/nnet3/compare_wer.sh b/egs/reverb/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/reverb/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is 
used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if 
$include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/reverb/s5/local/nnet3/run_ivector_common.sh b/egs/reverb/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..3af3ad77565 --- /dev/null +++ b/egs/reverb/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). 
It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{4,5,6,8}/$USER/kaldi-data/mfcc/reverb-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 1 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora5/REVERB_2014/REVERB" + exit 1 +fi + +set -e -o pipefail + +reverb=$1 + +# working directory +dir=${PWD}/data/local/data +mkdir -p ${dir} + +for task in dt et; do + if [ ${task} == 'dt' ]; then + mlf=${reverb}/MC_WSJ_AV_Dev/mlf/WSJ.mlf + elif [ ${task} == 'et' ]; then + mlf=${reverb}/MC_WSJ_AV_Eval/mlf/WSJ.mlf + fi + # MLF transcription correction + # taken from HTK baseline script + sed -e ' +# dos to unix line feed conversion +s/\x0D$//' \ + -e " + s/\x60//g # remove unicode character grave accent. + " \ + -e " + # fix the single quote for the word yield + # and the quoted ROOTS + # e.g. 
yield' --> yield + # reason: YIELD' is not in dict, while YIELD is + s/YIELD'/YIELD/g + s/'ROOTS'/ROOTS/g + s/'WHERE/WHERE/g + s/PEOPLE'/PEOPLE/g + s/SIT'/SIT/g + s/'DOMINEE/DOMINEE/g + s/CHURCH'/CHURCH/g" \ + -e ' + # fix the single missing double full stop issue at the end of an utterance + # e.g. I. C. N should be I. C. N. + # reason: N is not in dict, while N. is + /^[A-Z]$/ { + # append a line + N + # search for single dot on the second line + /\n\./ { + # found it - now replace the + s/\([A-Z]\)\n\./\1\.\n\./ + } + }' \ + $mlf |\ + perl local/mlf2text.pl > ${dir}/${task}.txt +done + + +noiseword=""; +for nch in 1 2 8; do + taskdir=data/local/reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/${nch}ch + # make a wav list + for task in dt et; do + if [ ${task} == 'dt' ]; then + audiodir=${reverb}/MC_WSJ_AV_Dev + audiodir_wpe=${wavdir}/WPE/${nch}ch/MC_WSJ_AV_Dev + elif [ ${task} == 'et' ]; then + audiodir=${reverb}/MC_WSJ_AV_Eval + audiodir_wpe=${wavdir}/WPE/${nch}ch/MC_WSJ_AV_Eval + fi + for x in `ls ${taskdir} | grep RealData | grep _${task}_`; do + perl -se 'while(<>){m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_"; $id = lc $1; print "$id $dir$_";}' -- -dir=${audiodir} ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_real_${nch}ch_wav.scp + for x in `ls ${taskdir} | grep RealData | grep _${task}_`; do + perl -se 'while(<>){m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_"; $id = lc $1; print "$id $dir$_";}' -- -dir=${audiodir_wpe} ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_real_${nch}ch_wpe_wav.scp + done + # make a transcript + for task in dt et; do + for x in `ls ${taskdir} | grep RealData | grep _${task}_`; do + perl -se 'while(<>){m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_"; $id = lc $1; print "$id\n";}' ${taskdir}/$x |\ + perl local/find_transcripts_txt.pl ${dir}/${task}.txt |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_real_${nch}ch.trans1 || 
exit 1; + cat ${dir}/${task}_real_${nch}ch.trans1 | local/normalize_transcript.pl ${noiseword} > ${dir}/${task}_real_${nch}ch.txt || exit 1; + done + + # Make the utt2spk and spk2utt files. + for task in dt et; do + cat ${dir}/${task}_real_${nch}ch_wav.scp | awk '{print $1}' | awk -F '_' '{print $0 " " $1}' > ${dir}/${task}_real_${nch}ch.utt2spk || exit 1; + cat ${dir}/${task}_real_${nch}ch.utt2spk | ./utils/utt2spk_to_spk2utt.pl > ${dir}/${task}_real_${nch}ch.spk2utt || exit 1; + done +done + +# finally copy the above files to the data directory +for nch in 1 2 8; do + for task in dt et; do + datadir=data/${task}_real_${nch}ch + mkdir -p ${datadir} + sort ${dir}/${task}_real_${nch}ch_wav.scp > ${datadir}/wav.scp + sort ${dir}/${task}_real_${nch}ch.txt > ${datadir}/text + sort ${dir}/${task}_real_${nch}ch.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_real_${nch}ch.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + if [ ${nch} != 1 ]; then + datadir=data/${task}_real_${nch}ch_beamformit + mkdir -p ${datadir} + sort ${dir}/${task}_real_1ch_wpe_wav.scp | sed -e "s/-[1-8]_/-bf${nch}_/" | sed -e "s/WPE\/1ch/WPE\/${nch}ch/" > ${datadir}/wav.scp + sort ${dir}/${task}_real_1ch.txt > ${datadir}/text + sort ${dir}/${task}_real_1ch.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_real_1ch.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + fi + datadir=data/${task}_real_${nch}ch_wpe + mkdir -p ${datadir} + sort ${dir}/${task}_real_1ch_wpe_wav.scp | sed -e "s/WPE\/1ch/WPE\/${nch}ch/" > ${datadir}/wav.scp + sort ${dir}/${task}_real_1ch.txt > ${datadir}/text + sort ${dir}/${task}_real_1ch.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_real_1ch.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + done +done diff --git a/egs/reverb/s5/local/prepare_simu_data.sh b/egs/reverb/s5/local/prepare_simu_data.sh new file mode 100755 index 00000000000..8757021ddd7 --- /dev/null +++ b/egs/reverb/s5/local/prepare_simu_data.sh @@ -0,0 
+1,150 @@ +#!/bin/bash +# +# Copyright 2018 Johns Hopkins University (Author: Shinji Watanabe) +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# This script is adapted from data preparation scripts in the Kaldi reverb recipe +# https://github.com/kaldi-asr/kaldi/tree/master/egs/reverb/s5/local + +# Begin configuration section. +wavdir=${PWD}/wav +# End configuration section +. ./utils/parse_options.sh # accept options.. you can run this run.sh with the + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 2 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /export/corpora5/REVERB_2014/REVERB /export/corpora3/LDC/LDC95S24/wsjcam0" + exit 1 +fi + +set -e -o pipefail + +reverb=$1 +wsjcam0=$2 + +# tool directory +tooldir=${PWD}/data/local/reverb_tools + +# working directory +dir=${PWD}/data/local/data +mkdir -p ${dir} + +# make a one dot file for train, dev, and eval data +# the directory structure of WSJCAM0 is not consistent and we need such process for each task +cp ${wsjcam0}/data/primary_microphone/etc/si_tr.dot ${dir}/tr.dot +cat ${wsjcam0}/data/primary_microphone/etc/si_dt*.dot | sort > ${dir}/dt.dot +cat ${wsjcam0}/data/*/si_et*/*/*.dot | sort > ${dir}/et.dot + +noiseword=""; +for nch in 1 2 8; do + taskdir=data/local/reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/${nch}ch + # make a wav list + task=tr + for x in `ls ${taskdir} | grep SimData | grep _${task}_`; do + perl -se 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, " ", $dir, $_, "\n"; } }' -- -dir=${wavdir}/REVERB_WSJCAM0_${task}/data ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_simu_${nch}ch_wav.scp + for task in dt et; do + for x in `ls ${taskdir} | grep SimData | grep _${task}_ | grep -e far -e near`; do + perl -se 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, " ", $dir, $_, "\n"; } }' -- 
-dir=${reverb}/REVERB_WSJCAM0_${task}/data ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_simu_${nch}ch_wav.scp + if [ ${nch} == 1 ]; then + for x in `ls ${taskdir} | grep SimData | grep _${task}_ | grep -e cln`; do + perl -se 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, " ", $dir, $_, "\n"; } }' -- -dir=${reverb}/REVERB_WSJCAM0_${task}/data ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_cln_wav.scp + fi + done + + task=tr + for x in `ls ${taskdir} | grep SimData | grep _${task}_`; do + perl -se 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, " ", $dir, $_, "\n"; } }' -- -dir=${wavdir}/WPE/${nch}ch/REVERB_WSJCAM0_${task}/data ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_simu_${nch}ch_wpe_wav.scp + for task in dt et; do + for x in `ls ${taskdir} | grep SimData | grep _${task}_ | grep -e far -e near`; do + perl -se 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, " ", $dir, $_, "\n"; } }' -- -dir=${wavdir}/WPE/${nch}ch/REVERB_WSJCAM0_${task}/data ${taskdir}/$x |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_simu_${nch}ch_wpe_wav.scp + done + + # make a transcript + task=tr + for x in `ls ${taskdir} | grep SimData | grep _${task}_`; do + perl -e 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, "\n"; } }' ${taskdir}/$x |\ + perl local/find_transcripts_singledot.pl ${dir}/${task}.dot |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_simu_${nch}ch.trans1 || exit 1; + cat ${dir}/${task}_simu_${nch}ch.trans1 | local/normalize_transcript.pl ${noiseword} > ${dir}/${task}_simu_${nch}ch.txt || exit 1; + for task in dt et; do + for x in `ls ${taskdir} | grep SimData | grep _${task}_ | grep -e far -e near`; do + perl -e 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, "\n"; } }' ${taskdir}/$x |\ + perl local/find_transcripts_singledot.pl ${dir}/${task}.dot |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > 
${dir}/${task}_simu_${nch}ch.trans1 || exit 1; + cat ${dir}/${task}_simu_${nch}ch.trans1 | local/normalize_transcript.pl ${noiseword} > ${dir}/${task}_simu_${nch}ch.txt || exit 1; + if [ ${nch} == 1 ]; then + for x in `ls ${taskdir} | grep SimData | grep _${task}_ | grep -e cln`; do + perl -e 'while (<>) { chomp; if (m/\/(\w{8})[^\/]+$/) { print $1, "\n"; } }' ${taskdir}/$x |\ + perl local/find_transcripts_singledot.pl ${dir}/${task}.dot |\ + sed -e "s/^\(...\)/\1_${x}_\1/" + done > ${dir}/${task}_cln.trans1 || exit 1; + cat ${dir}/${task}_cln.trans1 | local/normalize_transcript.pl ${noiseword} > ${dir}/${task}_cln.txt || exit 1; + fi + done + + # Make the utt2spk and spk2utt files. + for task in tr dt et; do + cat ${dir}/${task}_simu_${nch}ch_wav.scp | awk '{print $1}' | awk -F '_' '{print $0 " " $1}' > ${dir}/${task}_simu_${nch}ch.utt2spk || exit 1; + cat ${dir}/${task}_simu_${nch}ch.utt2spk | ./utils/utt2spk_to_spk2utt.pl > ${dir}/${task}_simu_${nch}ch.spk2utt || exit 1; + done + for task in dt et; do + cat ${dir}/${task}_cln_wav.scp | awk '{print $1}' | awk -F '_' '{print $0 " " $1}' > ${dir}/${task}_cln.utt2spk || exit 1; + cat ${dir}/${task}_cln.utt2spk | ./utils/utt2spk_to_spk2utt.pl > ${dir}/${task}_cln.spk2utt || exit 1; + done +done + +# finally copy the above files to the data directory +for nch in 1 2 8; do + for task in tr dt et; do + datadir=data/${task}_simu_${nch}ch + mkdir -p ${datadir} + sort ${dir}/${task}_simu_${nch}ch_wav.scp > ${datadir}/wav.scp + sort ${dir}/${task}_simu_${nch}ch.txt > ${datadir}/text + sort ${dir}/${task}_simu_${nch}ch.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_simu_${nch}ch.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + if [ ${task} != 'tr' ]; then + datadir=data/${task}_simu_${nch}ch_wpe + mkdir -p ${datadir} + sort ${dir}/${task}_simu_1ch_wpe_wav.scp | sed -e "s/WPE\/1ch/WPE\/${nch}ch/" > ${datadir}/wav.scp + sort ${dir}/${task}_simu_1ch.txt > ${datadir}/text + sort 
${dir}/${task}_simu_1ch.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_simu_1ch.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + if [ ${nch} != 1 ]; then + datadir=data/${task}_simu_${nch}ch_beamformit + mkdir -p ${datadir} + sort ${dir}/${task}_simu_1ch_wpe_wav.scp | sed -e "s/ch1/bf${nch}/" | sed -e "s/WPE\/1ch/WPE\/${nch}ch/" > ${datadir}/wav.scp + sort ${dir}/${task}_simu_1ch.txt > ${datadir}/text + sort ${dir}/${task}_simu_1ch.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_simu_1ch.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + else + datadir=data/${task}_cln + mkdir -p ${datadir} + sort ${dir}/${task}_cln_wav.scp > ${datadir}/wav.scp + sort ${dir}/${task}_cln.txt > ${datadir}/text + sort ${dir}/${task}_cln.utt2spk > ${datadir}/utt2spk + sort ${dir}/${task}_cln.spk2utt > ${datadir}/spk2utt + ./utils/fix_data_dir.sh ${datadir} + fi + fi + done +done diff --git a/egs/reverb/s5/local/run_beamform.sh b/egs/reverb/s5/local/run_beamform.sh new file mode 100755 index 00000000000..1c8aade7287 --- /dev/null +++ b/egs/reverb/s5/local/run_beamform.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) +# Copyright 2018, Johns Hopkins University (Author: Aswin Shanmugam Subramanian) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=50 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 1 ]; then + echo "Wrong #arguments ($#, expected 1)" + echo "Usage: local/run_beamform.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +odir=$1 +dir=${PWD}/data/local/data + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/extras/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! 
hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; extras/install_beamformit.sh;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +for task in dt et; do + for nch in 2 8; do + wdir=exp/beamform_real_${task}_${nch}ch + mkdir -p $wdir/log + arrays=$wdir/channels + output_wavfiles=$wdir/wavfiles.list + if [ ${nch} == 2 ]; then + allwavs=`cat ${dir}/${task}_real_${nch}ch_wpe_wav.scp | cut -d " " -f2` + allwavs_beamformit=`cat data/${task}_real_${nch}ch_beamformit/wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%2==1' > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%2==0' > $wdir/channels.2nd + echo $allwavs_beamformit | tr ' ' '\n' | rev | sort | rev | awk -F 'WPE/' '{print $2}' | awk -F '.wav' '{print $1}' > $output_wavfiles + paste -d" " $output_wavfiles $wdir/channels.1st $wdir/channels.2nd > $arrays + elif [ ${nch} == 8 ]; then + allwavs=`cat ${dir}/${task}_real_${nch}ch_wpe_wav.scp | cut -d " " -f2` + allwavs_beamformit=`cat data/${task}_real_${nch}ch_beamformit/wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==1' > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==2' > $wdir/channels.2nd + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==3' > $wdir/channels.3rd + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==4' > $wdir/channels.4th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==5' > $wdir/channels.5th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==6' > $wdir/channels.6th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==7' > $wdir/channels.7th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==0' > $wdir/channels.8th + echo $allwavs_beamformit | tr ' ' '\n' | rev | sort | rev | awk -F 'WPE/' '{print 
$2}' | awk -F '.wav' '{print $1}' > $output_wavfiles + paste -d" " $output_wavfiles $wdir/channels.1st $wdir/channels.2nd $wdir/channels.3rd $wdir/channels.4th $wdir/channels.5th $wdir/channels.6th $wdir/channels.7th $wdir/channels.8th > $arrays + fi + # split the list for parallel processing + split_wavfiles="" + for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" + done + utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + + echo -e "Beamforming - $task - real - $nch ch\n" + # making a shell script for each job + for n in `seq $nj`; do + cat <<-EOF > $wdir/log/beamform.$n.sh + while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $arrays \ + --config_file `pwd`/conf/reverb_beamformit.cfg \ + --result_dir $odir + done < $output_wavfiles.$n + EOF + done + + chmod a+x $wdir/log/beamform.*.sh + $cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + done +done + +for task in dt et; do + for nch in 2 8; do + wdir=exp/beamform_simu_${task}_${nch}ch + mkdir -p $wdir/log + arrays=$wdir/channels + output_wavfiles=$wdir/wavfiles.list + if [ ${nch} == 2 ]; then + allwavs=`cat ${dir}/${task}_simu_${nch}ch_wpe_wav.scp | grep "ch[1-2].wav" | cut -d " " -f2` + allwavs_beamformit=`cat data/${task}_simu_${nch}ch_beamformit/wav.scp | grep "bf2.wav" | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | grep 'ch1' | sort > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | grep 'ch2' | sort > $wdir/channels.2nd + echo $allwavs_beamformit | tr ' ' '\n' | awk -F 'WPE/' '{print $2}' | sort | awk -F '.wav' '{print $1}' > $output_wavfiles + paste -d" " $output_wavfiles $wdir/channels.1st $wdir/channels.2nd > $arrays + elif [ ${nch} == 8 ]; then + allwavs=`cat ${dir}/${task}_simu_${nch}ch_wpe_wav.scp | grep "ch[1-8].wav" | cut -d " " -f2` + allwavs_beamformit=`cat data/${task}_simu_${nch}ch_beamformit/wav.scp | grep "bf8.wav" | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | grep 'ch1' | sort > $wdir/channels.1st + echo $allwavs | 
tr ' ' '\n' | grep 'ch2' | sort > $wdir/channels.2nd + echo $allwavs | tr ' ' '\n' | grep 'ch3' | sort > $wdir/channels.3rd + echo $allwavs | tr ' ' '\n' | grep 'ch4' | sort > $wdir/channels.4th + echo $allwavs | tr ' ' '\n' | grep 'ch5' | sort > $wdir/channels.5th + echo $allwavs | tr ' ' '\n' | grep 'ch6' | sort > $wdir/channels.6th + echo $allwavs | tr ' ' '\n' | grep 'ch7' | sort > $wdir/channels.7th + echo $allwavs | tr ' ' '\n' | grep 'ch8' | sort > $wdir/channels.8th + echo $allwavs_beamformit | tr ' ' '\n' | awk -F 'WPE/' '{print $2}' | sort | awk -F '.wav' '{print $1}' > $output_wavfiles + paste -d" " $output_wavfiles $wdir/channels.1st $wdir/channels.2nd $wdir/channels.3rd $wdir/channels.4th $wdir/channels.5th $wdir/channels.6th $wdir/channels.7th $wdir/channels.8th > $arrays + fi + # split the list for parallel processing + split_wavfiles="" + for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" + done + utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + + echo -e "Beamforming - $task - simu - $nch ch\n" + # making a shell script for each job + for n in `seq $nj`; do + cat <<-EOF > $wdir/log/beamform.$n.sh + while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $arrays \ + --config_file `pwd`/conf/reverb_beamformit.cfg \ + --result_dir $odir + done < $output_wavfiles.$n + EOF + done + + chmod a+x $wdir/log/beamform.*.sh + $cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + done +done +echo "`basename $0` Done." 
diff --git a/egs/reverb/s5/local/run_wpe.py b/egs/reverb/s5/local/run_wpe.py new file mode 100644 index 00000000000..cc9cd41927a --- /dev/null +++ b/egs/reverb/s5/local/run_wpe.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 + +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +from nara_wpe.wpe import wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/reverb/s5/local/run_wpe.sh b/egs/reverb/s5/local/run_wpe.sh new file mode 100755 index 00000000000..d1ea56c6c55 --- /dev/null +++ b/egs/reverb/s5/local/run_wpe.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=50 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" +fi + +# check if WPE is installed +result=`$HOME/miniconda3/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" +fi + +dir=${PWD}/data/local/data + +for task in dt et; do + for nch in 1 2 8; do + wdir=exp/wpe_real_${task}_${nch}ch + mkdir -p $wdir/log + arrays=$wdir/channels + output_wavfiles=$wdir/wavfiles.list + if [ ${nch} == 1 ]; then + allwavs=`cat ${dir}/${task}_real_1ch_wav.scp | cut -d " " -f2` + allwavs_output=`cat ${dir}/${task}_real_1ch_wpe_wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' > $wdir/channels_input + echo $allwavs_output | tr ' ' '\n' > $wdir/channels_output + paste -d" " $wdir/channels_input $wdir/channels_output > $arrays + elif [ ${nch} == 2 ]; then + allwavs=`cat ${dir}/${task}_real_2ch_wav.scp | cut -d " " -f2` + allwavs_output=`cat ${dir}/${task}_real_2ch_wpe_wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%2==1' > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%2==0' > $wdir/channels.2nd + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%2==1' > $wdir/channels_output.1st + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%2==0' > $wdir/channels_output.2nd + paste -d" " $wdir/channels.1st $wdir/channels.2nd $wdir/channels_output.1st $wdir/channels_output.2nd > $arrays + elif [ ${nch} == 8 ]; then + allwavs=`cat ${dir}/${task}_real_8ch_wav.scp | cut -d " " -f2` + allwavs_output=`cat ${dir}/${task}_real_8ch_wpe_wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | rev | sort 
| rev | awk 'NR%8==1' > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==2' > $wdir/channels.2nd + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==3' > $wdir/channels.3rd + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==4' > $wdir/channels.4th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==5' > $wdir/channels.5th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==6' > $wdir/channels.6th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==7' > $wdir/channels.7th + echo $allwavs | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==0' > $wdir/channels.8th + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==1' > $wdir/channels_output.1st + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==2' > $wdir/channels_output.2nd + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==3' > $wdir/channels_output.3rd + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==4' > $wdir/channels_output.4th + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==5' > $wdir/channels_output.5th + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==6' > $wdir/channels_output.6th + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==7' > $wdir/channels_output.7th + echo $allwavs_output | tr ' ' '\n' | rev | sort | rev | awk 'NR%8==0' > $wdir/channels_output.8th + paste -d" " $wdir/channels.1st $wdir/channels.2nd $wdir/channels.3rd $wdir/channels.4th $wdir/channels.5th $wdir/channels.6th $wdir/channels.7th $wdir/channels.8th $wdir/channels_output.1st $wdir/channels_output.2nd $wdir/channels_output.3rd $wdir/channels_output.4th $wdir/channels_output.5th $wdir/channels_output.6th $wdir/channels_output.7th $wdir/channels_output.8th > $arrays + fi + + # split the list for parallel processing + split_wavfiles="" + for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" + done + 
utils/split_scp.pl $arrays $split_wavfiles || exit 1; + + echo -e "Dereverberation - $task - real - $nch ch\n" + # making a shell script for each job + for n in `seq $nj`; do + cat <<-EOF > $wdir/log/wpe.$n.sh + while read line; do + $HOME/miniconda3/bin/python local/run_wpe.py \ + --file \$line + done < $output_wavfiles.$n + EOF + done + + chmod a+x $wdir/log/wpe.*.sh + $cmd JOB=1:$nj $wdir/log/wpe.JOB.log \ + $wdir/log/wpe.JOB.sh + done +done + +for task in dt et; do + for nch in 1 2 8; do + wdir=exp/wpe_simu_${task}_${nch}ch + mkdir -p $wdir/log + arrays=$wdir/channels + output_wavfiles=$wdir/wavfiles.list + if [ ${nch} == 1 ]; then + allwavs=`cat ${dir}/${task}_simu_1ch_wav.scp | cut -d " " -f2` + allwavs_output=`cat ${dir}/${task}_simu_1ch_wpe_wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' > $wdir/channels_input + echo $allwavs_output | tr ' ' '\n' > $wdir/channels_output + paste -d" " $wdir/channels_input $wdir/channels_output > $arrays + elif [ ${nch} == 2 ]; then + allwavs=`cat ${dir}/${task}_simu_2ch_wav.scp | cut -d " " -f2` + allwavs_output=`cat ${dir}/${task}_simu_2ch_wpe_wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | grep 'ch1' | sort > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | grep 'ch2' | sort > $wdir/channels.2nd + echo $allwavs_output | tr ' ' '\n' | grep 'ch1' | sort > $wdir/channels_output.1st + echo $allwavs_output | tr ' ' '\n' | grep 'ch2' | sort > $wdir/channels_output.2nd + paste -d" " $wdir/channels.1st $wdir/channels.2nd $wdir/channels_output.1st $wdir/channels_output.2nd > $arrays + elif [ ${nch} == 8 ]; then + allwavs=`cat ${dir}/${task}_simu_8ch_wav.scp | cut -d " " -f2` + allwavs_output=`cat ${dir}/${task}_simu_8ch_wpe_wav.scp | cut -d " " -f2` + echo $allwavs | tr ' ' '\n' | grep 'ch1' | sort > $wdir/channels.1st + echo $allwavs | tr ' ' '\n' | grep 'ch2' | sort > $wdir/channels.2nd + echo $allwavs | tr ' ' '\n' | grep 'ch3' | sort > $wdir/channels.3rd + echo $allwavs | tr ' ' '\n' | grep 'ch4' | sort > 
$wdir/channels.4th + echo $allwavs | tr ' ' '\n' | grep 'ch5' | sort > $wdir/channels.5th + echo $allwavs | tr ' ' '\n' | grep 'ch6' | sort > $wdir/channels.6th + echo $allwavs | tr ' ' '\n' | grep 'ch7' | sort > $wdir/channels.7th + echo $allwavs | tr ' ' '\n' | grep 'ch8' | sort > $wdir/channels.8th + echo $allwavs_output | tr ' ' '\n' | grep 'ch1' | sort > $wdir/channels_output.1st + echo $allwavs_output | tr ' ' '\n' | grep 'ch2' | sort > $wdir/channels_output.2nd + echo $allwavs_output | tr ' ' '\n' | grep 'ch3' | sort > $wdir/channels_output.3rd + echo $allwavs_output | tr ' ' '\n' | grep 'ch4' | sort > $wdir/channels_output.4th + echo $allwavs_output | tr ' ' '\n' | grep 'ch5' | sort > $wdir/channels_output.5th + echo $allwavs_output | tr ' ' '\n' | grep 'ch6' | sort > $wdir/channels_output.6th + echo $allwavs_output | tr ' ' '\n' | grep 'ch7' | sort > $wdir/channels_output.7th + echo $allwavs_output | tr ' ' '\n' | grep 'ch8' | sort > $wdir/channels_output.8th + paste -d" " $wdir/channels.1st $wdir/channels.2nd $wdir/channels.3rd $wdir/channels.4th $wdir/channels.5th $wdir/channels.6th $wdir/channels.7th $wdir/channels.8th $wdir/channels_output.1st $wdir/channels_output.2nd $wdir/channels_output.3rd $wdir/channels_output.4th $wdir/channels_output.5th $wdir/channels_output.6th $wdir/channels_output.7th $wdir/channels_output.8th > $arrays + fi + + # split the list for parallel processing + split_wavfiles="" + for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" + done + utils/split_scp.pl $arrays $split_wavfiles || exit 1; + + echo -e "Dereverberation - $task - simu - $nch ch\n" + # making a shell script for each job + for n in `seq $nj`; do + cat <<-EOF > $wdir/log/wpe.$n.sh + while read line; do + $HOME/miniconda3/bin/python local/run_wpe.py \ + --file \$line + done < $output_wavfiles.$n + EOF + done + + chmod a+x $wdir/log/wpe.*.sh + $cmd JOB=1:$nj $wdir/log/wpe.JOB.log \ + $wdir/log/wpe.JOB.sh + done +done +echo "`basename $0` 
Done." diff --git a/egs/reverb/s5/local/score.sh b/egs/reverb/s5/local/score.sh index abd8149a672..66bc976333f 100755 --- a/egs/reverb/s5/local/score.sh +++ b/egs/reverb/s5/local/score.sh @@ -1,23 +1,29 @@ #!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) # Apache 2.0 +# See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER + [ -f ./path.sh ] && . ./path.sh # begin configuration section. cmd=run.pl stage=0 -decode_mbr=true -word_ins_penalty=0.0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 min_lmwt=7 max_lmwt=17 +iter=final #end configuration section. +echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh . parse_options.sh || exit 1; if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --stage (0|1|2) # start scoring script from part-way through." @@ -37,21 +43,122 @@ for f in $symtab $dir/lat.1.gz $data/text; do [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; done -mkdir -p $dir/scoring/log -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +if echo $data | grep -q "real"; then + tasks="\ + near_room1 far_room1" +elif echo $data | grep -q "cln"; then + tasks="\ + cln_room1 cln_room2 cln_room3" +else + tasks="\ + near_room1 far_room1 \ + near_room2 far_room2 \ + near_room3 far_room3" +fi +for task in ${tasks}; do + grep $task $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt_${task}.txt || exit 1; +done + +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- 
ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + for task in ${tasks}; do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + grep $task $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_kaldi/test_filt_${task}.txt ark,p:- ">&" $dir/wer_LMWT_${wip}_${task} || exit 1; + done + done +fi + + + +if [ $stage -le 1 ]; then + for task in ${tasks}; do + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip}_${task} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer_${task} || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer_${task}) + best_wip=$(echo $best_wer_file | awk -F_ '{N=NF-2; print $N}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-3; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+ exit 1; + fi + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt_${task}.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt_${task}.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi + done +fi -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; +# If we got here, the scoring was successful. 
+# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# i keep both statement here because it could lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null exit 0; diff --git a/egs/reverb/s5/local/score_RealData.patch b/egs/reverb/s5/local/score_RealData.patch new file mode 100644 index 00000000000..cafa521d483 --- /dev/null +++ b/egs/reverb/s5/local/score_RealData.patch @@ -0,0 +1,14 @@ +11c11 +< clear all; +--- +> function score_RealData(download_from_ldc,senhroot) +26c26,27 +< srmrdir = 'SRMRtoolbox-ReverbChallenge'; +--- +> srmrdir = 'SRMRToolbox'; +> addpath(genpath('SRMRToolbox/libs')); +32d32 +< senhroot = '../output/RealData'; +129a130,131 +> +> end diff --git a/egs/reverb/s5/local/score_SimData.patch b/egs/reverb/s5/local/score_SimData.patch new file mode 100644 index 00000000000..4fb0d9f48ac --- /dev/null +++ b/egs/reverb/s5/local/score_SimData.patch @@ -0,0 +1,23 @@ +11c11 +< clear all; +--- +> function score_SimData(download_from_ldc,senhroot,pesqdir,compute_pesq) +26,27c26,27 +< srmrdir = 'SRMRtoolbox-ReverbChallenge'; +< % pesqdir = '/directory/where/pesq/executable/is/stored'; +--- +> srmrdir = 'SRMRToolbox'; +> addpath(genpath('SRMRToolbox/libs')); +36d35 +< senhroot = '../output/SimData'; +39c38 +< if exist('pesqdir', 'var') +--- +> if exist('pesqdir', 'var') && compute_pesq~=0 +471c470,472 +< fclose(fid); +\ No newline at end of file +--- +> fclose(fid); +> +> end diff --git a/egs/reverb/s5/local/score_mbr.sh b/egs/reverb/s5/local/score_mbr.sh deleted file mode 120000 index 2573fadf042..00000000000 --- a/egs/reverb/s5/local/score_mbr.sh +++ /dev/null @@ -1 +0,0 @@ -../../../wsj/s5/local/score_mbr.sh \ No newline at end of file diff --git a/egs/reverb/s5/path.sh b/egs/reverb/s5/path.sh index 1a6fb5f891b..f46c5d8cb72 100644 --- a/egs/reverb/s5/path.sh +++ 
b/egs/reverb/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +[ -f $KALDI_ROOT/tools/extras/env.sh ] && . $KALDI_ROOT/tools/extras/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh diff --git a/egs/reverb/s5/run.sh b/egs/reverb/s5/run.sh index cb0b00c19b6..999ec98e637 100755 --- a/egs/reverb/s5/run.sh +++ b/egs/reverb/s5/run.sh @@ -1,6 +1,8 @@ #!/bin/bash # Copyright 2013-2014 MERL (author: Felix Weninger and Shinji Watanabe) +# Johns Hopkins University (author: Szu-Jui Chen) +# Johns Hopkins University (author: Aswin Shanmugam Subramanian) # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -33,7 +35,13 @@ fi . ./cmd.sh . ./path.sh -stage=1 +stage=0 +nch_se=8 +# flag for turning on computation of dereverberation measures +compute_se=true +# please make sure that you or your institution have the license to report PESQ before turning on the below flag +enable_pesq=false + . 
utils/parse_options.sh # Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', @@ -41,297 +49,141 @@ set -euxo pipefail # please make sure to set the paths of the REVERB and WSJ0 data if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then - REVERB_home=/export/corpora5/REVERB_2014/REVERB + reverb=/export/corpora5/REVERB_2014/REVERB export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 # set LDC WSJ0 directory to obtain LMs # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B # It is assumed that there will be a 'wsj0' subdirectory # within the top-level corpus directory -elif [[ $(hostname -f) == *.merl.com ]] ; then - REVERB_home=/db/laputa1/data/original/public/REVERB - export wsjcam0=$REVERB_home/wsjcam0 - # set LDC WSJ0 directory to obtain LMs - # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) - export wsj0=/db/laputa1/data/original/public/WSJ0/11-13.1 #LDC93S6A or LDC93S6B - # It is assumed that there will be a 'wsj0' subdirectory - # within the top-level corpus directory else echo "Set the data directory locations." 
&& exit 1; fi -export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt -export reverb_et=$REVERB_home/REVERB_WSJCAM0_et -export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev -export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval - -# set the directory of the multi-condition training data to be generated -reverb_tr=`pwd`/data_tr_cut/REVERB_WSJCAM0_tr_cut -# LDA context size (left/right) (4 is default) -context_size=4 +#training set and test set +train_set=tr_simu_8ch +test_sets="dt_real_8ch_beamformit dt_simu_8ch_beamformit et_real_8ch_beamformit et_simu_8ch_beamformit dt_real_1ch_wpe dt_simu_1ch_wpe et_real_1ch_wpe et_simu_1ch_wpe dt_cln et_cln" # The language models with which to decode (tg_5k or bg_5k) lm="tg_5k" # number of jobs for feature extraction and model training -nj_train=30 - +nj=92 # number of jobs for decoding -nj_decode=8 - -# set to true if you want the tri2a systems (re-implementation of the HTK baselines) -do_tri2a=true +decode_nj=10 -if [ $stage -le 1 ]; then - # Generate multi-condition training data - # Note that utterance lengths match the original set. - # This enables using clean alignments in multi-condition training (stereo training) - local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr +wavdir=${PWD}/wav +pesqdir=${PWD}/local +if [ ${stage} -le 1 ]; then + # data preparation + echo "stage 0: Data preparation" + local/generate_data.sh --wavdir ${wavdir} ${wsjcam0} + local/prepare_simu_data.sh --wavdir ${wavdir} ${reverb} ${wsjcam0} + local/prepare_real_data.sh --wavdir ${wavdir} ${reverb} fi if [ $stage -le 2 ]; then + local/run_wpe.sh --cmd "$train_cmd" + local/run_beamform.sh --cmd "$train_cmd" ${wavdir}/WPE/ +fi + +# Compute dereverberation scores +if [ $stage -le 3 ] && $compute_se; then + if [ ! -d local/REVERB_scores_source ] || [ ! -d local/REVERB_scores_source/REVERB-SPEENHA.Release04Oct/evaltools/SRMRToolbox ] || [ ! 
-f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh + fi + local/compute_se_scores.sh --nch $nch_se --enable_pesq $enable_pesq $reverb $wavdir $pesqdir + cat exp/compute_se_${nch_se}ch/scores/score_SimData + cat exp/compute_se_${nch_se}ch/scores/score_RealData +fi + +if [ $stage -le 4 ]; then # Prepare wsjcam0 clean data and wsj0 language model. local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 - + # Prepare merged BEEP/CMU dictionary. local/wsj_prepare_beep_dict.sh # Prepare wordlists, etc. - utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang # Prepare directory structure for clean data. Apply some language model fixes. local/wsjcam0_format_data.sh +fi - # Now it's getting more interesting. - # Prepare the multi-condition training data and the REVERB dt set. - # This also extracts MFCC features (!!!) - # This creates the data sets called REVERB_tr_cut and REVERB_dt. - # If you have processed waveforms, this is a good starting point to integrate them. - # For example, you could have something like - # local/REVERB_wsjcam0_data_prep.sh /path/to/processed/REVERB_WSJCAM0_dt processed_REVERB_dt dt - # The first argument is supposed to point to a folder that has the same structure - # as the REVERB corpus. - local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr - local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt - local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et - - # Prepare the REVERB "real" dt set from MCWSJAV corpus. - # This corpus is *never* used for training. 
- # This creates the data set called REVERB_Real_dt and its subfolders - local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt - # The MLF file exists only once in the corpus, namely in the real_dt directory - # so we pass it as 4th argument - local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf +if [ $stage -le 5 ]; then + for dset in ${train_set} ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done fi -if [ $stage -le 3 ]; then - # Extract MFCC features for clean sets. - # For the non-clean data sets, this is outsourced to the data preparation scripts. +if [ $stage -le 6 ]; then + # Extract MFCC features for train and test sets. mfccdir=mfcc - ### for x in si_tr si_dt; do it seems that the number of transcriptions of si_dt is not correct. - for x in si_tr; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj_train \ + for x in ${train_set} ${test_sets}; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 \ data/$x exp/make_mfcc/$x $mfccdir steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done fi -if [ $stage -le 4 ]; then - # Train monophone model on clean data (si_tr). - echo "### TRAINING mono0a ###" - steps/train_mono.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ - data/si_tr data/lang exp/mono0a - - # Align monophones with clean data. - echo "### ALIGNING mono0a_ali ###" - steps/align_si.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ - data/si_tr data/lang exp/mono0a exp/mono0a_ali - - # Create first triphone recognizer. - echo "### TRAINING tri1 ###" - steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ - 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 - - echo "### ALIGNING tri1_ali ###" - # Re-align triphones. 
- steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ - data/si_tr data/lang exp/tri1 exp/tri1_ali +if [ $stage -le 7 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono fi -# The following code trains and evaluates a delta feature recognizer, which is similar to the HTK -# baseline (but using per-utterance basis fMLLR instead of batch MLLR). This is for reference only. -if $do_tri2a; then -if [ $stage -le 5 ]; then - # Train tri2a, which is deltas + delta-deltas, on clean data. - steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a - - # Re-align triphones using clean data. This gives a smallish performance gain. - steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ - data/si_tr data/lang exp/tri2a exp/tri2a_ali +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali - # Train a multi-condition triphone recognizer. - # This uses alignments on *clean* data, which is allowed for REVERB. - # However, we have to use the "cut" version so that the length of the - # waveforms match. - # It is actually asserted by the Challenge that clean and multi-condition waves are aligned. steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc - - # Prepare clean and mc tri2a models for decoding. 
- utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k & - utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k & - wait + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 fi -if [ $stage -le 6 ]; then - # decode REVERB dt using tri2a, clean - for dataset in data/REVERB_*{dt,et}/*; do - steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ - exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & - done - - # decode REVERB dt using tri2a, mc - for dataset in data/REVERB_*{dt,et}/*; do - steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & - done - - # basis fMLLR for tri2a_mc system - # This computes a transform for every training utterance and computes a basis from that. - steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc - - # Recognition using fMLLR adaptation (per-utterance processing). - for dataset in data/REVERB_*{dt,et}/*; do - steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & - done - wait -fi -fi +if [ $stage -le 9 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali -if [ $stage -le 7 ]; then - # Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. - echo "### TRAINING tri2b ###" steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b - - # tri2b (LDA-MLLT system) with multi-condition training, using default parameters. 
- echo "### TRAINING tri2b_mc ###" - steps/train_lda_mllt.sh --cmd "$train_cmd"\ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 fi -# Prepare tri2b* systems for decoding. -if [ $stage -le 8 ]; then - echo "### MAKING GRAPH {tri2b,tri2b_mc}/graph_$lm ###" - for recog in tri2b tri2b_mc; do - utils/mkgraph.sh data/lang_test_$lm exp/$recog exp/$recog/graph_$lm & +if [ $stage -le 10 ]; then + utils/mkgraph.sh data/lang_test_$lm exp/tri2 exp/tri2/graph + for dset in ${test_sets}; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} & done wait fi -# discriminative training on top of multi-condition systems -# one could also add tri2b here to have a DT clean recognizer for reference -if [ $stage -le 9 ]; then - base_recog=tri2b_mc - bmmi_recog=${base_recog}_mmi_b0.1 - echo "### DT $base_recog --> $bmmi_recog ###" - - # get alignments from base recognizer - steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ - --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali - - # get lattices from base recognizer - denlats_dir=${base_recog}_denlats - subsplit=`echo $nj_train \* 2 | bc` - # DT with multi-condition data ... 
- steps/make_denlats.sh --sub-split $subsplit --nj $nj_train --cmd "$decode_cmd" \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali - # boosted MMI training - steps/train_mmi.sh --boost 0.1 --cmd "$train_cmd" \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A \ - data/lang \ - exp/${base_recog}_ali \ - exp/$denlats_dir \ - exp/$bmmi_recog - cp exp/$base_recog/ali.* exp/$bmmi_recog + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 fi -# decoding using various recognizers -if [ $stage -le 10 ]; then - # put tri2b last since it takes longest due to the large mismatch. - for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do - # The graph from the ML directory is used in recipe - recog2=`echo $recog | sed s/_mmi.*//` - graph=exp/$recog2/graph_$lm - - echo "### DECODING with $recog, noadapt, $lm ###" - for dataset in data/REVERB_*{dt,et}/*; do - decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` - steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ - $graph $dataset \ - exp/$recog/decode_$decode_suff & - done - wait - - echo " ## MBR RESCORING with $recog, noadapt ##" - for dataset in data/REVERB_*{dt,et}/*; do - decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` - mkdir -p exp/$recog/decode_mbr_$decode_suff - cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff - local/score_mbr.sh --cmd "$decode_cmd" \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff & - done - wait - - done # loop recog +if [ $stage -le 12 ]; then + utils/mkgraph.sh data/lang_test_$lm exp/tri3 exp/tri3/graph + for dset in ${test_sets}; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} & + done + wait fi -# decoding using various 
recognizers with adaptation -if [ $stage -le 11 ]; then - # put tri2b last since it takes longest due to the large mismatch. - for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do - # The graph from the ML directory is used in recipe - recog2=`echo $recog | sed s/_mmi.*//` - graph=exp/$recog2/graph_$lm - - # set the adaptation data - if [[ "$recog" =~ _mc ]]; then - tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A - else - tr_dataset=si_tr - fi - - echo "### DECODING with $recog, basis_fmllr, $lm ###" - steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/$tr_dataset data/lang exp/$recog - for dataset in data/REVERB_*{dt,et}/*; do - ( - decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` - steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ - $graph $dataset \ - exp/$recog/decode_basis_fmllr_$decode_suff - ) & - done - wait - - echo " ## MBR RESCORING with $recog, basis_fmllr ##" - for dataset in data/REVERB_*{dt,et}/*; do - decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` - mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff - cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff - local/score_mbr.sh --cmd "$decode_cmd" \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff & - done - wait - - done # loop recog +if [ $stage -le 13 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj ${nj} --train-set ${train_set} --test-sets "$test_sets" --gmm tri3 --nnet3-affix _${train_set} \ + --lm-suffix _test_$lm fi -# get all WERs with lmw=15 -if [ $stage -le 12 ]; then +# get all WERs. 
+if [ $stage -le 14 ]; then local/get_results.sh fi diff --git a/tools/extras/install_wpe.sh b/tools/extras/install_wpe.sh new file mode 100755 index 00000000000..4d129fc6db7 --- /dev/null +++ b/tools/extras/install_wpe.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Installs nara-wpe with dependencies +# miniconda should be installed in $HOME/miniconda3/ + +miniconda_dir=$HOME/miniconda3/ + +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run 'tools/extras/install_miniconda.sh'" && exit 1; +fi + +$HOME/miniconda3/bin/python -m pip install soundfile +git clone https://github.com/fgnt/nara_wpe.git +cd nara_wpe +$HOME/miniconda3/bin/python -m pip install --editable .