Merged

275 commits
c6ffb15
Merge branch 'master' into semi_supervised
hhadian Jun 3, 2017
7b01bb0
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 3, 2017
403e3e2
Add nnet3, chain, and semi_sepervised scripts for fisher english
hhadian Jun 6, 2017
0c8974e
Merge remote-tracking branch 'origin/semi_supervised' into semi_super…
hhadian Jun 6, 2017
e1de4e4
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 8, 2017
41952cd
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 9, 2017
2e2b3d1
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 11, 2017
51c32f7
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 14, 2017
232397e
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 15, 2017
1414f6f
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 16, 2017
c65ef65
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 17, 2017
9677175
Merge branch 'master' of github.com:vimalmanohar/kaldi into chain-smbr
vimalmanohar Jun 20, 2017
20cf238
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 20, 2017
ae1cfe1
Merge branch 'master' of github.com:vimalmanohar/kaldi into chain-smbr
vimalmanohar Jun 21, 2017
bf56938
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 21, 2017
0bacc83
SMBR chain
vimalmanohar Jun 22, 2017
2c43456
chain-smbr: Bug fixes
vimalmanohar Jun 22, 2017
6adc948
Chain SMBR fixes
vimalmanohar Jun 22, 2017
2959279
chain-smbr: Bug fixes
vimalmanohar Jun 22, 2017
51ec051
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 22, 2017
758e9a4
chain-smbr: Bug fix
vimalmanohar Jun 22, 2017
d364040
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 23, 2017
2f15292
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 24, 2017
d8db02d
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 25, 2017
9d97243
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 27, 2017
57d1016
temp
vimalmanohar Jun 22, 2017
a03b401
smbr-dash
vimalmanohar Jun 22, 2017
0682618
smbr without leaky
vimalmanohar Jun 24, 2017
62da39a
chain-smbr: Fix bugs in chain smbr
vimalmanohar Jun 27, 2017
5b7879d
smbr training
vimalmanohar Jun 27, 2017
378267b
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 28, 2017
a973632
Adding missing chain-smbr-kernels.cu
vimalmanohar Jun 29, 2017
e7d9d52
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 29, 2017
55d3321
Add phone-insertion-penalty + minor updates
hhadian Jun 29, 2017
0a19c27
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jun 30, 2017
f776b3a
Minor bug fixes
vimalmanohar Jun 30, 2017
d1b872c
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 1, 2017
c11756d
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 4, 2017
8fd9f19
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 7, 2017
f37c374
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 8, 2017
a89d02d
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 9, 2017
4c86384
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 10, 2017
774d78e
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 12, 2017
845f27b
chain-smbr: Adding smbr
vimalmanohar Jul 12, 2017
545154a
added scripts for new weight transfer method for transferring all lay…
pegahgh Jul 14, 2017
5248c1a
merged with master
pegahgh Jul 14, 2017
40c85dc
updated PR w.r.t comments.
pegahgh Jul 14, 2017
39a731f
small fix to parser.py.
pegahgh Jul 14, 2017
970842e
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 15, 2017
c1996ff
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 17, 2017
72480ec
fixed issues w.r.t. comments (except prepare_wsj_rm_lang.sh).
pegahgh Jul 17, 2017
7559d3a
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 18, 2017
e0d43a6
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 20, 2017
4a217ea
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Jul 22, 2017
48d8161
chain: Fixes for silence
vimalmanohar Jul 23, 2017
9fedda9
chain: Updating chain script
vimalmanohar Jul 23, 2017
de34ec4
Merging masterR
vimalmanohar Jul 23, 2017
e51826a
fixed small issue with language-model.*.
pegahgh Jul 29, 2017
d64d017
semisup: Updating semisupervised scripts
vimalmanohar Aug 4, 2017
0a6b824
added new Xconfig layer to parse existing model and modified run_tdnn…
pegahgh Aug 6, 2017
e830a04
modified scripts to accept --trainer.input-model and prepare *.fst ou…
pegahgh Aug 9, 2017
49bcf2e
removed changes to language-model.* and generated weighted phone lm u…
pegahgh Aug 10, 2017
d25e63a
optimized alignment processing stage in weighted phone lm generation.
pegahgh Aug 10, 2017
f2d01ae
added check to have possitive int as phone lm weights.
pegahgh Aug 10, 2017
293c531
fixed small issue with train_dnn.py.
pegahgh Aug 10, 2017
2462cf5
merged with kaldi/master.
pegahgh Aug 10, 2017
5b510f9
fixed some issues.
pegahgh Aug 11, 2017
ac95720
fixed some issues.
pegahgh Aug 15, 2017
ed8b952
fixed some comments and removed some options.
pegahgh Aug 17, 2017
b92a63a
semisup: Adding some extra script for semi-supervised recipes
vimalmanohar Aug 17, 2017
7a9ef54
fixed src dirs options for transfer learning scripts 1{a,b,c} and mod…
pegahgh Aug 17, 2017
4d8ec90
semisup: Merging from master
vimalmanohar Aug 18, 2017
775b34d
minor change to prepare for tf learning
vimalmanohar Aug 23, 2017
a2d5e62
semisup: Merging transfer learning
vimalmanohar Aug 23, 2017
e0fd23e
semisup: Separate tolerance for silence
vimalmanohar Aug 23, 2017
405af6c
Merge branch 'chain-smbr' of github.com:vimalmanohar/kaldi into semis…
vimalmanohar Aug 23, 2017
89e574b
modified comments in xconfig and train.py and modified scripts to gen…
pegahgh Aug 24, 2017
eb00983
small fix.
pegahgh Aug 24, 2017
ef7275b
fixed old comments and added new comments.
pegahgh Aug 24, 2017
82fa510
fixed some issues in python codes using pylint package.
pegahgh Aug 24, 2017
40dc5e4
smbr: Fix aux objf
vimalmanohar Aug 24, 2017
bd20bdf
semisup: Merge chain-smbr
vimalmanohar Aug 24, 2017
1a74866
semisup: Merge chain-smbr
vimalmanohar Aug 24, 2017
a856dea
Update parser.py
pegahgh Aug 26, 2017
55a64ff
Update run_tdnn_wsj_rm_1c.sh
pegahgh Aug 30, 2017
c2593d8
Update basic_layers.py
pegahgh Aug 30, 2017
26b4ddd
Update parser.py
pegahgh Aug 30, 2017
90fc04a
chain: objective function fixes
vimalmanohar Sep 1, 2017
d811e15
semisup: Minor fixes to chain semisup
vimalmanohar Sep 1, 2017
af050b6
semisup: Add more recipes
vimalmanohar Sep 1, 2017
80db322
[egs] Fix default for egs.cmd
vimalmanohar Sep 1, 2017
ea3f34a
semisup-clean: Removing some recipes
vimalmanohar Sep 1, 2017
3c7780d
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Sep 2, 2017
82daf84
Update xconfig_to_configs.py
vimalmanohar Sep 2, 2017
f6bea67
semisup: Merging transfer learning
vimalmanohar Sep 2, 2017
24bd794
semisup: Merging from transfer-learning
vimalmanohar Sep 2, 2017
417b3cf
semisup: Merging from transfer-learning
vimalmanohar Sep 2, 2017
9c45f34
semisup: Removing some recipes
vimalmanohar Sep 2, 2017
f88a115
Merge pull request #12 from vimalmanohar/patch-4
pegahgh Sep 2, 2017
ed63b19
Update make_weighted_den_fst.sh
vimalmanohar Sep 3, 2017
43d1fe2
Merge pull request #13 from vimalmanohar/patch-5
pegahgh Sep 4, 2017
d397d3f
semisup: Removing unrequired codes
vimalmanohar Sep 4, 2017
2de6266
semisup: Removing more unrequired codes
vimalmanohar Sep 4, 2017
e025ee2
semisup: Remove build_tree_from_lats
vimalmanohar Sep 4, 2017
6fecd2b
Remove unrelated codes
vimalmanohar Sep 4, 2017
f6f4e29
semisup: Cleaning up scripts not used
vimalmanohar Sep 4, 2017
125abf0
fixed small issues.
pegahgh Sep 6, 2017
f51492b
fixed small issue.
pegahgh Sep 6, 2017
ba308ea
modified make_weighted_den_fst.sh
pegahgh Sep 10, 2017
8fae871
modified weighted_den_fst.sh
pegahgh Sep 10, 2017
6f5e8eb
fixed some issues.
pegahgh Sep 12, 2017
3985924
fixed some small issues.
pegahgh Sep 12, 2017
17bb56f
Merge branch 'master' into transfer-learning-wsj-rm
danpovey Sep 13, 2017
fe07c0b
[scripts] Cosmetic and other improvements to make_weighted_den_fst.sh…
danpovey Sep 13, 2017
b5ce647
smbr: Logging bug fix
vimalmanohar Sep 13, 2017
967531d
semisup: Extend trivial output layer
vimalmanohar Sep 13, 2017
e5e57ee
temp fix
vimalmanohar Sep 13, 2017
9ff681a
Merging from transfer learning
vimalmanohar Sep 13, 2017
a34655c
Merge branch 'transfer_learning' of github.com:danpovey/kaldi into se…
vimalmanohar Sep 13, 2017
088aad3
semisup: Merging the finalized transfer-learning
vimalmanohar Sep 15, 2017
d61cb4b
semisup: Adding lattice splitting chain code
vimalmanohar Sep 25, 2017
8772dba
semisup: Adding tolerances to lattices
vimalmanohar Oct 3, 2017
339c435
Old tolerance approach
vimalmanohar Oct 11, 2017
e90ca23
semisup: adding mbr supervision
vimalmanohar Oct 16, 2017
ea6ed69
semisup: Adding semisup recipes
vimalmanohar Oct 16, 2017
bacca8b
Minor bug fix in get_egs.sh
vimalmanohar Oct 17, 2017
417ecfd
Best path system recipe
vimalmanohar Oct 17, 2017
6f0de80
Add some minor check
vimalmanohar Oct 18, 2017
c6aa0e4
Updates to work with RNNLM
vimalmanohar Oct 19, 2017
c22bd48
Fix tolerance fst
vimalmanohar Oct 20, 2017
0d8af58
Minor fix to _m
Oct 20, 2017
f0c9fe1
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
Oct 20, 2017
5bfdd39
Tolerance fst fixed
vimalmanohar Oct 22, 2017
37cafe8
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
Oct 22, 2017
479e769
semisup: Fixing some bugs and making cleaner scripts
Oct 27, 2017
a3c3703
minor changes
vimalmanohar Oct 27, 2017
90e88ba
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
vimalmanohar Oct 27, 2017
bf10730
semisup: Changes to get_egs
Oct 27, 2017
18093ae
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
vimalmanohar Oct 27, 2017
0bbd2ce
semisup: Adding 100k experiments
Oct 29, 2017
99b8fc1
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
vimalmanohar Oct 29, 2017
f392d74
Changed permissions
vimalmanohar Oct 30, 2017
ebe5e8d
[egs] Bug fix in train_raw_dnn.py
vimalmanohar Sep 26, 2017
fbedee0
steps/cleanup: Fixed corner case in resolve_ctm_edits_overlaps.py
vimalmanohar Nov 1, 2017
fe7d835
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Nov 2, 2017
05ba2d9
Binaries for undeterminized lattices
Nov 2, 2017
fcefeaa
semisup: Adding tfrnnlm scripts
vimalmanohar Nov 2, 2017
ada93ca
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Nov 3, 2017
a40461c
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Nov 4, 2017
3be8143
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Nov 5, 2017
a0572b5
semisup: Undeterminized lattices recipes
Nov 6, 2017
8a035ab
semisup-smbr: Bug fix in 15k_s
vimalmanohar Nov 6, 2017
155b90a
Undo _s changes
vimalmanohar Nov 6, 2017
34f780a
semisup-smbr: Adding undeterminized version of rescoring
Nov 6, 2017
62b0f3b
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
vimalmanohar Nov 6, 2017
0651075
semisup-smbr: Fix undeterminized lattice rescoring
Nov 6, 2017
35afc06
Merge branch 'semisup-smbr' of github.com:vimalmanohar/kaldi into sem…
vimalmanohar Nov 6, 2017
eadc843
semisup: 50 hours recipe
vimalmanohar Nov 12, 2017
c5acc17
semisup: Pocolm for fisher english
vimalmanohar Nov 14, 2017
f71741a
semisup: Fix lattice rescoring
Nov 17, 2017
5103952
semisup: Code changes for undeterminized lattices
Nov 17, 2017
fc472c3
semisup: Adding more recipes
Nov 17, 2017
010bc4e
semisup: Unk model on Fisher
vimalmanohar Nov 17, 2017
d43125b
semisup: Bug fix in ivectors in semi-supervised scenario
vimalmanohar Nov 17, 2017
82efedb
semisup: Minor fixes to scripts
vimalmanohar Nov 20, 2017
df9f480
semisup-clean: Temporary merge
vimalmanohar Nov 20, 2017
b1805bf
semisup-clean: Merging semisup-smbr
vimalmanohar Nov 20, 2017
65fbcd7
semisup-clean: Removing some unused scripts
vimalmanohar Nov 21, 2017
5cda53b
semisup-clean: Removing experimental scripts
vimalmanohar Nov 21, 2017
bd2b2d7
semisup-clean: Removing smart splitting code
vimalmanohar Nov 21, 2017
2380264
semisup-clean: Remove support for non-compact lattices
vimalmanohar Nov 21, 2017
995bf24
semisup-clean: Remove smart splitting recipes
vimalmanohar Nov 21, 2017
5ca1012
semisup-clean: Removing experimental codes and cleanup
vimalmanohar Nov 21, 2017
ceed512
semisup-clean: Remove no-chunking stuff
vimalmanohar Nov 21, 2017
7b0c1a5
semisup-clean: UNK model fisher
vimalmanohar Nov 21, 2017
69495d0
Merge branch 'master' of github.com:vimalmanohar/kaldi into semisup-c…
vimalmanohar Nov 21, 2017
13d78fc
semisup-clean: Keep only the important recipes 100 hours sup
vimalmanohar Nov 21, 2017
e0ff557
semisup-clean: Add write-compact=false option
vimalmanohar Nov 23, 2017
2237087
semisup: Removing some unnecessary parts
vimalmanohar Nov 28, 2017
e762533
semisup-smbr: Minor updates
vimalmanohar Nov 28, 2017
d5c7edf
semisup-clean: Removing some tuning scripts
vimalmanohar Nov 28, 2017
a627cd7
semisup-clean: Remove rnnlm stuff for now
vimalmanohar Nov 28, 2017
17f9165
semisup: Removing some old modifications
vimalmanohar Nov 28, 2017
e3b7d72
semisup-smbr: Re-organizing stuff
vimalmanohar Nov 28, 2017
76cc0a0
semisup-smbr: Adding more recipes
vimalmanohar Nov 28, 2017
47ab45a
semisup-smbr: Add stages to scoring scripts
vimalmanohar Nov 28, 2017
c4488ba
semisup-clean: Merging latest changes
vimalmanohar Nov 28, 2017
fe72721
semisup-clean: Keep only changes to be committed now
vimalmanohar Nov 29, 2017
1dc7e27
semisup-clean: Remove smart splitting recipes
vimalmanohar Dec 1, 2017
9ba5c34
semisup-clean: Remove and cleanup some recipe
vimalmanohar Dec 1, 2017
b99764c
semisup-clean: cleaned up ivector extractor script
vimalmanohar Dec 1, 2017
37bb897
semisup: unk model script
vimalmanohar Dec 1, 2017
ec15e64
temp changes
vimalmanohar Dec 1, 2017
b3e1142
semisup-clean: Merging latest changes semisup
vimalmanohar Dec 1, 2017
c3e32f1
semisup-clean: Cleaning up recipes
vimalmanohar Dec 3, 2017
75fbde4
Merge branch 'master' of github.com:kaldi-asr/kaldi
vimalmanohar Dec 8, 2017
e62dac0
semisup: Making changes based on comments
vimalmanohar Dec 8, 2017
ac5da45
Merging from kaldi master
vimalmanohar Dec 8, 2017
5d1f4c9
semisup: Minor fixes
vimalmanohar Dec 11, 2017
f3fd4a9
semisup: Minor fixes
vimalmanohar Dec 11, 2017
0a69689
semisup: Re-organizing some scripts
vimalmanohar Dec 11, 2017
cce099f
Reverting discriminative changes for now
vimalmanohar Dec 28, 2017
ed5efd6
Revert some changes not required now
vimalmanohar Dec 28, 2017
5cafdc5
Merging from golden
vimalmanohar Dec 28, 2017
7a1ff5c
Minor fix
vimalmanohar Dec 28, 2017
e2c6603
Removingfew files from the PR
vimalmanohar Dec 29, 2017
2c06bf5
Clean the recipe
vimalmanohar Jan 8, 2018
9933ea7
Added some checks
vimalmanohar Jan 9, 2018
db0bc54
Remove truncate-deriv-weights
vimalmanohar Jan 9, 2018
c71cf88
Remove some unused binaries in chainbin get-egs
vimalmanohar Jan 9, 2018
6d8350e
Remove mkgraph.sh changes
vimalmanohar Jan 9, 2018
ef4750a
Merge branch 'master' of github.com:kaldi-asr/kaldi into semisup-clean
vimalmanohar Jan 9, 2018
0ee0075
Remove some tuning scripts
vimalmanohar Jan 9, 2018
4908983
Add recipe for build tree multiple sources
vimalmanohar Jan 9, 2018
85780b1
Remove some lattice function changes
vimalmanohar Jan 9, 2018
842dce9
Rename some scripts
vimalmanohar Jan 9, 2018
926dc3a
semisup: Reduce the number of scripts
vimalmanohar Jan 10, 2018
cf2e307
semisup: Revert changes to path.sh
vimalmanohar Jan 10, 2018
1acab6f
Fix some bugs and missing functions
vimalmanohar Jan 20, 2018
63a46b5
Minor changes
vimalmanohar Jan 23, 2018
7ceb06c
Merge branch 'master' of github.com:kaldi-asr/kaldi into semisup-clean
vimalmanohar Jan 23, 2018
fcb2572
Bug fixes in latbin
vimalmanohar Jan 23, 2018
91de50a
semisup: Adding comment about pocolm
vimalmanohar Jan 29, 2018
1cdcc17
Fix few issues based on comments
vimalmanohar Jan 30, 2018
c1bc78c
Add some documentation
vimalmanohar Jan 31, 2018
f3d4dde
Minor update
vimalmanohar Feb 2, 2018
31f38fa
Simplifying based on comments
vimalmanohar Feb 9, 2018
f9921b3
Cleaning up some script and adding comments
vimalmanohar Feb 9, 2018
3411ec7
Minor fix
vimalmanohar Feb 12, 2018
c0e75e9
Make allocate_multilingual_examples.py simpler
hhadian Feb 13, 2018
64573f6
Adding results to scripts
vimalmanohar Feb 19, 2018
af09cdc
Merging from kaldi master
vimalmanohar Feb 19, 2018
60de6eb
Adding some comments
vimalmanohar Feb 20, 2018
a3203d9
Minor fix
vimalmanohar Feb 20, 2018
88fd7e8
Minor bug fixes
vimalmanohar Feb 21, 2018
41a46c0
chain-semisup: Removing 15 hr recipe and adding more comments
vimalmanohar Feb 22, 2018
ebd1fec
Some cleaning
hhadian Feb 24, 2018
ac05137
Add --block-size option + minor fixes
hhadian Feb 24, 2018
c245e3f
Minor fixs
hhadian Feb 24, 2018
aaaf678
Some fixes in the comments
hhadian Feb 25, 2018
7a39bdb
Make block-size fixed
hhadian Feb 28, 2018
abae1a9
Small change for merging
vimalmanohar Mar 1, 2018
2bae581
semisup: Fixing based on comments
vimalmanohar Mar 1, 2018
a455f03
Show some info + warning + flush all the remaining partial blocks to …
hhadian Mar 2, 2018
17a703f
Some changes based on the comments
vimalmanohar Mar 4, 2018
2bbfd07
Merging new multilingual script
vimalmanohar Mar 4, 2018
6fefecb
Various bug fixes
vimalmanohar Mar 14, 2018
0460f06
Fixed few bugs and tested
vimalmanohar Mar 26, 2018
812b8c8
Merging kaldi master
vimalmanohar Mar 26, 2018
c729b4c
Fixed minor issues
vimalmanohar Mar 28, 2018
@@ -247,7 +247,6 @@ if [ $stage -le 10 ] && [ ! -z $megs_dir ]; then
common_egs_dir="${multi_egs_dirs[@]} $megs_dir"
steps/nnet3/multilingual/combine_egs.sh $egs_opts \
--cmd "$decode_cmd" \
--samples-per-iter 400000 \
$num_langs ${common_egs_dir[@]} || exit 1;
fi
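The hunk above removes the hard-coded --samples-per-iter 400000 from the combine_egs.sh invocation, so the value is now expected to come from the caller. A minimal sketch, assuming the caller assembles $egs_opts itself (the value shown is illustrative, not mandated by this PR):

# illustrative only: pass the former default through egs_opts instead
egs_opts="$egs_opts --samples-per-iter 400000"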

27 changes: 14 additions & 13 deletions egs/fisher_english/s5/local/fisher_create_test_lang.sh
@@ -1,23 +1,25 @@
#!/bin/bash
#

if [ -f path.sh ]; then . ./path.sh; fi

mkdir -p data/lang_test
# This script formats an ARPA-format LM into G.fst.

arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
dir=data/lang_test

if [ -f ./path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh

[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;

mkdir -p data/lang_test
cp -r data/lang/* data/lang_test
mkdir -p $dir
cp -r data/lang/* $dir

gunzip -c "$arpa_lm" | \
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
--read-symbol-table=$dir/words.txt - $dir/G.fst


echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
fstisstochastic $dir/G.fst

## Check lexicon.
## just have a look and make sure it seems sane.
@@ -27,22 +29,21 @@ fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/l
echo Performing further checks

# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
fstdeterminize $dir/G.fst /dev/null || echo Error determinizing G.

# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
fstdeterminize $dir/L_disambig.fst /dev/null || echo Error determinizing L.

# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably related to a bug
# in this version of OpenFst that makes determinization slow for
# some cases).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fsttablecompose $dir/L_disambig.fst $dir/G.fst | \
fstdeterminizestar >/dev/null || echo Error

# Checking that LG is stochastic:
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
fsttablecompose data/lang/L_disambig.fst $dir/G.fst | \
fstisstochastic || echo "[log:] LG is not stochastic"


echo "$0 succeeded"
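The edits above parameterize fisher_create_test_lang.sh: the ARPA LM path and the output lang directory become shell variables that utils/parse_options.sh exposes as command-line options. A hedged usage sketch; the option names follow the standard parse_options.sh mapping of --arpa-lm and --dir onto $arpa_lm and $dir, and the pocolm path and lang_test_poco directory are illustrative:

# default behaviour, unchanged from before this PR
local/fisher_create_test_lang.sh

# illustrative: build G.fst from a different ARPA LM into a separate directory
local/fisher_create_test_lang.sh \
  --arpa-lm data/local/pocolm/data/arpa/4gram_small.arpa.gz \
  --dir data/lang_test_poco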
160 changes: 160 additions & 0 deletions egs/fisher_english/s5/local/fisher_train_lms_pocolm.sh
@@ -0,0 +1,160 @@
#!/bin/bash

# Copyright 2016 Vincent Nguyen
# 2016 Johns Hopkins University (author: Daniel Povey)
# 2017 Vimal Manohar
# Apache 2.0
#
# This script trains LMs using the pocolm toolkit.
# We use limit-unk-history=true, which truncates the history to the left of an OOV word.
# This ensures the graph stays compact when a phone LM is used to model OOV words.
# See the script local/run_unk_model.sh.

set -e
stage=0

text=data/train/text
lexicon=data/local/dict/lexicon.txt
dir=data/local/pocolm

num_ngrams_large=5000000
num_ngrams_small=2500000

echo "$0 $@" # Print the command line for logging
. utils/parse_options.sh || exit 1;

lm_dir=${dir}/data

mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
( # First make sure the pocolm toolkit is installed.
cd $KALDI_ROOT/tools || exit 1;
if [ -d pocolm ]; then
echo Not installing the pocolm toolkit since it is already there.
else
echo "$0: Please install the PocoLM toolkit with: "
echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
exit 1;
fi
) || exit 1;

for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done

num_dev_sentences=10000

if [ $stage -le 0 ]; then
mkdir -p ${dir}/data
mkdir -p ${dir}/data/text

echo "$0: Getting the Data sources"

rm ${dir}/data/text/* 2>/dev/null || true

cleantext=$dir/text_all.gz

cut -d ' ' -f 2- $text | awk -v lex=$lexicon '
BEGIN{
while((getline<lex) >0) { seen[$1]=1; }
}
{
for(n=1; n<=NF;n++) {
if (seen[$n]) {
printf("%s ", $n);
} else {
printf("<unk> ");
}
}
printf("\n");
}' | gzip -c > $cleantext || exit 1;

# This is for reporting perplexities
gunzip -c $dir/text_all.gz | head -n $num_dev_sentences > \
${dir}/data/test.txt

# use a subset of the annotated training data as the dev set.
# Note: the name 'dev' is treated specially by pocolm, it automatically
# becomes the dev set.
gunzip -c $dir/text_all.gz | tail -n +$[num_dev_sentences+1] | \
head -n $num_dev_sentences > ${dir}/data/text/dev.txt

gunzip -c $dir/text_all.gz | tail -n +$[2*num_dev_sentences+1] > \
${dir}/data/text/train.txt

# for reporting perplexities, we'll use the "real" dev set.
# (a subset of the training data is used as ${dir}/data/text/dev.txt to work
# out interpolation weights.)
# note, we can't put it in ${dir}/data/text/, because then pocolm would use
# it as one of the data sources.
cat data/dev/text data/test/text | cut -d " " -f 2- > ${dir}/data/real_dev_set.txt

cat $lexicon | awk '{print $1}' | sort | uniq | awk '
{
if ($1 == "<s>") {
print "<s> is in the vocabulary!" | "cat 1>&2"
exit 1;
}
if ($1 == "</s>") {
print "</s> is in the vocabulary!" | "cat 1>&2"
exit 1;
}
printf("%s\n", $1);
}' > $dir/data/wordlist || exit 1;
fi

order=4
wordlist=${dir}/data/wordlist

lm_name="`basename ${wordlist}`_${order}"
min_counts='train=1'
if [ -n "${min_counts}" ]; then
lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
fi

unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm

if [ $stage -le 1 ]; then
# decide on the vocabulary.
# Note: you'd use --wordlist if you had a previously determined word-list
# that you wanted to use.
# Note: train_lm.py has further options for cases where you train more than
# one order, want to limit the vocabulary size, or need to restrict the
# memory used by 'sort'.
echo "$0: training the unpruned LM"
train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \
--limit-unk-history=true \
--fold-dev-into=train ${bypass_metaparam_optim_opt} \
--min-counts="${min_counts}" \
${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}

get_data_prob.py ${dir}/data/test.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_test.log

get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' | tee ${unpruned_lm_dir}/perplexity_real_dev_set.log
fi

if [ $stage -le 2 ]; then
echo "$0: pruning the LM (to larger size)"
# Using 5 million n-grams for a big LM for rescoring purposes.
prune_lm_dir.py --target-num-ngrams=$num_ngrams_large --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big

get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_test.log

get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_big/perplexity_real_dev_set.log

mkdir -p ${dir}/data/arpa
format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
fi

if [ $stage -le 3 ]; then
echo "$0: pruning the LM (to smaller size)"
# Using 2.5 million n-grams for a smaller LM for graph building.
# Prune from the bigger pruned LM; it'll be faster.
prune_lm_dir.py --target-num-ngrams=$num_ngrams_small ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small

get_data_prob.py ${dir}/data/test.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_test.log

get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' | tee ${dir}/data/lm_${order}_prune_small/perplexity_real_dev_set.log

format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
fi
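With the defaults above (order=4, dir=data/local/pocolm), the new script leaves its pruned ARPA LMs under data/local/pocolm/data/arpa/. A minimal sketch of how the two scripts touched in this diff could be chained; the lang directory name is illustrative and the option names rely on the usual parse_options.sh convention:

# train the pocolm LMs (writes 4gram_big.arpa.gz and 4gram_small.arpa.gz)
local/fisher_train_lms_pocolm.sh --text data/train/text \
  --lexicon data/local/dict/lexicon.txt --dir data/local/pocolm

# build G.fst from the smaller, graph-building LM
local/fisher_create_test_lang.sh \
  --arpa-lm data/local/pocolm/data/arpa/4gram_small.arpa.gz \
  --dir data/lang_test_poco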