Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
84 commits
Select commit Hold shift + click to select a range
5fe6cb2
minor change
aarora8 Sep 14, 2018
e1f4530
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into ar1
aarora8 Sep 14, 2018
c3443d2
updating run_end2end for text localization
aarora8 Sep 15, 2018
9c6a923
adding higher language model
aarora8 Sep 15, 2018
2c87fe5
fixing bug
aarora8 Sep 15, 2018
053fbdb
minor fix
aarora8 Sep 15, 2018
837fd4d
adding augmentation
aarora8 Sep 15, 2018
9c1d553
updating parameters
aarora8 Sep 17, 2018
47b6508
updating parameters
aarora8 Sep 17, 2018
18f585e
updating parameters
aarora8 Sep 17, 2018
cf22d16
minor cleaning and higher order language model
aarora8 Sep 17, 2018
13f2386
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into ar2
aarora8 Sep 17, 2018
95aed10
updating results
aarora8 Sep 17, 2018
85e3649
minor fix and adding tuning directory
aarora8 Sep 17, 2018
bff652c
adding overwrite variable
aarora8 Sep 17, 2018
303246e
adding documentation, fixing run.sh, minor fix
aarora8 Sep 17, 2018
6b857de
adding text localization changes
aarora8 Sep 18, 2018
1bd1448
adding gpu = false for alignments in runend2end
aarora8 Sep 18, 2018
895342a
updating text localization routine
aarora8 Sep 18, 2018
a72d922
removing unused function
aarora8 Sep 18, 2018
b2ef923
minor change
aarora8 Sep 18, 2018
b8974aa
adding option for augmentation
aarora8 Sep 18, 2018
04b938c
updating text localization routines
aarora8 Sep 18, 2018
2a35cf7
fixing merge conflict
aarora8 Sep 19, 2018
92a470d
removing unnecessary files
aarora8 Sep 19, 2018
8a9b46a
Merge branch 'ar1' of https://github.com/aarora8/kaldi into ar3
aarora8 Sep 19, 2018
d7092e4
Merge branch 'ar1' of https://github.com/aarora8/kaldi into ar3
aarora8 Sep 19, 2018
9271545
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into ar1
aarora8 Sep 19, 2018
86ea346
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into ar3
aarora8 Sep 19, 2018
e7b7597
adding lm rescoring, cleaning in chain scripts
aarora8 Sep 19, 2018
e647607
minor fix
aarora8 Sep 19, 2018
c1c06d0
Merge branch 'ar3' of https://github.com/aarora8/kaldi into ar1
aarora8 Sep 19, 2018
a0d2b68
removing prepend words
aarora8 Sep 19, 2018
53edde4
minor bug fix
aarora8 Sep 19, 2018
e4f973d
Merge branch 'ar3' of https://github.com/aarora8/kaldi into ar1
aarora8 Sep 19, 2018
e9ae853
fixing run.sh
aarora8 Sep 19, 2018
8d0c793
removing prepare data
aarora8 Sep 19, 2018
ee582d5
fixing run.sh
aarora8 Sep 19, 2018
a16a11d
removing reverse.py
aarora8 Sep 19, 2018
fb0b8a2
removing prepare data
aarora8 Sep 19, 2018
7835ed4
adding augmentation during line image creation, removing unnecessary …
aarora8 Sep 19, 2018
0234a1a
adding chain recipe
aarora8 Sep 19, 2018
a17fbb3
minor fix
aarora8 Sep 19, 2018
59c84f2
bug fix
aarora8 Sep 19, 2018
a23b478
fixing bugs
aarora8 Sep 20, 2018
a3aac1a
fixing bugs
aarora8 Sep 20, 2018
8fc860d
bug fix
aarora8 Sep 20, 2018
cafd89a
fixing bug in subset
aarora8 Sep 20, 2018
87c9241
adding augmentation in text localization
aarora8 Sep 20, 2018
0e74e55
fixing bugs
aarora8 Sep 20, 2018
60915aa
fixing bugs
aarora8 Sep 20, 2018
4099d4a
fixing bugs
aarora8 Sep 20, 2018
4f98f69
fixing bugs
aarora8 Sep 20, 2018
717501f
fixing bugs
aarora8 Sep 21, 2018
56c77c4
fixing bugs
aarora8 Sep 21, 2018
b9d2651
fixing bugs
aarora8 Sep 21, 2018
74f7a82
fixing bugs
aarora8 Sep 21, 2018
7597638
fixing bugs
aarora8 Sep 21, 2018
479590a
fixing run.sh
aarora8 Sep 21, 2018
87ab218
fixing bug in language modelling
aarora8 Sep 21, 2018
d979000
correcting options
aarora8 Sep 21, 2018
ed3ab45
adding comments
aarora8 Sep 21, 2018
fa34b22
merge conflict
aarora8 Sep 21, 2018
22df693
fixing conflict
aarora8 Sep 21, 2018
95b1c3a
updating chain parameters
aarora8 Sep 21, 2018
8e40c2e
Merge branch 'ar3' of https://github.com/aarora8/kaldi into ar2
aarora8 Sep 21, 2018
0b71dae
updating chain parameters
aarora8 Sep 21, 2018
a5d04ec
Merge branch 'ar3' of https://github.com/aarora8/kaldi into ar2
aarora8 Sep 21, 2018
e380a20
updating parameters
aarora8 Sep 21, 2018
639289d
updating parameters
aarora8 Sep 21, 2018
78135bb
Merge branch 'ar3' of https://github.com/aarora8/kaldi into ar2
aarora8 Sep 21, 2018
04e0236
updating parameters
aarora8 Sep 21, 2018
e1efebc
Merge branch 'ar3' of https://github.com/aarora8/kaldi into ar2
aarora8 Sep 21, 2018
9c33a35
fixing bug in make features
aarora8 Sep 22, 2018
d4516ea
Revert "fixing bug in make features"
aarora8 Sep 22, 2018
bac599a
modification from review
aarora8 Oct 9, 2018
9f0259f
Merge branch 'ar1' of https://github.com/aarora8/kaldi into ar2
aarora8 Oct 9, 2018
c0ac631
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into ar2
aarora8 Oct 9, 2018
f0a990e
modification from review, adding new augmentation in make feature
aarora8 Oct 9, 2018
09da981
minor fix
aarora8 Oct 9, 2018
3d9615e
fixing bugs
aarora8 Oct 9, 2018
c33da9f
adding documentation
aarora8 Oct 9, 2018
ee42879
modification from review
aarora8 Oct 9, 2018
405763b
Merge branch 'ar2' of https://github.com/aarora8/kaldi into ar1
aarora8 Oct 9, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 47 additions & 15 deletions egs/cifar/v1/image/ocr/make_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,13 @@
'and right side of the image.')
parser.add_argument('--num-channels', type=int, default=1,
help='Number of color channels')
parser.add_argument('--vertical-shift', type=int, default=0,
help='total number of padding pixel per column')
parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
help="Flip the image left-right for right to left languages")
parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
help="performs image augmentation")
# Selects which single augmentation (if any) the main loop applies to each image.
parser.add_argument('--augment_type', type=str, default='no_aug',
                    choices=['no_aug', 'random_scale', 'random_shift'],
                    help="Augmentation to apply to each image: 'no_aug' for none, "
                         "'random_scale' to use the 'scaled' mode of "
                         "get_scaled_image_aug, or 'random_shift' to use the "
                         "'notmid' mode of vertical_shift.")
args = parser.parse_args()


Expand All @@ -68,7 +71,6 @@ def write_kaldi_matrix(file_handle, matrix, key):
file_handle.write("\n")
file_handle.write(" ]\n")


def horizontal_pad(im, allowed_lengths = None):
if allowed_lengths is None:
left_padding = right_padding = args.padding
Expand Down Expand Up @@ -112,6 +114,33 @@ def get_scaled_image_aug(im, mode='normal'):
return im_scaled_up
return im

def vertical_shift(im, mode='normal'):
    """Pad an image with near-white, slightly noisy rows above and below it.

    The total number of rows added is ``args.vertical_shift``; ``mode``
    decides how they are split between the top and the bottom:

      'normal' - floor(total / 2) rows on top, the remainder at the bottom.
      'top'    - random split with at least floor(total / 2) rows on top.
      'bottom' - random split with at most floor(total / 2) rows on top.
      'notmid' - picks 'top' or 'bottom' at random.

    Args:
        im: 2-D array (rows x columns). The padding is built with shape
            (n, im.shape[1]) and concatenated along axis 0, so an image with
            a channel axis does not fit.
        mode: one of the mode names listed above.

    Returns:
        ``im`` itself when ``args.vertical_shift`` is 0, otherwise a new array
        with ``args.vertical_shift`` additional rows.

    Raises:
        ValueError: if padding is requested and ``mode`` is not recognised.
    """
    total = args.vertical_shift
    if total == 0:
        return im

    if mode == 'notmid':
        mode = 'top' if random.randint(0, 1) == 0 else 'bottom'

    # Integer division: random.randint() needs integer bounds, and
    # ``total / 2`` is a float on Python 3 (non-integral for odd totals).
    half = total // 2
    if mode == 'normal':
        top = half
    elif mode == 'top':  # more padding on top
        top = random.randint(half, total)
    elif mode == 'bottom':  # more padding on bottom
        top = random.randint(0, half)
    else:
        raise ValueError("unknown vertical_shift mode: {0!r}".format(mode))
    bottom = total - top

    width = im.shape[1]

    def noisy_white(rows):
        # White (255) background darkened by integer noise drawn from N(2, 1).
        return (255 * np.ones((rows, width), dtype=int)
                - np.random.normal(2, 1, (rows, width)).astype(int))

    # The top padding is generated before the bottom padding, as before.
    return np.concatenate((noisy_white(top), im, noisy_white(bottom)), axis=0)

### main ###
random.seed(1)
Expand All @@ -134,7 +163,6 @@ def get_scaled_image_aug(im, mode='normal'):

num_fail = 0
num_ok = 0
aug_setting = ['normal', 'scaled']
with open(data_list_path) as f:
for line in f:
line = line.strip()
Expand All @@ -144,21 +172,25 @@ def get_scaled_image_aug(im, mode='normal'):
im = misc.imread(image_path)
if args.fliplr:
im = np.fliplr(im)
if args.augment:
im_aug = get_scaled_image_aug(im, aug_setting[1])
else:
im_aug = get_scaled_image_aug(im, aug_setting[0])
im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
if im_horizontal_padded is None:
# Only 'random_scale' asks get_scaled_image_aug for the 'scaled' variant.
# A membership test is required here: `x == 'a' or 'b'` is always truthy
# because the bare string 'b' is truthy, which made the elif unreachable.
if args.augment_type in ('no_aug', 'random_shift'):
    im = get_scaled_image_aug(im, 'normal')
elif args.augment_type == 'random_scale':
    im = get_scaled_image_aug(im, 'scaled')
im = horizontal_pad(im, allowed_lengths)
if im is None:
num_fail += 1
continue
# Only 'random_shift' asks vertical_shift for the randomised 'notmid' split;
# the other types get the 'normal' split. A membership test is required:
# `x == 'a' or 'b'` is always truthy, which made the elif unreachable.
if args.augment_type in ('no_aug', 'random_scale'):
    im = vertical_shift(im, 'normal')
elif args.augment_type == 'random_shift':
    im = vertical_shift(im, 'notmid')
if args.num_channels == 1:
data = np.transpose(im_horizontal_padded, (1, 0))
data = np.transpose(im, (1, 0))
elif args.num_channels == 3:
H = im_horizontal_padded.shape[0]
W = im_horizontal_padded.shape[1]
C = im_horizontal_padded.shape[2]
data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C))
H = im.shape[0]
W = im.shape[1]
C = im.shape[2]
data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C))
data = np.divide(data, 255.0)
num_ok += 1
write_kaldi_matrix(out_fh, data, image_id)
Expand Down
14 changes: 14 additions & 0 deletions egs/madcat_ar/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,27 @@ for x in $*; do
done
echo

# Rescored WER row: for each directory given on the command line, print the
# second field of its decode_test_rescored best_wer file in a 10-wide column.
echo -n "# WER (rescored) "
for x in "$@"; do
  # Quoted so a directory name containing spaces or glob characters stays intact.
  wer=$(awk '{print $2}' "$x/decode_test_rescored/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

# CER row: for each directory given on the command line, print the second
# field of its decode_test best_cer file in a 10-wide column.
echo -n "# CER "
for x in "$@"; do
  # Quoted so a directory name containing spaces or glob characters stays intact.
  cer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

# Rescored CER row: for each directory given on the command line, print the
# second field of its decode_test_rescored best_cer file in a 10-wide column.
echo -n "# CER (rescored) "
for x in "$@"; do
  # Quoted so a directory name containing spaces or glob characters stays intact.
  cer=$(awk '{print $2}' "$x/decode_test_rescored/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

if $used_epochs; then
exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi
Expand Down
30 changes: 13 additions & 17 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,16 @@ reporting_email=
# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -168,13 +166,13 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.alignment-subsampling-factor=$frame_subsampling_factor \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know how much --ngram-order=2 --no-prune-ngram-order=1 alone helps? I'm just curious.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, I ran run_e2e_cnn_1a.sh once with --ngram-order=2 --no-prune-ngram-order=1 and once with --num-extra-lm-states=500, but the results were essentially the same for MADCAT Arabic (7.81 vs 7.82 WER). It was more helpful in the Tamil OCR setup, where it gave an absolute WER improvement of around 0.5%.

--chain.frame-subsampling-factor=4 \
--chain.alignment-subsampling-factor=4 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=1000000 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
Expand All @@ -183,10 +181,6 @@ if [ $stage -le 5 ]; then
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -207,18 +201,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
29 changes: 12 additions & 17 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,15 @@ lats_affix=
# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -170,13 +167,13 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=4 \
--chain.alignment-subsampling-factor=1 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=1000000 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
Expand All @@ -185,10 +182,6 @@ if [ $stage -le 5 ]; then
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -209,18 +202,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
31 changes: 13 additions & 18 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,14 @@ reporting_email=
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=true
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -171,28 +168,24 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.alignment-subsampling-factor=1 \
--chain.left-tolerance 3 \
--chain.right-tolerance 3 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=2 \
--trainer.frames-per-iter=1000000 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
--trainer.optimization.final-effective-lrate=0.0001 \
--trainer.optimization.shrink-value=1.0 \
--trainer.num-chunk-per-minibatch=96,64 \
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--trainer.add-option="--optimization.memory-compression-level=2" \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -213,18 +206,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
Loading