diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py index 7ab75498277..07f3cb12257 100755 --- a/egs/cifar/v1/image/ocr/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -43,6 +43,8 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') +parser.add_argument('--num-channels', type=int, default=1, + help='Number of color channels') parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, help="Flip the image left-right for right to left languages") parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, @@ -84,9 +86,9 @@ def horizontal_pad(im, allowed_lengths = None): left_padding = int(padding // 2) right_padding = padding - left_padding dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding, args.num_channels), dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding, args.num_channels), dtype=int)), axis=1) return im_pad1 @@ -150,7 +152,13 @@ def get_scaled_image_aug(im, mode='normal'): if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_horizontal_padded, (1, 0)) + if args.num_channels == 1: + data = np.transpose(im_horizontal_padded, (1, 0)) + elif args.num_channels == 3: + H = im_horizontal_padded.shape[0] + W = im_horizontal_padded.shape[1] + C = im_horizontal_padded.shape[2] + data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/yomdle_fa/README.txt b/egs/yomdle_fa/README.txt new file mode 100644 index 00000000000..984ffdb53b5 --- /dev/null +++ b/egs/yomdle_fa/README.txt @@ -0,0 +1,3 @@ +This 
directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various newswires (e.g. Hamshahri) diff --git a/egs/yomdle_fa/v1/cmd.sh b/egs/yomdle_fa/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/yomdle_fa/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/yomdle_fa/v1/image b/egs/yomdle_fa/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_fa/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_fa/v1/local/augment_data.sh b/egs/yomdle_fa/v1/local/augment_data.sh new file mode 100755 index 00000000000..34e938db069 --- /dev/null +++ b/egs/yomdle_fa/v1/local/augment_data.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr $fliplr --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/yomdle_fa/v1/local/bidi.py b/egs/yomdle_fa/v1/local/bidi.py new file mode 100755 index 00000000000..447313a5d02 --- /dev/null +++ b/egs/yomdle_fa/v1/local/bidi.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +# This script is largely written by Stephen Rawls +# and uses the python package https://pypi.org/project/PyICU_BiDi/ +# The code leaves right to left text alone and reverses left to right text. 
import icu_bidi
import io
import sys
import unicodedata

# Every code point whose Unicode bidirectional class is strongly
# right-to-left: R = strong right-to-left, AL = strong Arabic right-to-left.
rtl_set = {
    chr(code_point)
    for code_point in range(sys.maxunicode)
    if unicodedata.bidirectional(chr(code_point)) in ['R', 'AL']
}


def determine_text_direction(text):
    """Return the paragraph level to use for ``text``.

    The text is treated as right-to-left as soon as it contains a single
    character from ``rtl_set``; otherwise it is treated as left-to-right.
    """
    if any(ch in rtl_set for ch in text):
        return icu_bidi.UBiDiLevel.UBIDI_RTL
    # No strongly right-to-left character was found.
    return icu_bidi.UBiDiLevel.UBIDI_LTR


def utf8_visual_to_logical(text):
    """Reorder ``text`` with the inverse Bidi algorithm and return the result."""
    paragraph_level = determine_text_direction(text)

    converter = icu_bidi.Bidi()
    converter.inverse = True
    converter.reordering_mode = \
        icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT
    # Alternative option that was considered: UBIDI_OPTION_INSERT_MARKS
    converter.reordering_options = \
        icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT

    converter.set_para(text, paragraph_level, None)

    write_options = (0
                     | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING
                     | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
    return converter.get_reordered(write_options)


def utf8_logical_to_visual(text):
    """Reorder ``text`` with the default Bidi reordering mode and return it."""
    paragraph_level = determine_text_direction(text)

    converter = icu_bidi.Bidi()
    converter.reordering_mode = \
        icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT
    # Alternative option that was considered: UBIDI_OPTION_INSERT_MARKS
    converter.reordering_options = \
        icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT

    converter.set_para(text, paragraph_level, None)

    write_options = (0
                     | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING
                     | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
    return converter.get_reordered(write_options)


##main##
# Re-wrap the standard streams so that both sides are decoded/encoded as UTF-8
# regardless of the locale.
sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
for line in sys.stdin:
    # Reorder each stripped input line, then write it out character-reversed.
    line = utf8_logical_to_visual(line.strip())[::-1]
    sys.stdout.write(line + '\n')
#!/bin/bash

# This script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
#
# For every experiment directory given on the command line it prints one
# column containing the best WER/CER of decode_test and the final
# train/valid objective values read from the training logs.

# Copyright 2017  Chun Chieh Chang
#           2017  Ashish Arora

if [ $# -eq 0 ]; then
  echo "Usage: $0: <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi

echo "# $0 $*"
used_epochs=false

# Print a row label left-justified in a fixed-width field so that the
# 10-character value columns printed after it line up across rows.
print_label() {
  printf "%-28s" "$1"
}

print_label "# System"
for x in "$@"; do
  printf "%10s" " $(basename "$x")"
done
echo

print_label "# WER"
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_wer")
  printf "%10s" "$wer"
done
echo

print_label "# CER"
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_cer")
  printf "%10s" "$cer"
done
echo


if $used_epochs; then
  exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

print_label "# Final train prob"
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "%10s" "$prob"
done
echo

print_label "# Final valid prob"
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "%10s" "$prob"
done
echo

print_label "# Final train prob (xent)"
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "%10s" "$prob"
done
echo

print_label "# Final valid prob (xent)"
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "%10s" "$prob"
done
echo
chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh scale_baseline2/exp_yomdle_farsi/chain/e2e_cnn_1a scale_baseline2/exp_yomdle_farsi/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# WER 19.55 18.45 +# CER 5.64 4.94 +# Final train prob -0.0065 -0.0633 +# Final valid prob 0.0015 -0.0619 +# Final train prob (xent) -0.2636 +# Final valid prob (xent) -0.2511 + +set -e -o pipefail + +data_dir=data +exp_dir=exp + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=72" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=144" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=196" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=120 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) 
dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. We use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=16 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=4 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi diff --git a/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..bb5352943f6 --- /dev/null +++ b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) + +# local/chain/compare_wer.sh exp_yomdle_farsi/chain/e2e_cnn_1a exp_yomdle_farsi/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# WER 19.55 18.45 +# CER 5.64 4.94 +# Final train prob -0.0065 -0.0633 +# Final valid prob 0.0015 -0.0619 +# Final train prob (xent) -0.2636 +# Final valid prob (xent) -0.2511 + +set -e + +data_dir=data +exp_dir=exp + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=4 +num_jobs_final=8 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + $data_dir/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat $data_dir/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py $data_dir/lang \| \ + utils/sym2int.pl -f 2- $data_dir/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=72" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=144" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=144" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=120 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) 
dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $data_dir/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/yomdle_fa/v1/local/create_download.sh b/egs/yomdle_fa/v1/local/create_download.sh new file mode 100755 index 00000000000..1040ecc2165 --- /dev/null +++ b/egs/yomdle_fa/v1/local/create_download.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2018 Chun-Chieh Chang + +# The original format of the dataset given is GEDI and page images. +# This script is written to create line images from page images. +# It also creates csv files from the GEDI files. + +database_slam=/export/corpora5/slam/SLAM/Farsi/transcribed +database_yomdle=/export/corpora5/slam/YOMDLE/final_farsi +slam_dir=download/slam_farsi +yomdle_dir=download/yomdle_farsi + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +echo "$0: Processing SLAM ${language}" +echo "Date: $(date)." +mkdir -p ${slam_dir}/{truth_csv,truth_csv_raw,truth_line_image} +local/GEDI2CSV_enriched.py \ + --inputDir ${database_slam} \ + --outputDir ${slam_dir}/truth_csv_raw \ + --log ${slam_dir}/GEDI2CSV_enriched.log +local/create_line_image_from_page_image.py \ + ${database_slam} \ + ${slam_dir}/truth_csv_raw \ + ${slam_dir} + +echo "$0: Processing YOMDLE ${language}" +echo "Date: $(date)." 
+mkdir -p ${yomdle_dir}/{truth_csv,truth_csv_raw,truth_line_image} +local/YOMDLE2CSV.py \ + --inputDir ${database_yomdle} \ + --outputDir ${yomdle_dir}/truth_csv_raw/ \ + --log ${yomdle_dir}/YOMDLE2CSV.log +local/create_line_image_from_page_image.py \ + --im-format "jpg" \ + ${database_yomdle}/images \ + ${yomdle_dir}/truth_csv_raw \ + ${yomdle_dir} diff --git a/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py b/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py new file mode 100755 index 00000000000..77a6791d5d7 --- /dev/null +++ b/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. 
+""" + +import argparse +import csv +import itertools +import sys +import os +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple + +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage + +parser = argparse.ArgumentParser(description="Creates line images from page image") +parser.add_argument('image_dir', type=str, help='Path to full page images') +parser.add_argument('csv_dir', type=str, help='Path to csv files') +parser.add_argument('out_dir', type=str, help='Path to output directory') +parser.add_argument('--im-format', type=str, default='png', help='What file format are the images') +parser.add_argument('--padding', type=int, default=100, help='Padding so BBox does not exceed image area') +parser.add_argument('--head', type=int, default=-1, help='Number of csv files to process') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. 
def unit_vector(pt0, pt1):
    """ Given two points pt0 and pt1, return a unit vector that
        points in the direction of pt0 to pt1.
    Parameters
    ----------
    pt0, pt1: indexable (x, y) pairs.
    Returns
    -------
    (float, float): unit vector
    Note: the components are divided by the Euclidean distance between the
    two points, so coincident points lead to a division by zero.
    """
    dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2)
    return (pt1[0] - pt0[0]) / dis_0_to_1, \
           (pt1[1] - pt0[1]) / dis_0_to_1


def orthogonal_vector(vector):
    """ Given a vector, returns an orthogonal/perpendicular vector of equal length.
    Returns
    ------
    (float, float): the input rotated by 90 degrees, i.e. (-y, x).
    """
    return -1 * vector[1], vector[0]


def bounding_area(index, hull):
    """ Given an index and a sequence of hull points, takes the edge from
        hull[index] to hull[index+1] and computes the rectangle that bounds
        all hull points and has one side parallel to that edge.
    Parameters
    ----------
    index (int): position of the first edge point; hull[index+1] must exist.
    hull: sequence of (x, y) points.
    Returns
    -------
    a dict (not a named tuple) with the keys:
        'area': area of the rectangle
        'length_parallel': length of the side that is parallel to unit_vector
        'length_orthogonal': length of the side that is orthogonal to unit_vector
        'rectangle_center': rectangle center, expressed as
            (coordinate along unit_vector, coordinate along its orthogonal
            vector); use to_xy_coordinates to convert it to x, y
        'unit_vector': direction of the length_parallel side.
            (its orthogonal vector can be found with the orthogonal_vector function)
    """
    unit_vector_p = unit_vector(hull[index], hull[index+1])
    unit_vector_o = orthogonal_vector(unit_vector_p)

    # Project every hull point on the edge direction and on its normal.
    dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull)
    dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull)

    min_p = min(dis_p)
    min_o = min(dis_o)
    len_p = max(dis_p) - min_p
    len_o = max(dis_o) - min_o

    return {'area': len_p * len_o,
            'length_parallel': len_p,
            'length_orthogonal': len_o,
            'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2),
            'unit_vector': unit_vector_p,
            }


def to_xy_coordinates(unit_vector_angle, point):
    """ Given the angle of a unit vector from the horizontal axis and a point
        expressed along that unit vector (point[0]) and along the direction
        at +90 degrees from it (point[1]), returns the point in x, y
        coordinates. The angle of the unit vector should be in radians.
    Returns
    ------
    (float, float): converted x,y coordinate of the point.
    """
    angle_orthogonal = unit_vector_angle + pi / 2
    return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \
           point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal)


def rotate_points(center_of_rotation, angle, points):
    """ Rotates a point cloud around the center_of_rotation point by angle
    input
    -----
    center_of_rotation (float, float): point the cloud is rotated around.
    angle (float): angle of rotation to be in radians.
    points [(float, float)]: Points to be a list or tuple of points. Points to be rotated.
    Returns
    ------
    [(float, float)]: Rotated points around center of rotation by angle,
        in the same order as the input points.
    """
    rot_points = []
    for pt in points:
        # Work in polar form relative to the center: add the rotation to the
        # point's own angle and keep its distance from the center.
        diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)])
        diff_angle = atan2(diff[1], diff[0]) + angle
        diff_length = sqrt(sum([d**2 for d in diff]))
        rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle),
                           center_of_rotation[1] + diff_length * sin(diff_angle)))

    return rot_points


def rectangle_corners(rectangle):
    """ Given rectangle center and its inclination, returns the corner
        locations of the rectangle.
    Parameters
    ----------
    rectangle: mapping with the keys 'rectangle_center', 'length_parallel',
        'length_orthogonal' and 'unit_vector_angle' (radians).
    Returns
    ------
    [(float, float)]: 4 corner points of rectangle.
    """
    corner_points = []
    # Build the axis-aligned corners around the center first ...
    for i1 in (.5, -.5):
        for i2 in (i1, -1 * i1):
            corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'],
                                  rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal']))

    # ... then rotate them around the center by the rectangle's inclination.
    return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points)
+ """ + difference = ( + ((p2[0] - origin[0]) * (p1[1] - origin[1])) + - ((p1[0] - origin[0]) * (p2[1] - origin[1])) + ) + return difference + + +def compute_hull(points): + """ + Given input list of points, return a list of points that + made up the convex hull. + Returns + ------- + [(float, float)]: convexhull points + """ + hull_points = [] + start = points[0] + min_x = start[0] + for p in points[1:]: + if p[0] < min_x: + min_x = p[0] + start = p + + point = start + hull_points.append(start) + + far_point = None + while far_point is not start: + p1 = None + for p in points: + if p is point: + continue + else: + p1 = p + break + + far_point = p1 + + for p2 in points: + if p2 is point or p2 is p1: + continue + else: + direction = get_orientation(point, far_point, p2) + if direction > 0: + far_point = p2 + + hull_points.append(far_point) + point = far_point + return hull_points + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
def minimum_bounding_box(points):
    """ Given a list of 2D points, it returns the minimum area rectangle bounding all
        the points in the point cloud.
    Returns
    ------
    returns a namedtuple that contains:
        area: area of the rectangle
        length_parallel: length of the side that is parallel to unit_vector
        length_orthogonal: length of the side that is orthogonal to unit_vector
        rectangle_center: coordinates of the rectangle center
        unit_vector: direction of the length_parallel side.
        unit_vector_angle: angle of the unit vector, in RADIANS
        corner_points: set that contains the corners of the rectangle
    Raises
    ------
    ValueError: if fewer than three points are given.
    """
    if len(points) <= 2:
        raise ValueError('More than two points required.')

    # Hull vertices in hull order, closed by repeating the first vertex so
    # every edge i -> i+1 can be examined.
    hull_vertices = [points[index] for index in ConvexHull(points).vertices]
    hull_vertices.append(hull_vertices[0])
    hull_ordered = tuple(hull_vertices)

    # One candidate rectangle per hull edge; keep the first one of least area.
    min_rectangle = min(
        (bounding_area(edge, hull_ordered) for edge in range(len(hull_ordered) - 1)),
        key=lambda candidate: candidate['area'])

    angle = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0])
    min_rectangle['unit_vector_angle'] = angle
    min_rectangle['rectangle_center'] = to_xy_coordinates(angle, min_rectangle['rectangle_center'])

    return bounding_box_tuple(
        area=min_rectangle['area'],
        length_parallel=min_rectangle['length_parallel'],
        length_orthogonal=min_rectangle['length_orthogonal'],
        rectangle_center=min_rectangle['rectangle_center'],
        unit_vector=min_rectangle['unit_vector'],
        unit_vector_angle=min_rectangle['unit_vector_angle'],
        corner_points=set(rectangle_corners(min_rectangle))
    )


def get_center(im):
    """ Given image, returns the location of center pixel
    Returns
    -------
    (int, int): center of the image, i.e. half of im.size truncated to int
    """
    half_width = im.size[0] / 2
    half_height = im.size[1] / 2
    return int(half_width), int(half_height)
def get_horizontal_angle(unit_vector_angle):
    """ Given an angle in radians, returns angle of the unit vector in
        first or fourth quadrant.
    input
    -----
    unit_vector_angle (float): angle in radians, expected in [-pi, pi].
    Returns
    ------
    (float): updated angle of the unit vector to be in radians.
             It is only in first or fourth quadrant.
    """
    # Second quadrant (including pi itself): flip by half a turn.
    if pi / 2 < unit_vector_angle <= pi:
        return unit_vector_angle - pi
    # Third quadrant. The lower bound is inclusive because atan2 can return
    # exactly -pi, which must also be folded onto the horizontal axis.
    if -pi <= unit_vector_angle < -pi / 2:
        return unit_vector_angle + pi
    return unit_vector_angle


def get_smaller_angle(bounding_box):
    """ Given a rectangle, returns its smallest absolute angle from horizontal axis.
    The rectangle's unit vector and the vector orthogonal to it are both
    folded into the first/fourth quadrant; the one closer to horizontal wins
    (the orthogonal one on ties).
    Returns
    ------
    (float): smallest angle of the rectangle to be in radians.
    """
    ortho_vector = orthogonal_vector(bounding_box.unit_vector)
    ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0])

    along = get_horizontal_angle(bounding_box.unit_vector_angle)
    across = get_horizontal_angle(ortho_vector_angle)

    return along if abs(along) < abs(across) else across
def rotated_points(bounding_box, center):
    """ Given the rectangle, returns corner points of rotated rectangle.
        It rotates the rectangle around the center by its smallest angle
        (negated, as returned by get_smaller_angle).
    Returns
    -------
    (x1, y1, x2, y2, x3, y3, x4, y4): coordinates of the 4 rotated corners,
        in the iteration order of bounding_box.corner_points.
    """
    p1, p2, p3, p4 = bounding_box.corner_points
    corners = [(x, y) for x, y in (p1, p2, p3, p4)]
    center_x, center_y = center
    rotation_angle_in_rad = -get_smaller_angle(bounding_box)

    flattened = []
    for x, y in corners:
        # Standard 2D rotation of the offset from the center, shifted back.
        x_dash = (x - center_x) * cos(rotation_angle_in_rad) - (y - center_y) * sin(rotation_angle_in_rad) + center_x
        y_dash = (y - center_y) * cos(rotation_angle_in_rad) + (x - center_x) * sin(rotation_angle_in_rad) + center_y
        flattened.append(x_dash)
        flattened.append(y_dash)
    return tuple(flattened)


def pad_image(image):
    """ Given an image, returns a padded image around the border.
        This routine saves the code from crashing when bounding boxes are
        slightly outside the page boundary.
    The canvas is white, args.padding pixels larger in each dimension, and
    the original is pasted at an offset of args.padding // 2.
    Returns
    -------
    image: page image
    """
    total_padding = int(args.padding)
    offset = int(args.padding // 2)
    canvas_size = (image.size[0] + total_padding, image.size[1] + total_padding)
    padded_image = Image.new('RGB', canvas_size, "white")
    padded_image.paste(im=image, box=(offset, offset))
    return padded_image
def update_minimum_bounding_box_input(bounding_box_input):
    """ Given list of 2D points, returns list of 2D points shifted by an offset.
        The offset is args.padding // 2, the same shift pad_image applies to
        the page, so the points stay aligned with the padded image.
    Returns
    ------
    points [(float, float)]: points, a list or tuple of 2D coordinates
    """
    offset = int(args.padding // 2)
    return [(x + offset, y + offset) for x, y in bounding_box_input]


### main ###
# For every page CSV in args.csv_dir (optionally limited by args.head) that has
# a matching page image: crop each annotated line, deskew it, save it under
# <out_dir>/truth_line_image/ and copy its CSV row to <out_dir>/truth_csv/.
csv_count = 0
for filename in sorted(os.listdir(args.csv_dir)):
    if not filename.endswith('.csv'):
        continue
    if args.head >= 0 and csv_count >= args.head:
        continue
    csv_count = csv_count + 1

    image_file = os.path.join(args.image_dir, os.path.splitext(filename)[0] + '.' + args.im_format)
    if not os.path.isfile(image_file):
        continue
    csv_out_file = os.path.join(args.out_dir, 'truth_csv', filename)

    # Both files are context-managed so the output CSV is flushed and closed
    # even if a row raises; previously the output handle was never closed.
    with open(os.path.join(args.csv_dir, filename), 'r', encoding='utf-8') as f, \
            open(csv_out_file, 'w', encoding='utf-8', newline='') as csv_out_fh:
        csv_out_writer = csv.writer(csv_out_fh)
        im = pad_image(Image.open(image_file))

        rows = csv.reader(f)
        # NOTE(review): the header row is consumed here and NOT copied to the
        # output CSV. local/process_data.py skips the first row of every
        # truth_csv file, so if it reads these files the first line of each
        # page is dropped. Confirm the intended format before changing.
        next(rows, None)
        for row in rows:
            # Columns 2..9 hold the four (col, row) corners of the line box.
            points = [(int(row[2 * k + 2]), int(row[2 * k + 3])) for k in range(4)]
            xs = [p[0] for p in points]
            ys = [p[1] for p in points]
            # Degenerate (zero width or height) boxes cannot be cropped.
            if min(xs) == max(xs) or min(ys) == max(ys):
                continue

            try:
                updated_mbb_input = update_minimum_bounding_box_input(points)
                bounding_box = minimum_bounding_box(updated_mbb_input)
            except Exception:
                # Deliberate best-effort: a box the hull code rejects is skipped.
                print("Error: Skipping Image " + row[1])
                continue

            p1, p2, p3, p4 = bounding_box.corner_points
            corners = [(x, y) for x, y in (p1, p2, p3, p4)]
            min_x = int(min(x for x, _ in corners))
            min_y = int(min(y for _, y in corners))
            max_x = int(max(x for x, _ in corners))
            max_y = int(max(y for _, y in corners))
            region_initial = im.crop((min_x, min_y, max_x, max_y))

            # Re-express the rectangle in the coordinate frame of the crop.
            rot_points = [(x - min_x, y - min_y) for x, y in corners]
            center_x, center_y = bounding_box.rectangle_center
            # Keyword construction: the original passed length_orthogonal a
            # second time in the rectangle_center position.
            cropped_bounding_box = bounding_box_tuple(
                area=bounding_box.area,
                length_parallel=bounding_box.length_parallel,
                length_orthogonal=bounding_box.length_orthogonal,
                rectangle_center=(center_x - min_x, center_y - min_y),
                unit_vector=bounding_box.unit_vector,
                unit_vector_angle=bounding_box.unit_vector_angle,
                corner_points=set(rot_points))

            # Deskew the crop, then crop again to the rotated rectangle.
            rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box)
            img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample=Image.BICUBIC)
            rotated = rotated_points(cropped_bounding_box, get_center(region_initial))
            rotated_xs = rotated[0::2]
            rotated_ys = rotated[1::2]
            box = (int(min(rotated_xs)), int(min(rotated_ys)),
                   int(max(rotated_xs)), int(max(rotated_ys)))
            region_final = img2.crop(box)

            csv_out_writer.writerow(row)
            image_out_file = os.path.join(args.out_dir, 'truth_line_image', row[1])
            region_final.save(image_out_file)
def Rotate2D(pts, cnt, ang=90):
    """Rotate the row-vector points `pts` about the point `cnt`.

    `ang` is handed straight to cos/sin, so it is interpreted in radians
    (NOTE(review): the default of 90 therefore means 90 radians, not degrees).
    The points are right-multiplied by M = [[cos, -sin], [sin, cos]].

    Returns (M, rotated_points).
    """
    c, s = cos(ang), sin(ang)
    rotation = np.array([[c, -s], [s, c]])
    shifted = pts - cnt
    return rotation, np.dot(shifted, rotation) + cnt
def npbox2string(npar):
    """Unpack the single row of a 1x8 coordinate array into eight scalars.

    Returns (c1, r1, c2, r2, c3, r3, c4, r4) taken from npar[0][0..7]. An
    unexpected leading dimension is reported on stdout but is not fatal.
    """
    if np.shape(npar)[0] != 1:
        print('Error during CSV conversion\n')
    row = npar[0]
    return tuple(row[k] for k in range(8))

# cv2.minAreaRect() returns a Box2D structure which contains following details - ( center (x,y), (width, height), angle of rotation )
# Get 4 corners of the rectangle using cv2.boxPoints()

class GEDI2CSV():
    """Writes GEDI zone annotations of one document to a CSV file."""

    def __init__(self, logger, args):
        """Keep the logger and the parsed arguments (args.outputDir is used)."""
        self._logger = logger
        self._args = args

    def csvfile(self, coords, polys, baseName, pgrot):
        """Write <outputDir>/<baseName>.csv with one row per zone.

        coords: list of [id, x, y, w, h, degrees, text, qual, script, text_type]
            rectangles; each is rotated about its upper-left corner by
            `degrees` before its four corners are written.
        polys: list of [id, vertex_strings, text, qual, script, text_type]
            polygons; each is replaced by its minimum-area bounding rectangle.
        baseName: file stem used for the CSV and for the per-line image names.
        pgrot: page rotation, copied into every row.

        Returns the number of data rows written (0 when there is nothing to
        write, in which case no file is created).
        """
        # Function-scope import: only needed to parse polygon vertices.
        import ast

        writePath = os.path.join(self._args.outputDir, '')
        os.makedirs(writePath, exist_ok=True)

        header = ['ID', 'name', 'col1', 'row1', 'col2', 'row2', 'col3', 'row3', 'col4', 'row4',
                  'confidence', 'truth', 'pgrot', 'bbrot', 'qual', 'script', 'text_type']
        conf = 100
        if len(coords) == 0 and len(polys) == 0:
            self._logger.info('Found %s with no text content', baseName)
            print('...Found %s with no text content' % (baseName))
            # 0 rather than None so callers can always add the result.
            return 0

        strPos = writePath + baseName
        rotlist = []

        # Rectangular zones.
        for zone_id, x, y, w, h, angle_deg, text, qual, script, text_type in coords:
            contour = np.array([(x, y), (x + w, y), (x + w, y + h), (x, y + h)])

            # First rotate around upper left corner based on orientationD keyword
            _, rot = Rotate2D(contour, np.array([x, y]), angle_deg * pi / 180)
            # np.int0 was removed in NumPy 2.0; it was an alias of np.intp.
            rot = rot.astype(np.intp)

            # Flatten the 4x2 corner array into one row of 8 values.
            rot = np.reshape(rot, (-1, 1)).T
            c1, r1, c2, r2, c3, r3, c4, r4 = npbox2string(rot)

            # Text_Content may be absent (e.g. "boxed" files); strip a BOM otherwise.
            if text is not None:
                text = text.replace(u'\ufeff', '')

            bbrot = angle_deg
            rotlist.append([zone_id, baseName + '_' + zone_id + '.png', c1, r1, c2, r2, c3, r3, c4, r4,
                            conf, text, pgrot, bbrot, qual, script, text_type])

        # Polygonal zones.
        for zone_id, poly_val, text, qual, script, text_type in polys:
            # The vertex strings come from XML attributes, so they are parsed as
            # literals instead of being passed to eval(). Empty pieces (e.g.
            # from a trailing ';') are ignored.
            arr = [ast.literal_eval(vertex) for vertex in poly_val if vertex.strip()]

            contour = np.asarray(arr)
            convex = cv2.convexHull(contour)
            rect = cv2.minAreaRect(convex)
            box = cv2.boxPoints(rect)
            box = box.astype(np.intp)
            box = np.reshape(box, (-1, 1)).T
            c1, r1, c2, r2, c3, r3, c4, r4 = npbox2string(box)

            bbrot = 0.0

            rotlist.append([zone_id, baseName + '_' + zone_id + '.png', c1, r1, c2, r2, c3, r3, c4, r4,
                            conf, text, pgrot, bbrot, qual, script, text_type])

        # then write out all of list to file
        with open(strPos + ".csv", "w", encoding="utf-8", newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(rotlist)

        return len(rotlist)


def main(args):
    """Convert every GEDI XML file under args.inputDir to CSV in args.outputDir.

    Exits with status -1 if args.ftype or args.quality has an unsupported value.
    """
    # time.clock() was removed in Python 3.8; perf_counter is its replacement.
    start_time = time.perf_counter()

    os.makedirs(args.outputDir, exist_ok=True)

    """ Setup logging """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    if args.log:
        handler = logging.FileHandler(args.log)
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Validate the options once, before touching any file.
    file_type_keys = {'boxed': 'col', 'transcribed': 'Text_Content'}
    if args.ftype not in file_type_keys:
        print('Filetype must be either boxed or transcribed!')
        logger.info('Filetype must be either boxed or transcribed!')
        sys.exit(-1)
    fileTypeStr = file_type_keys[args.ftype]

    quality_sets = {
        'both': {'Regular', 'Low-Quality'},
        'low': {'Low-Quality'},
        'regular': {'Regular'},
    }
    if args.quality not in quality_sets:
        print('Quality must be both, low or regular!')
        logger.info('Quality must be both, low or regular!')
        sys.exit(-1)
    qualset = quality_sets[args.quality]

    gtconverter = GEDI2CSV(logger, args)
    namespaces = {"gedi": "http://lamp.cfar.umd.edu/media/projects/GEDI/"}
    text_types = ('Machine_Print', 'Confusable_Allograph', 'Handwriting')

    fileCnt = 0
    line_write_ctr = 0

    """
    Get all XML files in the directory and sub folders
    """
    for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True):
        for file in filenames:
            if not file.lower().endswith('.xml'):
                continue
            fullName = os.path.join(root, file)
            baseName = os.path.splitext(fullName)[0]

            fileCnt += 1

            """ read the XML file """
            tree = ET.parse(fullName)
            gedi_root = tree.getroot()
            child = gedi_root.findall('gedi:DL_DOCUMENT', namespaces)[0]
            coordinates = []
            polygons = []

            """ and for each page """
            for pgs in child.iterfind('gedi:DL_PAGE', namespaces):

                if 'GEDI_orientation' not in pgs.attrib:
                    pageRot = 0
                else:
                    pageRot = int(pgs.attrib['GEDI_orientation'])
                    logger.info(' PAGE ROTATION %s, %s', fullName, pageRot)

                """ find children for each page """
                for zone in pgs.findall('gedi:DL_ZONE', namespaces):
                    if not (zone.attrib['gedi_type'] == 'Text'
                            and zone.attrib['Type'] in text_types
                            and zone.attrib['Quality'] in qualset):
                        continue
                    if zone.get('polygon'):
                        polygons.append([zone.attrib['id'], zone.get('polygon').split(';'),
                                         zone.get('Text_Content'), zone.get('Quality'),
                                         zone.get('Script'), zone.get('Type')])
                    elif zone.get(fileTypeStr) is not None:
                        coordinates.append([zone.attrib['id'],
                                            int(zone.attrib['col']), int(zone.attrib['row']),
                                            int(zone.attrib['width']), int(zone.attrib['height']),
                                            float(zone.get('orientationD', 0.0)),
                                            zone.get('Text_Content'), zone.get('Quality'),
                                            zone.get('Script'), zone.get('Type')])

                # NOTE(review): this write is assumed to sit inside the page
                # loop (it needs pageRot). The zone lists are per file, so a
                # multi-page document rewrites the same CSV once per page with
                # the rows accumulated so far - confirm that is intended.
                if len(coordinates) > 0 or len(polygons) > 0:
                    line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot)
                else:
                    print('...%s has no applicable content' % (baseName))

    logger.info('Elapsed time: %.2f s', time.perf_counter() - start_time)
    print('complete...total files %d, lines written %d' % (fileCnt, line_write_ctr))
def parse_arguments(argv):
    """ Args and defaults

    argv: list of command-line tokens (without the program name).
    Returns the argparse namespace with inputDir, outputDir, ftype, quality
    and log. Unsupported --ftype / --quality values are rejected here by
    argparse (usage message, SystemExit).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--inputDir', type=str, help='Input directory', required=True)
    parser.add_argument('--outputDir', type=str, help='Output directory', required=True)
    parser.add_argument('--ftype', type=str, choices=('boxed', 'transcribed'), default='transcribed',
                        help='GEDI file type (either "boxed" or "transcribed")')
    parser.add_argument('--quality', type=str, choices=('both', 'low', 'regular'), default='regular',
                        help='GEDI file quality (either "both" or "low" or "regular")')
    # The value is handed to logging.FileHandler, i.e. it is a file path.
    parser.add_argument('--log', type=str, help='Log file path', default='./GEDI2CSV_enriched.log')

    return parser.parse_args(argv)

if __name__ == '__main__':
    """ Run """
    main(parse_arguments(sys.argv[1:]))
#!/usr/bin/env python3

# Copyright 2018 Ashish Arora

"""Creates lexicon.txt: every word of <data-dir>/train/text spelled as characters."""

import argparse
import os


def _spell_word(word):
    """Return the space-separated character spelling of one word."""
    # Put SIL instead of "|". Because every "|" in the beginning of the words
    # is for initial-space of that word
    spelling = " ".join('SIL' if char == '|' else char for char in word)
    return spelling.replace('#', '')


def _build_lexicon(lines):
    """Map every word (all tokens after the utterance id) to its spelling."""
    lex = {}
    for line in lines:
        for word in line.strip().split(' ')[1:]:
            lex[word] = _spell_word(word)
    return lex


def main():
    """Read <data-dir>/train/text and write <dir>/lexicon.txt, sorted by word."""
    parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
    parser.add_argument('dir', type=str, help='output path')
    parser.add_argument('--data-dir', type=str, default='data', help='Path to text file')
    args = parser.parse_args()

    text_path = os.path.join(args.data_dir, 'train', 'text')
    # Single context-managed handle; the original also opened a second,
    # unused handle on the same file and never closed it.
    with open(text_path, 'r', encoding='utf-8') as f:
        lex = _build_lexicon(f)

    with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp:
        for key in sorted(lex):
            fp.write(key + " " + lex[key] + "\n")


if __name__ == '__main__':
    main()
#!/usr/bin/env python3

# Copyright 2018 Ashish Arora
#           2018 Chun Chieh Chang

""" This script reads the extracted Farsi OCR (yomdle and slam) database files
    and creates the following files in the output directory:
    text, utt2spk, images.scp.
    Eg. local/process_data.py data/download/ data/train
    Eg. text file: english_phone_books_0001_1 To sum up, then, it would appear that
        utt2spk file: english_phone_books_0001_0 english_phone_books_0001
        images.scp file: english_phone_books_0001_0
            data/download/truth_line_image/english_phone_books_0001_0.png <column 13 of the csv row>
"""

import argparse
import os
import sys
import csv
import itertools
import unicodedata


def main():
    """Write text, utt2spk and images.scp from <database_path>/truth_csv/*.csv."""
    parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files")
    parser.add_argument('database_path', type=str, help='Path to data')
    parser.add_argument('out_dir', type=str, help='directory to output files')
    parser.add_argument('--head', type=int, default=-1, help='limit on number of synth data')
    args = parser.parse_args()

    print("Processing '{}' data...".format(args.out_dir))

    csv_dir = os.path.join(args.database_path, 'truth_csv')
    text_file = os.path.join(args.out_dir, 'text')
    utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
    image_file = os.path.join(args.out_dir, 'images.scp')

    # All handles are context-managed so the output is flushed and closed;
    # previously none of them was ever closed.
    with open(text_file, 'w', encoding='utf-8') as text_fh, \
            open(utt2spk_file, 'w', encoding='utf-8') as utt2spk_fh, \
            open(image_file, 'w', encoding='utf-8') as image_fh:
        count = 0
        for filename in sorted(os.listdir(csv_dir)):
            if not filename.endswith('.csv'):
                continue
            # A negative --head means "no limit".
            if args.head >= 0 and count >= args.head:
                continue
            count = count + 1

            with open(os.path.join(csv_dir, filename), 'r', encoding='utf-8') as csv_file:
                reader = csv.reader(csv_file)
                next(reader, None)  # the first row is treated as a header
                for row in reader:
                    image_id = os.path.splitext(row[1])[0]
                    image_filepath = os.path.join(args.database_path, 'truth_line_image', row[1])
                    text = unicodedata.normalize('NFC', row[11])
                    # Skip empty image files and lines without a transcription.
                    if os.stat(image_filepath).st_size == 0 or not text:
                        continue
                    text_fh.write(image_id + ' ' + text + '\n')
                    # The "speaker" is the image id without its last _<n> field.
                    utt2spk_fh.write(image_id + ' ' + '_'.join(image_id.split('_')[:-1]) + '\n')
                    image_fh.write(image_id + ' ' + image_filepath + ' ' + row[13] + '\n')


if __name__ == '__main__':
    main()
./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. 
+ # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/yomdle_fa/v1/local/train_lm_lr.sh b/egs/yomdle_fa/v1/local/train_lm_lr.sh new file mode 100755 index 00000000000..5bfc20acdeb --- /dev/null +++ b/egs/yomdle_fa/v1/local/train_lm_lr.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script 
#!/bin/bash

# Copyright 2016 Vincent Nguyen
#           2016 Johns Hopkins University (author: Daniel Povey)
#           2017 Ashish Arora
#           2017 Hossein Hadian
# Apache 2.0
#
# This script trains a LM on the YOMDLE+Extra training transcriptions.
# It is based on the example scripts distributed with PocoLM

# It checks that pocolm is present under $KALDI_ROOT/tools; if it is not, the
# script prints the installation command and exits (it does not install it).

set -e
stage=0
dir=data/local/local_lm
data_dir=data
extra_lm=download/extra_lm.txt
order=3

echo "$0 $@" # Print the command line for logging
. ./utils/parse_options.sh || exit 1;

lm_dir=${dir}/data


mkdir -p $dir
. ./path.sh || exit 1; # for KALDI_ROOT
export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
( # First make sure the pocolm toolkit is installed.
  cd $KALDI_ROOT/tools || exit 1;
  if [ -d pocolm ]; then
    echo Not installing the pocolm toolkit since it is already there.
  else
    echo "$0: Please install the PocoLM toolkit with: "
    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
    exit 1;
  fi
) || exit 1;

bypass_metaparam_optim_opt=
# If you want to bypass the metaparameter optimization steps with specific metaparameters
# un-comment the following line, and change the numbers to some appropriate values.
# You can find the values from output log of train_lm.py.
# These example numbers of metaparameters are for a 4-gram model (with min-counts)
# running with train_lm.py.
# The dev perplexity should be close to the non-bypassed model.
#bypass_metaparam_optim_opt=
# Note: to use these example parameters, you may need to remove the .done files
# to make sure that make_lm_dir.py is called and trains only the 3-gram model
#for order in 3; do
#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done

# Stage 0: build the pocolm text sources from the training text and the
# extra LM corpus.
if [ $stage -le 0 ]; then
  mkdir -p ${dir}/data
  mkdir -p ${dir}/data/text

  echo "$0: Getting the Data sources"

  rm ${dir}/data/text/* 2>/dev/null || true

  # Pass the extra corpus through local/bidi.py and prepend_words.py, apply the
  # BPE codes from $data_dir/train/bpe.out, and strip the "@@" markers.
  cat ${extra_lm} | local/bidi.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > ${dir}/data/text/extra_lm.txt

  # Note: the name 'dev' is treated specially by pocolm, it automatically
  # becomes the dev set.
  # The last tenth of the training text is held out as that dev set.
  nr=`cat $data_dir/train/text | wc -l`
  nr_dev=$(($nr / 10 ))
  nr_train=$(( $nr - $nr_dev ))

  # use the training data as an additional data source.
  # we can later fold the dev data into this.
  # (cut drops the first field, the utterance id.)
  head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt
  tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt

  # for reporting perplexities, we'll use the "real" dev set, which is taken
  # from $data_dir/test/text.
  # (the validation data is used as ${dir}/data/text/dev.txt to work
  # out interpolation weights.)
  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
  # it as one of the data sources.
  cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt

  # get the wordlist from the training text plus the extra LM text, most
  # frequent words first
  cat ${dir}/data/text/{train,extra_lm}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
  #cat ${dir}/data/text/extra_fa.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
fi

# Stage 1: train the unpruned LM and export it in ARPA format.
if [ $stage -le 1 ]; then
  # decide on the vocabulary.
  # Note: you'd use --wordlist if you had a previously determined word-list
  # that you wanted to use.
  # Note: if you have more than one order, use a certain amount of words as the
  # vocab and want to restrict max memory for 'sort',
  echo "$0: training the unpruned LM"
  # Per-source min-counts: 10 for the extra corpus, 1 for the training text.
  min_counts='extra_lm=10 train=1'
  wordlist=${dir}/data/wordlist

  # The LM directory name encodes the word list, the order and the min-counts.
  lm_name="`basename ${wordlist}`_${order}"
  if [ -n "${min_counts}" ]; then
    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
  fi
  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
  train_lm.py --wordlist=${wordlist} --num-splits=30 --warm-start-ratio=1 \
              --min-counts="$min_counts" \
              --limit-unk-history=true \
              ${bypass_metaparam_optim_opt} \
              ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}

  # Report the perplexity on the held-out "real" dev set.
  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'

  mkdir -p ${dir}/data/arpa
  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
fi
s/\x{UFF08}/\x{0028}/g; + + # Convert various presentation forms to base form + $s =~ s/[\x{FED1}\x{FED3}\x{FED4}\x{FED2}]+/\x{0641}/g; + $s =~ s/[\x{FBB0}\x{FBB1}]+/\x{06D3}/g; + $s =~ s/[\x{FECD}\x{FECF}\x{FED0}\x{FECE}]+/\x{063A}/g; + $s =~ s/[\x{FBDD}]+/\x{0677}/g; + $s =~ s/[\x{FBA6}\x{FBA8}\x{FBA9}\x{FBA7}]+/\x{06C1}/g; + $s =~ s/[\x{FEC1}\x{FEC3}\x{FEC4}\x{FEC2}]+/\x{0637}/g; + $s =~ s/[\x{FE85}\x{FE86}]+/\x{0624}/g; + $s =~ s/[\x{FEA5}\x{FEA7}\x{FEA8}\x{FEA6}]+/\x{062E}/g; + $s =~ s/[\x{FBD9}\x{FBDA}]+/\x{06C6}/g; + $s =~ s/[\x{FE8F}\x{FE91}\x{FE92}\x{FE90}]+/\x{0628}/g; + $s =~ s/[\x{FEED}\x{FEEE}]+/\x{0648}/g; + $s =~ s/[\x{FE99}\x{FE9B}\x{FE9C}\x{FE9A}]+/\x{062B}/g; + $s =~ s/[\x{FEBD}\x{FEBF}\x{FEC0}\x{FEBE}]+/\x{0636}/g; + $s =~ s/[\x{FEE5}\x{FEE7}\x{FEE8}\x{FEE6}]+/\x{0646}/g; + $s =~ s/[\x{FBFC}\x{FBFE}\x{FBFF}\x{FBFD}]+/\x{06CC}/g; + $s =~ s/[\x{FBA4}\x{FBA5}]+/\x{06C0}/g; + $s =~ s/[\x{FB72}\x{FB74}\x{FB75}\x{FB73}]+/\x{0684}/g; + $s =~ s/[\x{FBD3}\x{FBD5}\x{FBD6}\x{FBD4}]+/\x{06AD}/g; + $s =~ s/[\x{FB6A}\x{FB6C}\x{FB6D}\x{FB6B}]+/\x{06A4}/g; + $s =~ s/[\x{FB66}\x{FB68}\x{FB69}\x{FB67}]+/\x{0679}/g; + $s =~ s/[\x{FB5E}\x{FB60}\x{FB61}\x{FB5F}]+/\x{067A}/g; + $s =~ s/[\x{FB88}\x{FB89}]+/\x{0688}/g; + $s =~ s/[\x{FB7E}\x{FB80}\x{FB81}\x{FB7F}]+/\x{0687}/g; + $s =~ s/[\x{FB8E}\x{FB90}\x{FB91}\x{FB8F}]+/\x{06A9}/g; + $s =~ s/[\x{FB86}\x{FB87}]+/\x{068E}/g; + $s =~ s/[\x{FE83}\x{FE84}]+/\x{0623}/g; + $s =~ s/[\x{FB8A}\x{FB8B}]+/\x{0698}/g; + $s =~ s/[\x{FED5}\x{FED7}\x{FED8}\x{FED6}]+/\x{0642}/g; + $s =~ s/[\x{FED9}\x{FEDB}\x{FEDC}\x{FEDA}]+/\x{0643}/g; + $s =~ s/[\x{FBE0}\x{FBE1}]+/\x{06C5}/g; + $s =~ s/[\x{FEB9}\x{FEBB}\x{FEBC}\x{FEBA}]+/\x{0635}/g; + $s =~ s/[\x{FEC5}\x{FEC7}\x{FEC8}\x{FEC6}]+/\x{0638}/g; + $s =~ s/[\x{FE8D}\x{FE8E}]+/\x{0627}/g; + $s =~ s/[\x{FB9A}\x{FB9C}\x{FB9D}\x{FB9B}]+/\x{06B1}/g; + $s =~ s/[\x{FEAD}\x{FEAE}]+/\x{0631}/g; + $s =~ s/[\x{FEF1}\x{FEF3}\x{FEF4}\x{FEF2}]+/\x{064A}/g; + $s =~ s/[\x{FE93}\x{FE94}]+/\x{0629}/g; + $s =~ 
s/[\x{FBE4}\x{FBE6}\x{FBE7}\x{FBE5}]+/\x{06D0}/g; + $s =~ s/[\x{FE89}\x{FE8B}\x{FE8C}\x{FE8A}]+/\x{0626}/g; + $s =~ s/[\x{FB84}\x{FB85}]+/\x{068C}/g; + $s =~ s/[\x{FE9D}\x{FE9F}\x{FEA0}\x{FE9E}]+/\x{062C}/g; + $s =~ s/[\x{FB82}\x{FB83}]+/\x{068D}/g; + $s =~ s/[\x{FEA1}\x{FEA3}\x{FEA4}\x{FEA2}]+/\x{062D}/g; + $s =~ s/[\x{FB52}\x{FB54}\x{FB55}\x{FB53}]+/\x{067B}/g; + $s =~ s/[\x{FB92}\x{FB94}\x{FB95}\x{FB93}]+/\x{06AF}/g; + $s =~ s/[\x{FB7A}\x{FB7C}\x{FB7D}\x{FB7B}]+/\x{0686}/g; + $s =~ s/[\x{FBDB}\x{FBDC}]+/\x{06C8}/g; + $s =~ s/[\x{FB56}\x{FB58}\x{FB59}\x{FB57}]+/\x{067E}/g; + $s =~ s/[\x{FEB5}\x{FEB7}\x{FEB8}\x{FEB6}]+/\x{0634}/g; + $s =~ s/[\x{FBE2}\x{FBE3}]+/\x{06C9}/g; + $s =~ s/[\x{FB96}\x{FB98}\x{FB99}\x{FB97}]+/\x{06B3}/g; + $s =~ s/[\x{FE80}]+/\x{0621}/g; + $s =~ s/[\x{FBAE}\x{FBAF}]+/\x{06D2}/g; + $s =~ s/[\x{FB62}\x{FB64}\x{FB65}\x{FB63}]+/\x{067F}/g; + $s =~ s/[\x{FEE9}\x{FEEB}\x{FEEC}\x{FEEA}]+/\x{0647}/g; + $s =~ s/[\x{FE81}\x{FE82}]+/\x{0622}/g; + $s =~ s/[\x{FBDE}\x{FBDF}]+/\x{06CB}/g; + $s =~ s/[\x{FE87}\x{FE88}]+/\x{0625}/g; + $s =~ s/[\x{FB6E}\x{FB70}\x{FB71}\x{FB6F}]+/\x{06A6}/g; + $s =~ s/[\x{FBA0}\x{FBA2}\x{FBA3}\x{FBA1}]+/\x{06BB}/g; + $s =~ s/[\x{FBAA}\x{FBAC}\x{FBAD}\x{FBAB}]+/\x{06BE}/g; + $s =~ s/[\x{FEA9}\x{FEAA}]+/\x{062F}/g; + $s =~ s/[\x{FEE1}\x{FEE3}\x{FEE4}\x{FEE2}]+/\x{0645}/g; + $s =~ s/[\x{FEEF}\x{FBE8}\x{FBE9}\x{FEF0}]+/\x{0649}/g; + $s =~ s/[\x{FB8C}\x{FB8D}]+/\x{0691}/g; + $s =~ s/[\x{FB76}\x{FB78}\x{FB79}\x{FB77}]+/\x{0683}/g; + $s =~ s/[\x{FB5A}\x{FB5C}\x{FB5D}\x{FB5B}]+/\x{0680}/g; + $s =~ s/[\x{FB9E}\x{FB9F}]+/\x{06BA}/g; + $s =~ s/[\x{FEC9}\x{FECB}\x{FECC}\x{FECA}]+/\x{0639}/g; + $s =~ s/[\x{FEDD}\x{FEDF}\x{FEE0}\x{FEDE}]+/\x{0644}/g; + $s =~ s/[\x{FB50}\x{FB51}]+/\x{0671}/g; + $s =~ s/[\x{FEB1}\x{FEB3}\x{FEB4}\x{FEB2}]+/\x{0633}/g; + $s =~ s/[\x{FE95}\x{FE97}\x{FE98}\x{FE96}]+/\x{062A}/g; + $s =~ s/[\x{FBD7}\x{FBD8}]+/\x{06C7}/g; + $s =~ s/[\x{FEAF}\x{FEB0}]+/\x{0632}/g; + $s =~ s/[\x{FEAB}\x{FEAC}]+/\x{0630}/g; + + # 
Remove tatweel + $s =~ s/\x{0640}//g; + # Remove vowels and hamza + $s =~ s/[\x{064B}-\x{0655}]+//g; + # Remove right-to-left and left-to-right + $s =~ s/[\x{200F}\x{200E}]+//g; + # Arabic Keheh to Arabic Kaf + $s =~ s/\x{06A9}/\x{0643}/g; + # Arabic Yeh to Farsi Yeh + $s =~ s/\x{064A}/\x{06CC}/g; + # Decompose RIAL + $s =~ s/\x{FDFC}/\x{0631}\x{06CC}\x{0627}\x{0644}/g; + # Farsi arabic-indic digits to arabic-indic digits + $s =~ s/\x{06F0}/\x{0660}/g; + $s =~ s/\x{06F1}/\x{0661}/g; + $s =~ s/\x{06F2}/\x{0662}/g; + $s =~ s/\x{06F3}/\x{0663}/g; + $s =~ s/\x{06F4}/\x{0664}/g; + $s =~ s/\x{06F5}/\x{0665}/g; + $s =~ s/\x{06F6}/\x{0666}/g; + $s =~ s/\x{06F7}/\x{0667}/g; + $s =~ s/\x{06F8}/\x{0668}/g; + $s =~ s/\x{06F9}/\x{0669}/g; + # Arabic-indic digits to digits + $s =~ s/\x{0660}/0/g; + $s =~ s/\x{0661}/1/g; + $s =~ s/\x{0662}/2/g; + $s =~ s/\x{0663}/3/g; + $s =~ s/\x{0664}/4/g; + $s =~ s/\x{0665}/5/g; + $s =~ s/\x{0666}/6/g; + $s =~ s/\x{0667}/7/g; + $s =~ s/\x{0668}/8/g; + $s =~ s/\x{0669}/9/g; + # Arabic comma to comma + $s =~ s/\x{060C}/\x{002C}/g; + + $s =~ s/\|/ /g; + if ($s ne "") { + print "$s"; + } else { + print ""; + } + } + print "\n"; +} + diff --git a/egs/yomdle_fa/v1/local/yomdle2csv.py b/egs/yomdle_fa/v1/local/yomdle2csv.py new file mode 100755 index 00000000000..3641de90324 --- /dev/null +++ b/egs/yomdle_fa/v1/local/yomdle2csv.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 + +""" +GEDI2CSV +Convert GEDI-type bounding boxes to CSV format + +GEDI Format Example: + + + + + + + + + +CSV Format Example +ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang +0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的,有一大部分在我灵魂中却,0,0.0,0,,zh-cn +""" + +import logging +import os +import sys +import time +import glob +import csv +import imghdr +from PIL import Image +import argparse +import pdb +import cv2 +import numpy as np +import xml.etree.ElementTree as ET + +sin = np.sin +cos = np.cos +pi = np.pi + 
+def Rotate2D(pts, cnt, ang=90): + M = np.array([[cos(ang),-sin(ang)],[sin(ang),cos(ang)]]) + res = np.dot(pts-cnt,M)+cnt + return M, res + +def npbox2string(npar): + if np.shape(npar)[0] != 1: + print('Error during CSV conversion\n') + c1,r1 = npar[0][0],npar[0][1] + c2,r2 = npar[0][2],npar[0][3] + c3,r3 = npar[0][4],npar[0][5] + c4,r4 = npar[0][6],npar[0][7] + + return c1,r1,c2,r2,c3,r3,c4,r4 + +# cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) +# Get 4 corners of the rectangle using cv2.boxPoints() + +class GEDI2CSV(): + + """ Initialize the extractor""" + def __init__(self, logger, args): + self._logger = logger + self._args = args + + """ + Segment image with GEDI bounding box information + """ + def csvfile(self, coords, polys, baseName, pgrot): + + """ for writing the files """ + writePath = self._args.outputDir + if os.path.isdir(writePath) != True: + os.makedirs(writePath) + + rotlist = [] + + header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','lang'] + conf=100 + pgrot = 0 + bbrot = 0 + qual = 0 + script = '' + + write_ctr = 0 + if len(coords) == 0 and len(polys) == 0: + self._logger.info('Found %s with no text content',(baseName)) + print('...Found %s with no text content' % (baseName)) + return + + strPos = writePath + baseName + + for j in polys: + try: + arr = [] + [id,poly_val,text,qual,lang] = j + script=None + #print(j) + for i in poly_val: + if len(i.strip()) > 0: + #print(i) + arr.append(eval(i)) + + contour = np.asarray(arr) + #print(contour) + convex = cv2.convexHull(contour) + rect = cv2.minAreaRect(convex) + box = cv2.boxPoints(rect) + box = np.int0(box) + box = np.reshape(box,(-1,1)).T + c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box) + + bbrot = 0.0 + + rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,lang]) + + except: + 
print('...polygon error %s, %s' % (j, baseName)) + continue + + # then write out all of list to file + with open(strPos + ".csv", "w", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(header) + for row in rotlist: + writer.writerow(row) + write_ctr += 1 + + return write_ctr + + +def main(args): + + startTime = time.clock() + + writePath = args.outputDir + print('write to %s' % (writePath)) + if os.path.isdir(writePath) != True: + os.makedirs(writePath) + + """ Setup logging """ + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + if args.log: + handler = logging.FileHandler(args.log) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + gtconverter = GEDI2CSV(logger, args) + namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"} + keyCnt=0 + + fileCnt = 0 + line_write_ctr = 0 + line_error_ctr = 0 + file_error_ctr = 0 + """ + Get all XML files in the directory and sub folders + """ + print('reading %s' % (args.inputDir)) + for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True): + for file in filenames: + if file.lower().endswith('.xml'): + fullName = os.path.join(root,file) + baseName = os.path.splitext(fullName) + + fileCnt += 1 + + try: + """ read the XML file """ + tree = ET.parse(fullName) + except: + print('...ERROR parsing %s' % (fullName)) + file_error_ctr += 1 + continue + + gedi_root = tree.getroot() + child = gedi_root.findall('gedi:DL_DOCUMENT',namespaces)[0] + totalpages = int(child.attrib['NrOfPages']) + coordinates=[] + polygons = [] + + """ and for each page """ + for i, pgs in enumerate(child.iterfind('gedi:DL_PAGE',namespaces)): + + if 'GEDI_orientation' not in pgs.attrib: + pageRot=0 + else: + pageRot = int(pgs.attrib['GEDI_orientation']) + logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot))) + + """ find children for each page """ + 
for zone in pgs.findall('gedi:DL_ZONE',namespaces): + + if zone.attrib['gedi_type']=='Text' : + if zone.get('polygon'): + keyCnt+=1 + polygons.append([zone.attrib['id'],zone.get('polygon').split(';'), + zone.get('Text_Content'),zone.get('Illegible'),zone.get('Language')]) + else: + print('...Not polygon') + + + if len(coordinates) > 0 or len(polygons) > 0: + line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot) + else: + print('...%s has no text content' % (baseName[0])) + + + print('complete...total files %d, lines written %d, img errors %d, line error %d' % (fileCnt, line_write_ctr, file_error_ctr, line_error_ctr)) + + +def parse_arguments(argv): + """ Args and defaults """ + parser = argparse.ArgumentParser() + + parser.add_argument('--inputDir', type=str, help='Input directory', default='/data/YOMDLE/final_arabic/xml') + parser.add_argument('--outputDir', type=str, help='Output directory', default='/exp/YOMDLE/final_arabic/csv_truth/') + parser.add_argument('--log', type=str, help='Log directory', default='/exp/logs.txt') + + return parser.parse_args(argv) + + +if __name__ == '__main__': + """ Run """ + main(parse_arguments(sys.argv[1:])) diff --git a/egs/yomdle_fa/v1/path.sh b/egs/yomdle_fa/v1/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_fa/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_fa/v1/run.sh b/egs/yomdle_fa/v1/run.sh new file mode 100755 index 00000000000..a7547b1ee69 --- /dev/null +++ b/egs/yomdle_fa/v1/run.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +set -e +stage=0 +nj=60 + +database_slam=/export/corpora5/slam/SLAM/Farsi/transcribed +database_yomdle=/export/corpora5/slam/YOMDLE/final_farsi +download_dir=data_yomdle_farsi/download/ +extra_lm=download/extra_lm.txt +data_dir=data_yomdle_farsi +exp_dir=exp_yomdle_farsi + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $stage -le -1 ]; then + local/create_download.sh --database-slam $database_slam \ + --database-yomdle $database_yomdle \ + --slam-dir download/slam_farsi \ + --yomdle-dir download/yomdle_farsi +fi + +if [ $stage -le 0 ]; then + mkdir -p data_slam_farsi/slam + mkdir -p data_yomdle_farsi/yomdle + local/process_data.py download/slam_farsi data_slam_farsi/slam + local/process_data.py download/yomdle_farsi data_yomdle_farsi/yomdle + ln -s ../data_slam_farsi/slam ${data_dir}/test + ln -s ../data_yomdle_farsi/yomdle ${data_dir}/train + image/fix_data_dir.sh ${data_dir}/test + image/fix_data_dir.sh ${data_dir}/train +fi + +mkdir -p $data_dir/{train,test}/data +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py --feat-dim 40 $data_dir/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $data_dir/train + + for datasplit in train test; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $datasplit. " + echo "Date: $(date)." + local/extract_features.sh --nj $nj --cmd "$cmd" \ + --feat-dim 40 --num-channels 3 --fliplr true \ + $data_dir/${datasplit} + steps/compute_cmvn_stats.sh $data_dir/${datasplit} || exit 1; + done + + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." 
+ utils/fix_data_dir.sh $data_dir/train +fi + +if [ $stage -le 2 ]; then + for datasplit in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 --fliplr false $data_dir/${datasplit} $data_dir/${datasplit}_aug $data_dir + steps/compute_cmvn_stats.sh $data_dir/${datasplit}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing dictionary and lang..." + if [ ! -f $data_dir/train/bpe.out ]; then + cut -d' ' -f2- $data_dir/train/text | local/bidi.py | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > $data_dir/train/bpe.out + for datasplit in test train train_aug; do + cut -d' ' -f1 $data_dir/$datasplit/text > $data_dir/$datasplit/ids + cut -d' ' -f2- $data_dir/$datasplit/text | local/bidi.py | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > $data_dir/$datasplit/bpe_text + mv $data_dir/$datasplit/text $data_dir/$datasplit/text.old + paste -d' ' $data_dir/$datasplit/ids $data_dir/$datasplit/bpe_text > $data_dir/$datasplit/text + done + fi + + local/prepare_dict.sh --data-dir $data_dir --dir $data_dir/local/dict + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + $data_dir/local/dict "" $data_dir/lang/temp $data_dir/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $data_dir/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." 
+ local/train_lm.sh --data-dir $data_dir --dir $data_dir/local/local_lm + utils/format_lm.sh $data_dir/lang $data_dir/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + $data_dir/local/dict/lexicon.txt $data_dir/lang_test +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir +fi + +if [ $stage -le 8 ]; then + echo "$0: Estimating a language model for lattice rescoring...$(date)" + local/train_lm_lr.sh --data-dir $data_dir --dir $data_dir/local/local_lm_lr --extra-lm $extra_lm --order 6 + + utils/build_const_arpa_lm.sh $data_dir/local/local_lm_lr/data/arpa/6gram_unpruned.arpa.gz \ + $data_dir/lang_test $data_dir/lang_test_lr + steps/lmrescore_const_arpa.sh $data_dir/lang_test $data_dir/lang_test_lr \ + $data_dir/test $exp_dir/chain/cnn_e2eali_1b/decode_test $exp_dir/chain/cnn_e2eali_1b/decode_test_lr +fi diff --git a/egs/yomdle_fa/v1/steps b/egs/yomdle_fa/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_fa/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_fa/v1/utils b/egs/yomdle_fa/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_fa/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No 
newline at end of file diff --git a/egs/yomdle_zh/README.txt b/egs/yomdle_zh/README.txt new file mode 100644 index 00000000000..39d2348ca10 --- /dev/null +++ b/egs/yomdle_zh/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources (e.g. Hamshahri) diff --git a/egs/yomdle_zh/v1/cmd.sh b/egs/yomdle_zh/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/yomdle_zh/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export cmd="queue.pl" diff --git a/egs/yomdle_zh/v1/image b/egs/yomdle_zh/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_zh/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_zh/v1/local/augment_data.sh b/egs/yomdle_zh/v1/local/augment_data.sh new file mode 100755 index 00000000000..34e938db069 --- /dev/null +++ b/egs/yomdle_zh/v1/local/augment_data.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr $fliplr --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/yomdle_zh/v1/local/bidi.py b/egs/yomdle_zh/v1/local/bidi.py new file mode 100755 index 00000000000..447313a5d02 --- /dev/null +++ b/egs/yomdle_zh/v1/local/bidi.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +# This script is largely written by Stephen Rawls +# and uses the python package https://pypi.org/project/PyICU_BiDi/ +# The code leaves right to left text alone and reverses left to right text. 
+ +import icu_bidi +import io +import sys +import unicodedata +# R=strong right-to-left; AL=strong arabic right-to-left +rtl_set = set(chr(i) for i in range(sys.maxunicode) + if unicodedata.bidirectional(chr(i)) in ['R','AL']) +def determine_text_direction(text): + # Easy case first + for char in text: + if char in rtl_set: + return icu_bidi.UBiDiLevel.UBIDI_RTL + # If we made it here we did not encounter any strongly rtl char + return icu_bidi.UBiDiLevel.UBIDI_LTR + +def utf8_visual_to_logical(text): + text_dir = determine_text_direction(text) + + bidi = icu_bidi.Bidi() + bidi.inverse = True + bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT + bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT # icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS + + bidi.set_para(text, text_dir, None) + + res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING) + + return res + +def utf8_logical_to_visual(text): + text_dir = determine_text_direction(text) + + bidi = icu_bidi.Bidi() + + bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT + bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT #icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS + + bidi.set_para(text, text_dir, None) + + res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING) + + return res + + +##main## +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8") +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8") +for line in sys.stdin: + line = line.strip() + line = utf8_logical_to_visual(line)[::-1] + sys.stdout.write(line + '\n') diff --git a/egs/yomdle_zh/v1/local/chain/compare_wer.sh b/egs/yomdle_zh/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ab880c1adb5 --- /dev/null +++ 
b/egs/yomdle_zh/v1/local/chain/compare_wer.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..4183aa74587 --- /dev/null +++ b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# e2eali_1b is the same as chainali_1a but uses the e2e 
chain model to get the +# lattice alignments and to build a tree + +# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# CER 15.44 13.57 +# Final train prob 0.0616 -0.0512 +# Final valid prob 0.0390 -0.0718 +# Final train prob (xent) -0.6199 +# Final valid prob (xent) -0.7448 + +set -e -o pipefail + +data_dir=data +exp_dir=exp + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=3 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=180 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn3 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn6 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn8 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn9 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) 
dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=4 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=16,8 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi diff --git a/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..88bbd32790c --- /dev/null +++ b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) + +# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# CER 15.44 13.57 +# Final train prob 0.0616 -0.0512 +# Final valid prob 0.0390 -0.0718 +# Final train prob (xent) -0.6199 +# Final valid prob (xent) -0.7448 + +set -e + +data_dir=data +exp_dir=exp + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=4 +num_jobs_final=8 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + $data_dir/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat $data_dir/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py $data_dir/lang \| \ + utils/sym2int.pl -f 2- $data_dir/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=180 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) 
dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $data_dir/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/yomdle_zh/v1/local/create_download.sh b/egs/yomdle_zh/v1/local/create_download.sh new file mode 100755 index 00000000000..3c4be4699ef --- /dev/null +++ b/egs/yomdle_zh/v1/local/create_download.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2018 Chun-Chieh Chang + +# The original format of the dataset given is GEDI and page images. +# This script is written to create line images from page images. +# It also creates csv files from the GEDI files. + +database_slam=/export/corpora5/slam/SLAM/Farsi/transcribed +database_yomdle=/export/corpora5/slam/YOMDLE/final_farsi +cangjie_url=https://raw.githubusercontent.com/wanleung/libcangjie/master/tables/cj5-cc.txt +download_dir=download +slam_dir=$download_dir/slam_farsi +yomdle_dir=$download_dir/yomdle_farsi + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +echo "$0: Processing SLAM ${language}" +echo "Date: $(date)." +mkdir -p ${slam_dir}/{truth_csv,truth_csv_raw,truth_line_image} +local/gedi2csv.py \ + --inputDir ${database_slam} \ + --outputDir ${slam_dir}/truth_csv_raw \ + --log ${slam_dir}/GEDI2CSV_enriched.log +local/create_line_image_from_page_image.py \ + ${database_slam} \ + ${slam_dir}/truth_csv_raw \ + ${slam_dir} + +echo "$0: Processing YOMDLE ${language}" +echo "Date: $(date)." 
+mkdir -p ${yomdle_dir}/{truth_csv,truth_csv_raw,truth_line_image} +local/yomdle2csv.py \ + --inputDir ${database_yomdle} \ + --outputDir ${yomdle_dir}/truth_csv_raw/ \ + --log ${yomdle_dir}/YOMDLE2CSV.log +local/create_line_image_from_page_image.py \ + --im-format "jpg" \ + ${database_yomdle}/images \ + ${yomdle_dir}/truth_csv_raw \ + ${yomdle_dir} + +echo "Downloading table for CangJie." +wget -P $download_dir/ $cangjie_url || exit 1; diff --git a/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py b/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py new file mode 100755 index 00000000000..77a6791d5d7 --- /dev/null +++ b/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. 
+""" + +import argparse +import csv +import itertools +import sys +import os +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple + +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage + +parser = argparse.ArgumentParser(description="Creates line images from page image") +parser.add_argument('image_dir', type=str, help='Path to full page images') +parser.add_argument('csv_dir', type=str, help='Path to csv files') +parser.add_argument('out_dir', type=str, help='Path to output directory') +parser.add_argument('--im-format', type=str, default='png', help='What file format are the images') +parser.add_argument('--padding', type=int, default=100, help='Padding so BBox does not exceed image area') +parser.add_argument('--head', type=int, default=-1, help='Number of csv files to process') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. 
+ Returns + ------- + (float, float): unit vector + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0]) / dis_0_to_1, \ + (pt1[1] - pt0[1]) / dis_0_to_1 + + +def orthogonal_vector(vector): + """ Given a vector, returns a orthogonal/perpendicular vector of equal length. + Returns + ------ + (float, float): A vector that points in the direction orthogonal to vector. + """ + return -1 * vector[1], vector[0] + + +def bounding_area(index, hull): + """ Given index location in an array and convex hull, it gets two points + hull[index] and hull[index+1]. From these two points, it returns a named + tuple that mainly contains area of the box that bounds the hull. This + bounding box orintation is same as the orientation of the lines formed + by the point hull[index] and hull[index+1]. + Returns + ------- + a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function) + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'unit_vector': unit_vector_p, + } + + +def to_xy_coordinates(unit_vector_angle, point): + """ Given angle from horizontal axis and a point from origin, + returns converted unit vector coordinates in x, y coordinates. + angle of unit vector should be in radians. 
+ Returns + ------ + (float, float): converted x,y coordinate of the unit vector. + """ + angle_orthogonal = unit_vector_angle + pi / 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + input + ----- + center_of_rotation (float, float): angle of unit vector to be in radians. + angle (float): angle of rotation to be in radians. + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Returns + ------ + [(float, float)]: Rotated points around center of rotation by angle + """ + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination, returns the corner + locations of the rectangle. + Returns + ------ + [(float, float)]: 4 corner points of rectangle. + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + + +def get_orientation(origin, p1, p2): + """ + Given origin and two points, return the orientation of the Point p1 with + regards to Point p2 using origin. + Returns + ------- + integer: Negative if p1 is clockwise of p2. 
+ """ + difference = ( + ((p2[0] - origin[0]) * (p1[1] - origin[1])) + - ((p1[0] - origin[0]) * (p2[1] - origin[1])) + ) + return difference + + +def compute_hull(points): + """ + Given input list of points, return a list of points that + made up the convex hull. + Returns + ------- + [(float, float)]: convexhull points + """ + hull_points = [] + start = points[0] + min_x = start[0] + for p in points[1:]: + if p[0] < min_x: + min_x = p[0] + start = p + + point = start + hull_points.append(start) + + far_point = None + while far_point is not start: + p1 = None + for p in points: + if p is point: + continue + else: + p1 = p + break + + far_point = p1 + + for p2 in points: + if p2 is point or p2 is p1: + continue + else: + direction = get_orientation(point, far_point, p2) + if direction > 0: + far_point = p2 + + hull_points.append(far_point) + point = far_point + return hull_points + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
RADIANS + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + #hull_ordered = compute_hull(points) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle)) + ) + + +def get_center(im): + """ Given image, returns the location of center pixel + Returns + ------- + (int, int): center of the image + """ + center_x = im.size[0] / 2 + center_y = im.size[1] / 2 + return int(center_x), int(center_y) + + +def get_horizontal_angle(unit_vector_angle): + """ Given an angle in radians, returns angle of the unit vector in + first or fourth quadrant. + Returns + ------ + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. 
+ """ + if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + + +def get_smaller_angle(bounding_box): + """ Given a rectangle, returns its smallest absolute angle from horizontal axis. + Returns + ------ + (float): smallest angle of the rectangle to be in radians. + """ + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + + +def rotated_points(bounding_box, center): + """ Given the rectangle, returns corner points of rotated rectangle. + It rotates the rectangle around the center by its smallest angle. + Returns + ------- + [(int, int)]: 4 corner points of rectangle. 
+ """ + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + + +def pad_image(image): + """ Given an image, returns a padded image around the border. + This routine save the code from crashing if bounding boxes that are + slightly outside the page boundary. + Returns + ------- + image: page image + """ + offset = int(args.padding // 2) + padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") + padded_image.paste(im = image, box = (offset, offset)) + return padded_image + +def update_minimum_bounding_box_input(bounding_box_input): + """ Given list of 2D points, returns list of 2D points shifted by an offset. 
+ Returns + ------ + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + updated_minimum_bounding_box_input = [] + offset = int(args.padding // 2) + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + + +### main ### +csv_count = 0 +for filename in sorted(os.listdir(args.csv_dir)): + if filename.endswith('.csv') and (csv_count < args.head or args.head < 0): + csv_count = csv_count + 1 + with open(os.path.join(args.csv_dir, filename), 'r', encoding='utf-8') as f: + image_file = os.path.join(args.image_dir, os.path.splitext(filename)[0] + '.' + args.im_format) + if not os.path.isfile(image_file): + continue + csv_out_file = os.path.join(args.out_dir, 'truth_csv', filename) + csv_out_fh = open(csv_out_file, 'w', encoding='utf-8') + csv_out_writer = csv.writer(csv_out_fh) + im = Image.open(image_file) + im = pad_image(im) + count = 1 + for row in itertools.islice(csv.reader(f), 0, None): + if count == 1: + count = 0 + continue + + points = [] + points.append((int(row[2]), int(row[3]))) + points.append((int(row[4]), int(row[5]))) + points.append((int(row[6]), int(row[7]))) + points.append((int(row[8]), int(row[9]))) + + x = [int(row[2]), int(row[4]), int(row[6]), int(row[8])] + y = [int(row[3]), int(row[5]), int(row[7]), int(row[9])] + min_x, min_y = min(x), min(y) + max_x, max_y = max(x), max(y) + if min_x == max_x or min_y == max_y: + continue + + try: + updated_mbb_input = update_minimum_bounding_box_input(points) + bounding_box = minimum_bounding_box(updated_mbb_input) + except Exception as e: + print("Error: Skipping Image " + row[1]) + continue + + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + min_x = int(min(x1, x2, x3, x4)) + min_y = int(min(y1, y2, y3, y4)) + max_x = int(max(x1, x2, x3, x4)) + max_y = 
int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1_new = (x1 - min_x, y1 - min_y) + p2_new = (x2 - min_x, y2 - min_y) + p3_new = (x3 - min_x, y3 - min_y) + p4_new = (x4 - min_x, y4 - min_y) + rot_points.append(p1_new) + rot_points.append(p2_new) + rot_points.append(p3_new) + rot_points.append(p4_new) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points)) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + csv_out_writer.writerow(row) + image_out_file = os.path.join(args.out_dir, 'truth_line_image', row[1]) + region_final.save(image_out_file) diff --git a/egs/yomdle_zh/v1/local/extract_features.sh b/egs/yomdle_zh/v1/local/extract_features.sh new file mode 100755 index 00000000000..7d6806a2712 --- /dev/null +++ b/egs/yomdle_zh/v1/local/extract_features.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +augment=false +num_channels=3 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/yomdle_zh/v1/local/gedi2csv.py b/egs/yomdle_zh/v1/local/gedi2csv.py new file mode 100755 index 00000000000..43a07421dd1 --- /dev/null +++ b/egs/yomdle_zh/v1/local/gedi2csv.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +""" +GEDI2CSV +Convert GEDI-type bounding boxes to CSV format + +GEDI Format Example: + + + + + + + + + +CSV Format Example +ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang +0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的,有一大部分在我灵魂中却,0,0.0,0,,zh-cn +""" + +import logging +import os +import sys +import time +import glob +import csv +import imghdr +from PIL import Image +import argparse +import pdb +import cv2 +import numpy as np +import xml.etree.ElementTree as ET + +sin = np.sin +cos = np.cos +pi = np.pi + +def Rotate2D(pts, cnt, ang=90): + M = np.array([[cos(ang),-sin(ang)],[sin(ang),cos(ang)]]) + res = np.dot(pts-cnt,M)+cnt + return M, res + +def npbox2string(npar): + if np.shape(npar)[0] != 1: + 
print('Error during CSV conversion\n') + c1,r1 = npar[0][0],npar[0][1] + c2,r2 = npar[0][2],npar[0][3] + c3,r3 = npar[0][4],npar[0][5] + c4,r4 = npar[0][6],npar[0][7] + + return c1,r1,c2,r2,c3,r3,c4,r4 + +# cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation ) +# Get 4 corners of the rectangle using cv2.boxPoints() + +class GEDI2CSV(): + + """ Initialize the extractor""" + def __init__(self, logger, args): + self._logger = logger + self._args = args + + """ + Segment image with GEDI bounding box information + """ + def csvfile(self, coords, polys, baseName, pgrot): + + """ for writing the files """ + writePath = self._args.outputDir + writePath = os.path.join(writePath,'') + if os.path.isdir(writePath) != True: + os.makedirs(writePath) + + rotlist = [] + + header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','text_type'] + conf=100 + write_ctr = 0 + if len(coords) == 0 and len(polys) == 0: + self._logger.info('Found %s with no text content',(baseName)) + print('...Found %s with no text content' % (baseName)) + return + + strPos = writePath + baseName + + """ for each group of coordinates """ + for i in coords: + + [id,x,y,w,h,degrees,text,qual,script,text_type] = i + + contour = np.array([(x,y),(x+w,y),(x+w,y+h),(x,y+h)]) + + """ + First rotate around upper left corner based on orientationD keyword + """ + M, rot = Rotate2D(contour, np.array([x,y]), degrees*pi/180) + rot = np.int0(rot) + + # rot is the 8 points rotated by degrees + # pgrot is the rotation after extraction, so save + + # save rotated points to list or array + rot = np.reshape(rot,(-1,1)).T + c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(rot) + + text = text.replace(u'\ufeff','') + + bbrot = degrees + rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,text_type]) + + # if there are polygons, first save 
the text + for j in polys: + arr = [] + [id,poly_val,text,qual,script,text_type] = j + for i in poly_val: + arr.append(eval(i)) + + contour = np.asarray(arr) + convex = cv2.convexHull(contour) + rect = cv2.minAreaRect(convex) + box = cv2.boxPoints(rect) + box = np.int0(box) + box = np.reshape(box,(-1,1)).T + c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box) + + bbrot = 0.0 + + rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,text_type]) + + # then write out all of list to file + with open(strPos + ".csv", "w", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(header) + for row in rotlist: + writer.writerow(row) + write_ctr += 1 + + return write_ctr + + +def main(args): + + startTime = time.clock() + + writePath = args.outputDir + if os.path.isdir(writePath) != True: + os.makedirs(writePath) + + """ Setup logging """ + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + if args.log: + handler = logging.FileHandler(args.log) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + gtconverter = GEDI2CSV(logger, args) + namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"} + keyCnt=0 + + fileCnt = 0 + line_write_ctr = 0 + line_error_ctr = 0 + + """ + Get all XML files in the directory and sub folders + """ + for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True): + for file in filenames: + if file.lower().endswith('.xml'): + fullName = os.path.join(root,file) + baseName = os.path.splitext(fullName) + + fileCnt += 1 + + """ read the XML file """ + tree = ET.parse(fullName) + gedi_root = tree.getroot() + child = gedi_root.findall('gedi:DL_DOCUMENT',namespaces)[0] + totalpages = int(child.attrib['NrOfPages']) + coordinates=[] + polygons = [] + if args.ftype == 'boxed': + fileTypeStr = 'col' + elif args.ftype == 
'transcribed': + fileTypeStr = 'Text_Content' + else: + print('Filetype must be either boxed or transcribed!') + logger.info('Filetype must be either boxed or transcribed!') + sys.exit(-1) + + if args.quality == 'both': + qualset = {'Regular','Low-Quality'} + elif args.quality == 'low': + qualset = {'Low-Quality'} + elif args.quality == 'regular': + qualset = {'Regular'} + else: + print('Quality must be both, low or regular!') + logger.info('Quality must be both, low or regular!') + sys.exit(-1) + + + + """ and for each page """ + for i, pgs in enumerate(child.iterfind('gedi:DL_PAGE',namespaces)): + + if 'GEDI_orientation' not in pgs.attrib: + pageRot=0 + else: + pageRot = int(pgs.attrib['GEDI_orientation']) + logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot))) + + """ find children for each page """ + for zone in pgs.findall('gedi:DL_ZONE',namespaces): + + if zone.attrib['gedi_type']=='Text' and zone.attrib['Type'] in \ + ('Machine_Print','Confusable_Allograph','Handwriting') and zone.attrib['Quality'] in qualset: + if zone.get('polygon'): + keyCnt+=1 + polygons.append([zone.attrib['id'],zone.get('polygon').split(';'), + zone.get('Text_Content'),zone.get('Quality'),zone.get('Script'),zone.get('Type')]) + elif zone.get(fileTypeStr) != None: + keyCnt+=1 + coord = [zone.attrib['id'],int(zone.attrib['col']),int(zone.attrib['row']), + int(zone.attrib['width']), int(zone.attrib['height']), + float(zone.get('orientationD',0.0)), + zone.get('Text_Content'),zone.get('Quality'),zone.get('Script'),zone.get('Type')] + coordinates.append(coord) + + if len(coordinates) > 0 or len(polygons) > 0: + line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot) + else: + print('...%s has no applicable content' % (baseName[0])) + + print('complete...total files %d, lines written %d' % (fileCnt, line_write_ctr)) + + +def parse_arguments(argv): + """ Args and defaults """ + parser = argparse.ArgumentParser() + + 
parser.add_argument('--inputDir', type=str, help='Input directory', required=True) + parser.add_argument('--outputDir', type=str, help='Output directory', required=True) + parser.add_argument('--ftype', type=str, help='GEDI file type (either "boxed" or "transcribed")', default='transcribed') + parser.add_argument('--quality', type=str, help='GEDI file quality (either "both" or "low" or "regular")', default='regular') + parser.add_argument('--log', type=str, help='Log directory', default='./GEDI2CSV_enriched.log') + + return parser.parse_args(argv) + +if __name__ == '__main__': + """ Run """ + main(parse_arguments(sys.argv[1:])) + + + + + + diff --git a/egs/yomdle_zh/v1/local/prepare_dict.sh b/egs/yomdle_zh/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..65b2e7aa901 --- /dev/null +++ b/egs/yomdle_zh/v1/local/prepare_dict.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +data_dir=data + +. 
./utils/parse_options.sh || exit 1; + +base_dir=$(echo "$DIRECTORY" | cut -d "/" -f2) + +mkdir -p $dir + +local/prepare_lexicon.py --data-dir $data_dir $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/yomdle_zh/v1/local/prepare_lexicon.py b/egs/yomdle_zh/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..3ebb52e38f4 --- /dev/null +++ b/egs/yomdle_zh/v1/local/prepare_lexicon.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Chun-Chieh Chang + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +parser.add_argument('--data-dir', type=str, default='data', help='Path to text file') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join(args.data_dir, 'train', 'text') +text_fh = open(text_path, 'r', encoding='utf-8') + +# Used specially for Chinese. +# Uses the ChangJie keyboard input method to create subword units for Chinese. +cj5_table = {} +with open('download/cj5-cc.txt', 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split() + if not line_vect[0].startswith('yyy') and not line_vect[0].startswith('z'): + cj5_table[line_vect[1]] = "cj5_" + " cj5_".join(list(line_vect[0])) + +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split() + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + # Put SIL instead of "|". 
Because every "|" in the beginning of the words is for initial-space of that word + characters = " ".join([ 'SIL' if char == '|' else cj5_table[char] if char in cj5_table else char for char in characters]) + characters = characters.replace('#','') + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/yomdle_zh/v1/local/process_data.py b/egs/yomdle_zh/v1/local/process_data.py new file mode 100755 index 00000000000..8964af8890a --- /dev/null +++ b/egs/yomdle_zh/v1/local/process_data.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Farsi OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import os +import sys +import csv +import itertools +import unicodedata + +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('out_dir', type=str, help='directory to output files') +parser.add_argument('--head', type=int, default=-1, help='limit on number of synth data') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +count = 0 +for filename in sorted(os.listdir(os.path.join(args.database_path, 'truth_csv'))): + if filename.endswith('.csv') and (count < args.head or args.head < 0): + count = count + 1 + csv_filepath = os.path.join(args.database_path, 'truth_csv', filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + row_count = 0 + for row in csv.reader(csv_file): + if row_count == 0: + row_count = 1 + continue + image_id = os.path.splitext(row[1])[0] + image_filepath = os.path.join(args.database_path, 'truth_line_image', row[1]) + text = unicodedata.normalize('NFC', row[11]).replace('\n', '') + if os.path.isfile(image_filepath) and os.stat(image_filepath).st_size != 0 and text: + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(image_id.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + ' ' + row[13] + '\n') diff --git 
a/egs/yomdle_zh/v1/local/score.sh b/egs/yomdle_zh/v1/local/score.sh new file mode 100755 index 00000000000..f2405205f02 --- /dev/null +++ b/egs/yomdle_zh/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh --max-lmwt 10 "$@" +steps/scoring/score_kaldi_cer.sh --max-lmwt 10 --stage 2 "$@" diff --git a/egs/yomdle_zh/v1/local/train_lm.sh b/egs/yomdle_zh/v1/local/train_lm.sh new file mode 100755 index 00000000000..bc738f217da --- /dev/null +++ b/egs/yomdle_zh/v1/local/train_lm.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the YOMDLE training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +data_dir=data + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure that make_lm_dir.py is called and trains only a 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from the training text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/yomdle_zh/v1/local/train_lm_lr.sh b/egs/yomdle_zh/v1/local/train_lm_lr.sh new file mode 100755 index 00000000000..5bfc20acdeb --- /dev/null +++ b/egs/yomdle_zh/v1/local/train_lm_lr.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the YOMDLE+Extra training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +data_dir=data +extra_lm=download/extra_lm.txt +order=3 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example metaparameter values are for a 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure that make_lm_dir.py is called and trains only a 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + cat ${extra_lm} | local/bidi.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > ${dir}/data/text/extra_lm.txt + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
+ cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/{train,extra_lm}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + #cat ${dir}/data/text/extra_fa.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='extra_lm=10 train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=30 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/yomdle_zh/v1/local/wer_output_filter b/egs/yomdle_zh/v1/local/wer_output_filter new file mode 100755 index 00000000000..08d5563bca4 --- /dev/null +++ b/egs/yomdle_zh/v1/local/wer_output_filter @@ -0,0 +1,151 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 + +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +# 
Arabic-specific normalization +while (<>) { + @F = split " "; + print "$F[0] "; + foreach $s (@F[1..$#F]) { + # Normalize tabs, spaces, and no-break spaces + $s =~ s/[\x{0009}\x{0020}\x{00A0}]+/ /g; + # Normalize "dots"/"filled-circles" (U+25CF, U+2022 bullet, U+2219) to periods + $s =~ s/[\x{25CF}\x{2022}\x{2219}]+/\x{002E}/g; + # Normalize dashes to regular hyphen + $s =~ s/[\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}]+/\x{002D}/g; + # Normalize fullwidth parentheses (U+FF09, U+FF08) to regular parentheses + $s =~ s/\x{FF09}/\x{0029}/g; + $s =~ s/\x{FF08}/\x{0028}/g; + + # Convert various presentation forms to base form + $s =~ s/[\x{FED1}\x{FED3}\x{FED4}\x{FED2}]+/\x{0641}/g; + $s =~ s/[\x{FBB0}\x{FBB1}]+/\x{06D3}/g; + $s =~ s/[\x{FECD}\x{FECF}\x{FED0}\x{FECE}]+/\x{063A}/g; + $s =~ s/[\x{FBDD}]+/\x{0677}/g; + $s =~ s/[\x{FBA6}\x{FBA8}\x{FBA9}\x{FBA7}]+/\x{06C1}/g; + $s =~ s/[\x{FEC1}\x{FEC3}\x{FEC4}\x{FEC2}]+/\x{0637}/g; + $s =~ s/[\x{FE85}\x{FE86}]+/\x{0624}/g; + $s =~ s/[\x{FEA5}\x{FEA7}\x{FEA8}\x{FEA6}]+/\x{062E}/g; + $s =~ s/[\x{FBD9}\x{FBDA}]+/\x{06C6}/g; + $s =~ s/[\x{FE8F}\x{FE91}\x{FE92}\x{FE90}]+/\x{0628}/g; + $s =~ s/[\x{FEED}\x{FEEE}]+/\x{0648}/g; + $s =~ s/[\x{FE99}\x{FE9B}\x{FE9C}\x{FE9A}]+/\x{062B}/g; + $s =~ s/[\x{FEBD}\x{FEBF}\x{FEC0}\x{FEBE}]+/\x{0636}/g; + $s =~ s/[\x{FEE5}\x{FEE7}\x{FEE8}\x{FEE6}]+/\x{0646}/g; + $s =~ s/[\x{FBFC}\x{FBFE}\x{FBFF}\x{FBFD}]+/\x{06CC}/g; + $s =~ s/[\x{FBA4}\x{FBA5}]+/\x{06C0}/g; + $s =~ s/[\x{FB72}\x{FB74}\x{FB75}\x{FB73}]+/\x{0684}/g; + $s =~ s/[\x{FBD3}\x{FBD5}\x{FBD6}\x{FBD4}]+/\x{06AD}/g; + $s =~ s/[\x{FB6A}\x{FB6C}\x{FB6D}\x{FB6B}]+/\x{06A4}/g; + $s =~ s/[\x{FB66}\x{FB68}\x{FB69}\x{FB67}]+/\x{0679}/g; + $s =~ s/[\x{FB5E}\x{FB60}\x{FB61}\x{FB5F}]+/\x{067A}/g; + $s =~ s/[\x{FB88}\x{FB89}]+/\x{0688}/g; + $s =~ s/[\x{FB7E}\x{FB80}\x{FB81}\x{FB7F}]+/\x{0687}/g; + $s =~ s/[\x{FB8E}\x{FB90}\x{FB91}\x{FB8F}]+/\x{06A9}/g; + $s =~ s/[\x{FB86}\x{FB87}]+/\x{068E}/g; + $s =~ s/[\x{FE83}\x{FE84}]+/\x{0623}/g; + $s =~ s/[\x{FB8A}\x{FB8B}]+/\x{0698}/g; + $s =~ 
s/[\x{FED5}\x{FED7}\x{FED8}\x{FED6}]+/\x{0642}/g; + $s =~ s/[\x{FED9}\x{FEDB}\x{FEDC}\x{FEDA}]+/\x{0643}/g; + $s =~ s/[\x{FBE0}\x{FBE1}]+/\x{06C5}/g; + $s =~ s/[\x{FEB9}\x{FEBB}\x{FEBC}\x{FEBA}]+/\x{0635}/g; + $s =~ s/[\x{FEC5}\x{FEC7}\x{FEC8}\x{FEC6}]+/\x{0638}/g; + $s =~ s/[\x{FE8D}\x{FE8E}]+/\x{0627}/g; + $s =~ s/[\x{FB9A}\x{FB9C}\x{FB9D}\x{FB9B}]+/\x{06B1}/g; + $s =~ s/[\x{FEAD}\x{FEAE}]+/\x{0631}/g; + $s =~ s/[\x{FEF1}\x{FEF3}\x{FEF4}\x{FEF2}]+/\x{064A}/g; + $s =~ s/[\x{FE93}\x{FE94}]+/\x{0629}/g; + $s =~ s/[\x{FBE4}\x{FBE6}\x{FBE7}\x{FBE5}]+/\x{06D0}/g; + $s =~ s/[\x{FE89}\x{FE8B}\x{FE8C}\x{FE8A}]+/\x{0626}/g; + $s =~ s/[\x{FB84}\x{FB85}]+/\x{068C}/g; + $s =~ s/[\x{FE9D}\x{FE9F}\x{FEA0}\x{FE9E}]+/\x{062C}/g; + $s =~ s/[\x{FB82}\x{FB83}]+/\x{068D}/g; + $s =~ s/[\x{FEA1}\x{FEA3}\x{FEA4}\x{FEA2}]+/\x{062D}/g; + $s =~ s/[\x{FB52}\x{FB54}\x{FB55}\x{FB53}]+/\x{067B}/g; + $s =~ s/[\x{FB92}\x{FB94}\x{FB95}\x{FB93}]+/\x{06AF}/g; + $s =~ s/[\x{FB7A}\x{FB7C}\x{FB7D}\x{FB7B}]+/\x{0686}/g; + $s =~ s/[\x{FBDB}\x{FBDC}]+/\x{06C8}/g; + $s =~ s/[\x{FB56}\x{FB58}\x{FB59}\x{FB57}]+/\x{067E}/g; + $s =~ s/[\x{FEB5}\x{FEB7}\x{FEB8}\x{FEB6}]+/\x{0634}/g; + $s =~ s/[\x{FBE2}\x{FBE3}]+/\x{06C9}/g; + $s =~ s/[\x{FB96}\x{FB98}\x{FB99}\x{FB97}]+/\x{06B3}/g; + $s =~ s/[\x{FE80}]+/\x{0621}/g; + $s =~ s/[\x{FBAE}\x{FBAF}]+/\x{06D2}/g; + $s =~ s/[\x{FB62}\x{FB64}\x{FB65}\x{FB63}]+/\x{067F}/g; + $s =~ s/[\x{FEE9}\x{FEEB}\x{FEEC}\x{FEEA}]+/\x{0647}/g; + $s =~ s/[\x{FE81}\x{FE82}]+/\x{0622}/g; + $s =~ s/[\x{FBDE}\x{FBDF}]+/\x{06CB}/g; + $s =~ s/[\x{FE87}\x{FE88}]+/\x{0625}/g; + $s =~ s/[\x{FB6E}\x{FB70}\x{FB71}\x{FB6F}]+/\x{06A6}/g; + $s =~ s/[\x{FBA0}\x{FBA2}\x{FBA3}\x{FBA1}]+/\x{06BB}/g; + $s =~ s/[\x{FBAA}\x{FBAC}\x{FBAD}\x{FBAB}]+/\x{06BE}/g; + $s =~ s/[\x{FEA9}\x{FEAA}]+/\x{062F}/g; + $s =~ s/[\x{FEE1}\x{FEE3}\x{FEE4}\x{FEE2}]+/\x{0645}/g; + $s =~ s/[\x{FEEF}\x{FBE8}\x{FBE9}\x{FEF0}]+/\x{0649}/g; + $s =~ s/[\x{FB8C}\x{FB8D}]+/\x{0691}/g; + $s =~ 
s/[\x{FB76}\x{FB78}\x{FB79}\x{FB77}]+/\x{0683}/g; + $s =~ s/[\x{FB5A}\x{FB5C}\x{FB5D}\x{FB5B}]+/\x{0680}/g; + $s =~ s/[\x{FB9E}\x{FB9F}]+/\x{06BA}/g; + $s =~ s/[\x{FEC9}\x{FECB}\x{FECC}\x{FECA}]+/\x{0639}/g; + $s =~ s/[\x{FEDD}\x{FEDF}\x{FEE0}\x{FEDE}]+/\x{0644}/g; + $s =~ s/[\x{FB50}\x{FB51}]+/\x{0671}/g; + $s =~ s/[\x{FEB1}\x{FEB3}\x{FEB4}\x{FEB2}]+/\x{0633}/g; + $s =~ s/[\x{FE95}\x{FE97}\x{FE98}\x{FE96}]+/\x{062A}/g; + $s =~ s/[\x{FBD7}\x{FBD8}]+/\x{06C7}/g; + $s =~ s/[\x{FEAF}\x{FEB0}]+/\x{0632}/g; + $s =~ s/[\x{FEAB}\x{FEAC}]+/\x{0630}/g; + + # Remove tatweel + $s =~ s/\x{0640}//g; + # Remove vowels and hamza + $s =~ s/[\x{064B}-\x{0655}]+//g; + # Remove right-to-left and left-to-right + $s =~ s/[\x{200F}\x{200E}]+//g; + # Arabic Keheh to Arabic Kaf + $s =~ s/\x{06A9}/\x{0643}/g; + # Arabic Yeh to Farsi Yeh + $s =~ s/\x{064A}/\x{06CC}/g; + # Decompose RIAL + $s =~ s/\x{FDFC}/\x{0631}\x{06CC}\x{0627}\x{0644}/g; + # Farsi arabic-indic digits to arabic-indic digits + $s =~ s/\x{06F0}/\x{0660}/g; + $s =~ s/\x{06F1}/\x{0661}/g; + $s =~ s/\x{06F2}/\x{0662}/g; + $s =~ s/\x{06F3}/\x{0663}/g; + $s =~ s/\x{06F4}/\x{0664}/g; + $s =~ s/\x{06F5}/\x{0665}/g; + $s =~ s/\x{06F6}/\x{0666}/g; + $s =~ s/\x{06F7}/\x{0667}/g; + $s =~ s/\x{06F8}/\x{0668}/g; + $s =~ s/\x{06F9}/\x{0669}/g; + # Arabic-indic digits to digits + $s =~ s/\x{0660}/0/g; + $s =~ s/\x{0661}/1/g; + $s =~ s/\x{0662}/2/g; + $s =~ s/\x{0663}/3/g; + $s =~ s/\x{0664}/4/g; + $s =~ s/\x{0665}/5/g; + $s =~ s/\x{0666}/6/g; + $s =~ s/\x{0667}/7/g; + $s =~ s/\x{0668}/8/g; + $s =~ s/\x{0669}/9/g; + # Arabic comma to comma + $s =~ s/\x{060C}/\x{002C}/g; + + $s =~ s/\|/ /g; + if ($s ne "") { + print "$s"; + } else { + print ""; + } + } + print "\n"; +} + diff --git a/egs/yomdle_zh/v1/local/yomdle2csv.py b/egs/yomdle_zh/v1/local/yomdle2csv.py new file mode 100755 index 00000000000..3641de90324 --- /dev/null +++ b/egs/yomdle_zh/v1/local/yomdle2csv.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 + +""" +GEDI2CSV +Convert 
GEDI-type bounding boxes to CSV format + +GEDI Format Example: + + + + + + + + + +CSV Format Example +ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang +0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的,有一大部分在我灵魂中却,0,0.0,0,,zh-cn +""" + +import logging +import os +import sys +import time +import glob +import csv +import imghdr +from PIL import Image +import argparse +import pdb +import cv2 +import numpy as np +import xml.etree.ElementTree as ET + +sin = np.sin +cos = np.cos +pi = np.pi + +def Rotate2D(pts, cnt, ang=90): + M = np.array([[cos(ang),-sin(ang)],[sin(ang),cos(ang)]]) + res = np.dot(pts-cnt,M)+cnt + return M, res + +def npbox2string(npar): + if np.shape(npar)[0] != 1: + print('Error during CSV conversion\n') + c1,r1 = npar[0][0],npar[0][1] + c2,r2 = npar[0][2],npar[0][3] + c3,r3 = npar[0][4],npar[0][5] + c4,r4 = npar[0][6],npar[0][7] + + return c1,r1,c2,r2,c3,r3,c4,r4 + +# cv2.minAreaRect() returns a Box2D structure which contains the following details - ( center (x,y), (width, height), angle of rotation ) +# Get 4 corners of the rectangle using cv2.boxPoints() + +class GEDI2CSV(): + + """ Initialize the extractor""" + def __init__(self, logger, args): + self._logger = logger + self._args = args + + """ + Segment image with GEDI bounding box information + """ + def csvfile(self, coords, polys, baseName, pgrot): + + """ for writing the files """ + writePath = self._args.outputDir + if os.path.isdir(writePath) != True: + os.makedirs(writePath) + + rotlist = [] + + header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','lang'] + conf=100 + pgrot = 0 + bbrot = 0 + qual = 0 + script = '' + + write_ctr = 0 + if len(coords) == 0 and len(polys) == 0: + self._logger.info('Found %s with no text content',(baseName)) + print('...Found %s with no text content' % (baseName)) + return 0 + + strPos = os.path.join(writePath, baseName) + + for j in 
polys: + try: + arr = [] + [id,poly_val,text,qual,lang] = j + script=None + #print(j) + for i in poly_val: + if len(i.strip()) > 0: + # NOTE(review): eval() runs the polygon attribute text as Python code -- only use on trusted XML + arr.append(eval(i)) + + contour = np.asarray(arr) + #print(contour) + convex = cv2.convexHull(contour) + rect = cv2.minAreaRect(convex) + box = cv2.boxPoints(rect) + box = box.astype(np.intp) + box = np.reshape(box,(-1,1)).T + c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box) + + bbrot = 0.0 + + rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,lang]) + + except Exception: + print('...polygon error %s, %s' % (j, baseName)) + continue + + # then write out all of list to file + with open(strPos + ".csv", "w", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(header) + for row in rotlist: + writer.writerow(row) + write_ctr += 1 + + return write_ctr + + +def main(args): + + startTime = time.perf_counter() + + writePath = args.outputDir + print('write to %s' % (writePath)) + if os.path.isdir(writePath) != True: + os.makedirs(writePath) + + """ Setup logging """ + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + if args.log: + handler = logging.FileHandler(args.log) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + + gtconverter = GEDI2CSV(logger, args) + namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"} + keyCnt=0 + + fileCnt = 0 + line_write_ctr = 0 + line_error_ctr = 0 + file_error_ctr = 0 + """ + Get all XML files in the directory and sub folders + """ + print('reading %s' % (args.inputDir)) + for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True): + for file in filenames: + if file.lower().endswith('.xml'): + fullName = os.path.join(root,file) + baseName = os.path.splitext(fullName) + + fileCnt += 1 + + try: + """ read the XML file """ + tree = ET.parse(fullName) + except Exception: + 
print('...ERROR parsing %s' % (fullName)) + file_error_ctr += 1 + continue + + gedi_root = tree.getroot() + child = gedi_root.findall('gedi:DL_DOCUMENT',namespaces)[0] + totalpages = int(child.attrib['NrOfPages']) + coordinates=[] + polygons = [] + + """ and for each page """ + for i, pgs in enumerate(child.iterfind('gedi:DL_PAGE',namespaces)): + + if 'GEDI_orientation' not in pgs.attrib: + pageRot=0 + else: + pageRot = int(pgs.attrib['GEDI_orientation']) + logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot))) + + """ find children for each page """ + for zone in pgs.findall('gedi:DL_ZONE',namespaces): + + if zone.attrib['gedi_type']=='Text' : + if zone.get('polygon'): + keyCnt+=1 + polygons.append([zone.attrib['id'],zone.get('polygon').split(';'), + zone.get('Text_Content'),zone.get('Illegible'),zone.get('Language')]) + else: + print('...Not polygon') + + + if len(coordinates) > 0 or len(polygons) > 0: + line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot) + else: + print('...%s has no text content' % (baseName[0])) + + + print('complete...total files %d, lines written %d, img errors %d, line error %d' % (fileCnt, line_write_ctr, file_error_ctr, line_error_ctr)) + + +def parse_arguments(argv): + """ Args and defaults """ + parser = argparse.ArgumentParser() + + parser.add_argument('--inputDir', type=str, help='Input directory', default='/data/YOMDLE/final_arabic/xml') + parser.add_argument('--outputDir', type=str, help='Output directory', default='/exp/YOMDLE/final_arabic/csv_truth/') + parser.add_argument('--log', type=str, help='Log directory', default='/exp/logs.txt') + + return parser.parse_args(argv) + + +if __name__ == '__main__': + """ Run """ + main(parse_arguments(sys.argv[1:])) diff --git a/egs/yomdle_zh/v1/path.sh b/egs/yomdle_zh/v1/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_zh/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. 
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_zh/v1/run.sh b/egs/yomdle_zh/v1/run.sh new file mode 100755 index 00000000000..7e6aab56806 --- /dev/null +++ b/egs/yomdle_zh/v1/run.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +set -e +stage=0 +nj=60 + +database_slam=/export/corpora5/slam/SLAM/Chinese/transcribed +database_yomdle=/export/corpora5/slam/YOMDLE/final_chinese +download_dir=data_yomdle_chinese/download/ +extra_lm=download/extra_lm.txt +data_dir=data_yomdle_chinese +exp_dir=exp_yomdle_chinese + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $stage -le -1 ]; then + local/create_download.sh --database-slam $database_slam \ + --database-yomdle $database_yomdle \ + --slam-dir download/slam_chinese \ + --yomdle-dir download/yomdle_chinese +fi + +if [ $stage -le 0 ]; then + mkdir -p data_slam_chinese/slam + mkdir -p data_yomdle_chinese/yomdle + local/process_data.py download/slam_chinese data_slam_chinese/slam + local/process_data.py download/yomdle_chinese data_yomdle_chinese/yomdle + ln -sfn ../data_slam_chinese/slam ${data_dir}/test + ln -sfn ../data_yomdle_chinese/yomdle ${data_dir}/train + image/fix_data_dir.sh ${data_dir}/test + image/fix_data_dir.sh ${data_dir}/train +fi + +mkdir -p $data_dir/{train,test}/data +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py --feat-dim 60 $data_dir/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $data_dir/train + + for datasplit in train test; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $datasplit. " + echo "Date: $(date)." 
+ local/extract_features.sh --nj $nj --cmd "$cmd" \ + --feat-dim 60 --num-channels 3 \ + $data_dir/${datasplit} + steps/compute_cmvn_stats.sh $data_dir/${datasplit} || exit 1; + done + + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." + utils/fix_data_dir.sh $data_dir/train +fi + +if [ $stage -le 2 ]; then + for datasplit in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 60 $data_dir/${datasplit} $data_dir/${datasplit}_aug $data_dir + steps/compute_cmvn_stats.sh $data_dir/${datasplit}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing dictionary and lang..." + if [ ! -f $data_dir/train/bpe.out ]; then + cut -d' ' -f2- $data_dir/train/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > $data_dir/train/bpe.out + for datasplit in test train train_aug; do + cut -d' ' -f1 $data_dir/$datasplit/text > $data_dir/$datasplit/ids + cut -d' ' -f2- $data_dir/$datasplit/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > $data_dir/$datasplit/bpe_text + mv $data_dir/$datasplit/text $data_dir/$datasplit/text.old + paste -d' ' $data_dir/$datasplit/ids $data_dir/$datasplit/bpe_text > $data_dir/$datasplit/text + done + fi + + local/prepare_dict.sh --data-dir $data_dir --dir $data_dir/local/dict + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + $data_dir/local/dict "" $data_dir/lang/temp $data_dir/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $data_dir/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." 
+ local/train_lm.sh --data-dir $data_dir --dir $data_dir/local/local_lm + utils/format_lm.sh $data_dir/lang $data_dir/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + $data_dir/local/dict/lexicon.txt $data_dir/lang_test +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir +fi + +if [ $stage -le 8 ]; then + echo "$0: Estimating a language model for lattice rescoring...$(date)" + local/train_lm_lr.sh --data-dir $data_dir --dir $data_dir/local/local_lm_lr --extra-lm $extra_lm --order 6 + + utils/build_const_arpa_lm.sh $data_dir/local/local_lm_lr/data/arpa/6gram_unpruned.arpa.gz \ + $data_dir/lang_test $data_dir/lang_test_lr + steps/lmrescore_const_arpa.sh $data_dir/lang_test $data_dir/lang_test_lr \ + $data_dir/test $exp_dir/chain/cnn_e2eali_1b/decode_test $exp_dir/chain/cnn_e2eali_1b/decode_test_lr +fi diff --git a/egs/yomdle_zh/v1/steps b/egs/yomdle_zh/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_zh/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_zh/v1/utils b/egs/yomdle_zh/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_zh/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No 
newline at end of file