[speechx] speedup ngram building #1729

Merged
merged 4 commits on Apr 21, 2022
Changes from 3 commits
18 changes: 12 additions & 6 deletions speechx/examples/ds2_ol/aishell/local/split_data.sh
@@ -1,24 +1,30 @@
#!/usr/bin/env bash

set -eo pipefail

data=$1
feat_scp=$2
split_feat_name=$3
scp=$2
split_name=$3
numsplit=$4

# save in $data/split{n}
# $scp to split
#

if ! [ "$numsplit" -gt 0 ]; then
if [[ ! $numsplit -gt 0 ]]; then
echo "Invalid num-split argument";
exit 1;
fi

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
echo $feat_split_scp
scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)

# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
for n in `seq $numsplit`; do
mkdir -p $data/split${numsplit}/$n
done
fi

utils/split_scp.pl $feat_scp $feat_split_scp
echo "utils/split_scp.pl $scp $scp_splits"
utils/split_scp.pl $scp $scp_splits
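
(A minimal usage sketch of the generalized helper; the scp path and job count below are illustrative, not taken from run.sh.)

```bash
# Split an arbitrary scp (no longer only feature scps) into nj shards,
# written to $data/split${nj}/{1..nj}/<split_name>.
data=./data
nj=8                                        # illustrative job count
./local/split_data.sh $data $data/feats.scp feats.scp $nj
# Resulting layout: data/split8/1/feats.scp ... data/split8/8/feats.scp,
# which parallel stages can consume as $data/split${nj}/JOB/feats.scp.
```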
45 changes: 23 additions & 22 deletions speechx/examples/ds2_ol/aishell/run.sh
@@ -29,7 +29,7 @@ vocb_dir=$ckpt_dir/data/lang_char/
mkdir -p exp
exp=$PWD/exp

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
pushd $data
@@ -42,11 +42,12 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi


if [ ! -d $ckpt_dir ]; then
if [ ! -f $ckpt_dir/data/mean_std.json ]; then
mkdir -p $ckpt_dir
wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
popd
fi

lm=$data/zh_giga.no_cna_cmn.prune01244.klm
@@ -65,7 +66,7 @@ wer=./aishell_wer
export GLOG_logtostderr=1


if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 3. gen linear feat
cmvn=$data/cmvn.ark
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
@@ -80,7 +81,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
--streaming_chunk=0.36
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
ctc-prefix-beam-search-decoder-ol \
@@ -92,10 +93,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
--result_wspecifier=ark,t:$data/split${nj}/JOB/result

cat $data/split${nj}/*/result > $exp/${label_file}
utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer}
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer}
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol \
@@ -108,21 +109,21 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm

cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi

wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi
graph_dir=$wfst/aishell_graph

graph_dir=$wfst/aishell_graph
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
@@ -136,5 +137,5 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
fi
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg
fi
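
(The compute-wer.py calls above now pass the reference transcripts first and the decoder output second; a minimal sketch of that convention, with illustrative paths:)

```bash
# Reference transcripts first, hypothesis second (paths are illustrative).
ref=data/test/text             # ground-truth transcripts
hyp=exp/aishell_result_lm      # decoder output from the LM run
utils/compute-wer.py --char=1 --v=1 $ref $hyp > exp/aishell_wer.lm
```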
19 changes: 13 additions & 6 deletions speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -3,6 +3,7 @@
# To be run from one directory above this script.
. ./path.sh

nj=40
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

@@ -31,21 +32,27 @@ cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
text_dir=$(dirname $text)
split_name=$(basename $text)
./local/split_data.sh $text_dir $text $split_name $nj

utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \
cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
\> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1;
cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext

# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \
sort --parallel=`nproc` -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1;

# word with <s> </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
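
(For reference, the OOV mapping that each parallel job performs, shown here on a single shard without run.pl; paths assume the nj=40 split above:)

```bash
# Words absent from the lexicon are rewritten as <SPOKEN_NOISE>; the utt id in
# column 1 is normally not in the lexicon, so it also maps to <SPOKEN_NOISE>,
# as described in the script's header comment.
lexicon=data/local/dict/lexicon.txt
shard=data/local/lm/split40/1/text
awk -v lex=$lexicon '
  BEGIN { while ((getline < lex) > 0) seen[$1] = 1 }
  { for (n = 1; n <= NF; n++)
      printf("%s ", (seen[$n] ? $n : "<SPOKEN_NOISE>"));
    printf("\n") }
' $shard > ${shard}.no_oov
```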
30 changes: 30 additions & 0 deletions speechx/examples/ngram/zh/local/split_data.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

set -eo pipefail

data=$1
scp=$2
split_name=$3
numsplit=$4

# save in $data/split{n}
# $scp to split
#

if [[ ! $numsplit -gt 0 ]]; then
echo "Invalid num-split argument";
exit 1;
fi

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)

# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
for n in `seq $numsplit`; do
mkdir -p $data/split${numsplit}/$n
done
fi

echo "utils/split_scp.pl $scp $scp_splits"
utils/split_scp.pl $scp $scp_splits
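
(As invoked from local/aishell_train_lms.sh above; nj=40 follows that script's default, and the resulting shard layout is sketched in the comments:)

```bash
# Splits data/local/lm/text into nj shards:
#   data/local/lm/split40/1/text
#   data/local/lm/split40/2/text
#   ...
#   data/local/lm/split40/40/text
nj=40
text=data/local/lm/text
text_dir=$(dirname $text)       # data/local/lm
split_name=$(basename $text)    # text
./local/split_data.sh $text_dir $text $split_name $nj
```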
31 changes: 30 additions & 1 deletion speechx/examples/text_lm/README.md
@@ -1,6 +1,35 @@
# Text PreProcess for building ngram LM

Output `text` file like this:
## Input

```
data/
|-- text
```

The input file is Kaldi-style, with the `utt` id in the first column:
```
Y0000000000_--5llN02F84_S00000 怎么样这些日子住得还习惯吧
Y0000000000_--5llN02F84_S00002 挺好的
Y0000000000_--5llN02F84_S00003 对了美静这段日子经常不和我们一起用餐
Y0000000000_--5llN02F84_S00004 是不是对我回来有什么想法啊
Y0000000000_--5llN02F84_S00005 哪有的事啊
Y0000000000_--5llN02F84_S00006 她这两天挺累的身体也不太舒服
Y0000000000_--5llN02F84_S00007 我让她多睡一会那就好如果要是觉得不方便
Y0000000000_--5llN02F84_S00009 我就搬出去住
Y0000000000_--5llN02F84_S00010 你看你这个人你就是疑心太重
Y0000000000_--5llN02F84_S00011 你现在多好一切都井然有序的
```


## Output

```
data/
`-- text.tn
```

The output file looks like this:

```
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购