[speechx] speedup ngram building #1729

Merged
merged 4 commits on Apr 21, 2022
Changes from 3 commits
18 changes: 12 additions & 6 deletions speechx/examples/ds2_ol/aishell/local/split_data.sh
@@ -1,24 +1,30 @@
#!/usr/bin/env bash

set -eo pipefail

data=$1
feat_scp=$2
split_feat_name=$3
scp=$2
split_name=$3
numsplit=$4

# save in $data/split{n}
# $scp to split
#

if ! [ "$numsplit" -gt 0 ]; then
if [[ ! $numsplit -gt 0 ]]; then
echo "Invalid num-split argument";
exit 1;
fi

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done)
echo $feat_split_scp
scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)

# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
for n in `seq $numsplit`; do
mkdir -p $data/split${numsplit}/$n
done
fi

utils/split_scp.pl $feat_scp $feat_split_scp
echo "utils/split_scp.pl $scp $scp_splits"
utils/split_scp.pl $scp $scp_splits
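
(A minimal usage sketch of the generalized helper; the scp path and job count below are illustrative, not taken from run.sh.)

```bash
# Split an arbitrary scp (no longer only feature scps) into nj shards,
# written to $data/split${nj}/{1..nj}/<split_name>.
data=./data
nj=8                                        # illustrative job count
./local/split_data.sh $data $data/feats.scp feats.scp $nj
# Resulting layout: data/split8/1/feats.scp ... data/split8/8/feats.scp,
# which parallel stages can consume as $data/split${nj}/JOB/feats.scp.
```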
45 changes: 23 additions & 22 deletions speechx/examples/ds2_ol/aishell/run.sh
@@ -29,7 +29,7 @@ vocb_dir=$ckpt_dir/data/lang_char/
mkdir -p exp
exp=$PWD/exp

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
pushd $data
@@ -42,11 +42,12 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi


if [ ! -d $ckpt_dir ]; then
if [ ! -f $ckpt_dir/data/mean_std.json ]; then
mkdir -p $ckpt_dir
wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
popd
fi

lm=$data/zh_giga.no_cna_cmn.prune01244.klm
@@ -65,7 +66,7 @@ wer=./aishell_wer
export GLOG_logtostderr=1


if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 3. gen linear feat
cmvn=$data/cmvn.ark
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
@@ -80,7 +81,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
--streaming_chunk=0.36
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
ctc-prefix-beam-search-decoder-ol \
@@ -92,10 +93,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
--result_wspecifier=ark,t:$data/split${nj}/JOB/result

cat $data/split${nj}/*/result > $exp/${label_file}
utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer}
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer}
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol \
@@ -108,21 +109,21 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm

cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi

wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi
graph_dir=$wfst/aishell_graph

graph_dir=$wfst/aishell_graph
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
@@ -136,5 +137,5 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
fi
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg
fi
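
(The compute-wer.py calls above now pass the reference transcripts first and the decoder output second; a minimal sketch of that convention, with illustrative paths:)

```bash
# Reference transcripts first, hypothesis second (paths are illustrative).
ref=data/test/text             # ground-truth transcripts
hyp=exp/aishell_result_lm      # decoder output from the LM run
utils/compute-wer.py --char=1 --v=1 $ref $hyp > exp/aishell_wer.lm
```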
19 changes: 13 additions & 6 deletions speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -3,6 +3,7 @@
# To be run from one directory above this script.
. ./path.sh

nj=40
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

@@ -31,21 +32,27 @@ cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
text_dir=$(dirname $text)
split_name=$(basename $text)
./local/split_data.sh $text_dir $text $split_name $nj

utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \
cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
\> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1;
cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext

# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \
sort --parallel=`nproc` -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1;

# word with <s> </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
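
(For reference, the OOV mapping that each parallel job performs, shown here on a single shard without run.pl; paths assume the nj=40 split above:)

```bash
# Words absent from the lexicon are rewritten as <SPOKEN_NOISE>; the utt id in
# column 1 is normally not in the lexicon, so it also maps to <SPOKEN_NOISE>,
# as described in the script's header comment.
lexicon=data/local/dict/lexicon.txt
shard=data/local/lm/split40/1/text
awk -v lex=$lexicon '
  BEGIN { while ((getline < lex) > 0) seen[$1] = 1 }
  { for (n = 1; n <= NF; n++)
      printf("%s ", (seen[$n] ? $n : "<SPOKEN_NOISE>"));
    printf("\n") }
' $shard > ${shard}.no_oov
```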
30 changes: 30 additions & 0 deletions speechx/examples/ngram/zh/local/split_data.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

set -eo pipefail

data=$1
scp=$2
split_name=$3
numsplit=$4

# save in $data/split{n}
# $scp to split
#

if [[ ! $numsplit -gt 0 ]]; then
echo "Invalid num-split argument";
exit 1;
fi

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)

# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
for n in `seq $numsplit`; do
mkdir -p $data/split${numsplit}/$n
done
fi

echo "utils/split_scp.pl $scp $scp_splits"
utils/split_scp.pl $scp $scp_splits
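
(As invoked from local/aishell_train_lms.sh above; nj=40 follows that script's default, and the resulting shard layout is sketched in the comments:)

```bash
# Splits data/local/lm/text into nj shards:
#   data/local/lm/split40/1/text
#   data/local/lm/split40/2/text
#   ...
#   data/local/lm/split40/40/text
nj=40
text=data/local/lm/text
text_dir=$(dirname $text)       # data/local/lm
split_name=$(basename $text)    # text
./local/split_data.sh $text_dir $text $split_name $nj
```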
31 changes: 30 additions & 1 deletion speechx/examples/text_lm/README.md
@@ -1,6 +1,35 @@
# Text PreProcess for building ngram LM

Output `text` file like this:
## Input

```
data/
|-- text
```

The input file is Kaldi-style, with the `utt` id in the first column:
```
Y0000000000_--5llN02F84_S00000 怎么样这些日子住得还习惯吧
Y0000000000_--5llN02F84_S00002 挺好的
Y0000000000_--5llN02F84_S00003 对了美静这段日子经常不和我们一起用餐
Y0000000000_--5llN02F84_S00004 是不是对我回来有什么想法啊
Y0000000000_--5llN02F84_S00005 哪有的事啊
Y0000000000_--5llN02F84_S00006 她这两天挺累的身体也不太舒服
Y0000000000_--5llN02F84_S00007 我让她多睡一会那就好如果要是觉得不方便
Y0000000000_--5llN02F84_S00009 我就搬出去住
Y0000000000_--5llN02F84_S00010 你看你这个人你就是疑心太重
Y0000000000_--5llN02F84_S00011 你现在多好一切都井然有序的
```


## Output

```
data/
`-- text.tn
```

The output file looks like this:

```
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购