mahsa7823 · mahsa7823 · Jul 15, 2018 · Jul 10, 2018 · Jul 10, 2018
diff --git a/egs/material/s5/local/chain/decode_test.sh b/egs/material/s5/local/chain/decode_test.sh
@@ -15,6 +15,7 @@ dir=exp/chain/tdnn1b_sp
 lang=data/lang_combined_chain
 tree_dir=exp/chain/tree_sp
 cmd=queue.pl
+graph_affix=_combined  # suffix appended to $tree_dir/graph to select the graph directory
 
 # training options
 chunk_width=140,100,160
@@ -99,11 +100,10 @@ if [ $stage -le 3 ]; then
         --skip-scoring true \
         --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
         --online-ivector-dir exp/nnet3/ivectors_${data}_segmented_hires \
-        $tree_dir/graph_combined ${datadir}_segmented_hires ${decode_dir} || exit 1
+        $tree_dir/graph${graph_affix} ${datadir}_segmented_hires ${decode_dir} || exit 1
 
       # resolve ctm overlaping regions, and compute wer
-      cp ${datadir}/reftext ${datadir}_segmented_hires
-      local/postprocess_test.sh ${data}_segmented ${tree_dir}/graph_combined \
+      local/postprocess_test.sh ${data}_segmented ${tree_dir}/graph${graph_affix} \
         ${decode_dir}
     ) || touch $dir/.error &
   done
@@ -165,11 +165,11 @@ if [ $stage -le 5 ]; then
         --skip-scoring true \
         --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
         --online-ivector-dir exp/nnet3/ivectors_${data}_segmented_reseg_hires \
-        $tree_dir/graph_combined ${datadir}_segmented_reseg_hires ${decode_dir} || exit 1
+        $tree_dir/graph${graph_affix} ${datadir}_segmented_reseg_hires ${decode_dir} || exit 1
 
       # resolve ctm overlaping regions, and compute wer
       cp ${datadir}/reftext ${datadir}_segmented_reseg_hires
-      local/postprocess_test.sh ${data}_segmented_reseg $tree_dir/graph_combined \
+      local/postprocess_test.sh ${data}_segmented_reseg $tree_dir/graph${graph_affix} \
         ${decode_dir}
     ) || touch $dir/.error &
   done

diff --git a/egs/material/s5/local/ctm_filter b/egs/material/s5/local/ctm_filter
@@ -0,0 +1,7 @@
+#!/usr/bin/perl
+
+# Copy input lines to stdout, dropping any that contain <noise>, <unk>,
+# <spnoise> or <sil> (matched case-insensitively).
+while (my $line = <>) {
+  print $line unless $line =~ m/<(noise|unk|spnoise|sil)>/i;
+}
diff --git a/egs/material/s5/local/parse_dev_transcripts.py b/egs/material/s5/local/parse_dev_transcripts.py
@@ -0,0 +1,195 @@
+#! /usr/bin/env python3
+
+import sys
+import os
+import re
+
+
+def normalize_text(text):
+    """Map transcript markup to the tokens used in the text file.
+
+    Punctuation and <no-speech> are dropped, noise and unknown-speech tags
+    become <spnoise>, <noise> or <unk>, and *word* is unwrapped to word.
+    """
+    mapping = {"%incomplete": "<unk>"}
+    mapping.update(dict.fromkeys(["<no-speech>", "--", ".", "?", "~"], ""))
+    mapping.update(dict.fromkeys(
+        ["<cough>", "<laugh>", "<lipsmack>", "<hes>"], "<spnoise>"))
+    mapping.update(dict.fromkeys(["<breath>", "<sta>"], "<noise>"))
+    mapping.update(dict.fromkeys(
+        ["<int>", "(())", "<foreign>", "<overlap>", "<misc>"], "<unk>"))
+
+    # change *word* into word
+    words = [re.sub(r"^[*](\S+)[*]$", r"\1", mapping.get(w, w))
+             for w in text.split()]
+    # Leave dropped tokens out so no stray spaces end up in the result.
+    return " ".join(w for w in words if w)
+
+
+def write_segment(start_time, end_time, text, reco_id,
+                  segments_fh, utt2spk_fh, text_fh):
+    """Write one segment's lines to the segments, utt2spk and text files.
+
+    The utterance id is <reco_id>-<start>-<end> with both times in
+    centiseconds, zero-padded to six digits.  text is normalized before it
+    is written.  Raises AssertionError unless end_time > start_time.
+    """
+    assert end_time > start_time
+
+    # round() before int(): 1.15 * 100 is 114.99999999999999 in floating
+    # point, which int() alone would truncate to 114.
+    utt_id = "{0}-{1:06d}-{2:06d}".format(
+        reco_id, int(round(start_time * 100)), int(round(end_time * 100)))
+
+    print("{0} {1} {2} {3}".format(utt_id, reco_id, start_time, end_time),
+          file=segments_fh)
+    print("{0} {1}".format(utt_id, reco_id), file=utt2spk_fh)
+    print("{0} {1}".format(utt_id, normalize_text(text)),
+          file=text_fh)
+
+
+def parse_calls_transcript_file(transcript_file, segments_fh,
+                                utt2spk_fh, text_fh):
+    """Write the segments of a two-channel call transcript.
+
+    Each line reads "<time> <inLine|outLine> <words...>"; a channel's
+    segment lasts until that channel's next line.  A line holding only a
+    time ends the file and closes the open segment of each channel.
+    """
+    base_name = os.path.basename(transcript_file)
+    # Escape the dots and anchor the suffix so only the extension is cut.
+    file_id = re.sub(r"\.transcription\.txt$", "", base_name)
+
+    pending = {}  # channel -> (start_time, text) of its still-open segment
+
+    def flush(channel, end_time):
+        # A channel with no open segment (it never spoke) is skipped.
+        if channel in pending:
+            seg_start, seg_text = pending.pop(channel)
+            write_segment(seg_start, end_time, seg_text,
+                          file_id + "_" + channel,
+                          segments_fh, utt2spk_fh, text_fh)
+
+    with open(transcript_file) as fh:
+        first = True
+        for line in fh:
+            parts = line.split()
+            if not parts:
+                continue  # tolerate blank lines
+            if first and not parts[0].startswith('0'):
+                raise Exception("Transcript file {0} does not start with 0.000"
+                                "".format(transcript_file))
+            first = False
+
+            start_time = float(parts[0])
+            if len(parts) == 1:
+                # Last line in the file
+                flush("inLine", start_time)
+                flush("outLine", start_time)
+                break
+
+            assert parts[1] in ["inLine", "outLine"]
+            flush(parts[1], start_time)
+            pending[parts[1]] = (start_time, " ".join(parts[2:]))
+
+
+def parse_non_calls_transcript_file(transcript_file, segments_fh,
+                                    utt2spk_fh, text_fh):
+    """Write the segments of a single-channel transcript.
+
+    The file alternates "[<time>]" lines and text lines, starting and
+    ending with a time; each text line becomes one segment bounded by the
+    time-stamps around it.  Raises if a time-stamp is missing or malformed.
+    """
+    base_name = os.path.basename(transcript_file)
+    # Escape the dots and anchor the suffix so only the extension is cut.
+    file_id = re.sub(r"\.transcription\.txt$", "", base_name)
+
+    def parse_time(line):
+        # "[12.34]" -> 12.34; report the offending line before re-raising.
+        try:
+            return float(re.sub(r"\[([^\]]+)\]", r"\1", line))
+        except Exception:
+            print("Could not parse line {0}".format(line), file=sys.stderr)
+            raise
+
+    with open(transcript_file) as fh:
+        line = fh.readline().strip()
+        if not line.startswith('['):
+            raise Exception("Transcript file {0} does not start with [0.000"
+                            "".format(transcript_file))
+        start_time = parse_time(line)
+        for text in iter(fh.readline, ''):  # '' marks the end of the file
+            text = text.strip()
+            line = fh.readline().strip()
+            if not line.startswith('['):
+                raise Exception("Time-stamp in transcript file {0} does not start with [; error parsing line {1} after text {2}"
+                                "".format(transcript_file, line, text))
+            end_time = parse_time(line)
+            write_segment(start_time, end_time, text, file_id,
+                          segments_fh, utt2spk_fh, text_fh)
+            start_time = end_time
+
+
+if __name__ == "__main__":
+    # Write wav.scp, utt2spk, reco2file_and_channel, text and segments into
+    # <data-dir> for the recordings named in the two list files; transcripts
+    # and audio are read from <corpus-root-dir>/transcription and
+    # <corpus-root-dir>/src.
+    if len(sys.argv) != 5:
+        # .format() is required for {0} to show the script name.
+        print("Usage: {0} <corpus-root-dir> <calls-list> <non-calls-list> <data-dir>"
+              "".format(sys.argv[0]), file=sys.stderr)
+        raise SystemExit(1)
+
+    root_path = sys.argv[1]
+    data_dir = sys.argv[4]
+    # Read both lists up front so their handles are closed straight away.
+    with open(sys.argv[2]) as list_fh:
+        calls_list = list_fh.readlines()
+    with open(sys.argv[3]) as list_fh:
+        non_calls_list = list_fh.readlines()
+
+    def paths_for(file_id):
+        # (transcript path, wav path) of one recording under root_path
+        return ("{0}/transcription/{1}.transcription.txt".format(root_path, file_id),
+                "{0}/src/{1}.wav".format(root_path, file_id))
+
+    # "with" closes every output file even if a transcript fails to parse.
+    with open("{0}/wav.scp".format(data_dir), 'w') as wav_scp_fh, \
+         open("{0}/utt2spk".format(data_dir), 'w') as utt2spk_fh, \
+         open("{0}/reco2file_and_channel".format(data_dir), 'w') \
+             as reco2file_and_channel_fh, \
+         open("{0}/text".format(data_dir), 'w') as text_fh, \
+         open("{0}/segments".format(data_dir), 'w') as segments_fh:
+
+        for line in calls_list:
+            file_id = line.strip()
+            transcript_file, wav_file = paths_for(file_id)
+
+            for channel in [1, 2]:
+                reco_id = file_id + ("_inLine" if channel == 1 else "_outLine")
+                print("{reco_id} {file_id} {channel}"
+                      "".format(reco_id=reco_id, file_id=file_id,
+                                channel="A" if channel == 1 else "B"),
+                      file=reco2file_and_channel_fh)
+                print("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - remix {channel} |"
+                      "".format(reco_id=reco_id, wav_file=wav_file, channel=channel),
+                      file=wav_scp_fh)
+
+            parse_calls_transcript_file(transcript_file, segments_fh,
+                                        utt2spk_fh, text_fh)
+
+        for line in non_calls_list:
+            file_id = line.strip()
+            transcript_file, wav_file = paths_for(file_id)
+
+            print("{file_id} {file_id} 1"
+                  "".format(file_id=file_id),
+                  file=reco2file_and_channel_fh)
+            print("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - |"
+                  "".format(reco_id=file_id, wav_file=wav_file),
+                  file=wav_scp_fh)
+
+            parse_non_calls_transcript_file(transcript_file, segments_fh,
+                                            utt2spk_fh, text_fh)
diff --git a/egs/material/s5/local/postprocess_test.sh b/egs/material/s5/local/postprocess_test.sh
@@ -1,37 +1,43 @@
 #!/bin/sh
 set -euo pipefail
-set -e -o pipefail                                                              
-set -o nounset                              # Treat unset variables as an error 
 echo "$0 $@"
 
-data=$1
-graph_dir=$2
-decode_dir=$3
+stage=0  # first stage to run; blocks guarded by a lower stage number are skipped
 
 . ./cmd.sh
 . ./path.sh
 . ./utils/parse_options.sh
 
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <data-id> <graph-dir> <decode-dir>"
+  echo " e.g.: $0 analysis1 exp/chain/tdnn/graph exp/chain/tdnn/decode_analysis1_segmented"
+  exit 1
+fi
+
+data=$1        # data id: data/${data}_hires is the data directory read below
+graph_dir=$2   # passed on to get_ctm_fast.sh and score_stm.sh
+decode_dir=$3  # the CTMs are written under ${decode_dir}/score_10_0.0
 
 # get recording-level CTMs from the lattice by resolving the overlapping
 # regions
 
-steps/get_ctm_fast.sh --frame-shift 0.03 \
-  data/${data}_hires/ ${graph_dir} \
-  ${decode_dir} ${decode_dir}/score_10/
-
-cat ${decode_dir}/score_10/ctm.* \
-  > ${decode_dir}/score_10/ctm
-
-awk '{print $2" "$2" 1"}' data/${data}_hires/segments > \
-  data/${data}_hires/reco2file_and_channel
-
-utils/ctm/resolve_ctm_overlaps.py data/${data}_hires/segments \
-  ${decode_dir}/score_10/ctm \
-  - | utils/convert_ctm.pl data/${data}_hires/segments \
-  data/${data}_hires/reco2file_and_channel > \
-  ${decode_dir}/score_10/ctm_out
-
-# compute WER              
-local/score_segments.sh data/${data}_hires/ ${decode_dir}
-
+if [ $stage -le 0 ]; then  # stage 0: CTMs go to ${decode_dir}/score_10_0.0
+  steps/get_ctm_fast.sh --cmd "$decode_cmd" --frame-shift 0.03 \
+    data/${data}_hires/ ${graph_dir} \
+    ${decode_dir} ${decode_dir}/score_10_0.0
+fi
+
+if [ $stage -le 1 ]; then  # stage 1: resolve overlaps in that CTM and convert it
+  utils/ctm/resolve_ctm_overlaps.py data/${data}_hires/segments \
+    ${decode_dir}/score_10_0.0/ctm \
+    - | utils/convert_ctm.pl data/${data}_hires/segments data/${data}_hires/reco2file_and_channel > \
+    ${decode_dir}/score_10_0.0/${data}_hires.ctm
+fi
+
+if [ $stage -le 2 ]; then  # stage 2: LM weight 10, word insertion penalty 0.0
+  # compute WER
+  local/score_stm.sh --min-lmwt 10 --max-lmwt 10 --word-ins-penalty 0.0 \
+    --cmd "$decode_cmd" data/${data}_hires $graph_dir ${decode_dir}
+
+  grep -H Sum ${decode_dir}/score*/*.sys | utils/best_wer.sh
+fi