From bb411a143c242551c896e4a8b3fd7fe0f3d065b5 Mon Sep 17 00:00:00 2001 From: zhuang Date: Wed, 8 May 2019 17:52:06 -0400 Subject: [PATCH 1/5] add VB for callhome v2 --- .../v1/diarization/VB_diarization.py | 359 ++++++++++ .../v1/diarization/VB_resegmentation.py | 224 +++++++ .../v1/diarization/VB_resegmentation.sh | 102 +++ .../v1/diarization/convert_VB_model.py | 118 ++++ .../v1/diarization/convert_VB_model.sh | 53 ++ .../v1/diarization/kaldi_io.py | 627 ++++++++++++++++++ .../train_ivector_extractor_diag.sh | 151 +++++ .../v1/local/make_callhome.sh | 5 + egs/callhome_diarization/v2/run.sh | 51 +- src/ivectorbin/Makefile | 2 +- src/ivectorbin/ivector-extractor-copy.cc | 64 ++ 11 files changed, 1754 insertions(+), 2 deletions(-) create mode 100755 egs/callhome_diarization/v1/diarization/VB_diarization.py create mode 100755 egs/callhome_diarization/v1/diarization/VB_resegmentation.py create mode 100755 egs/callhome_diarization/v1/diarization/VB_resegmentation.sh create mode 100755 egs/callhome_diarization/v1/diarization/convert_VB_model.py create mode 100755 egs/callhome_diarization/v1/diarization/convert_VB_model.sh create mode 100755 egs/callhome_diarization/v1/diarization/kaldi_io.py create mode 100755 egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh create mode 100755 src/ivectorbin/ivector-extractor-copy.cc diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py new file mode 100755 index 00000000000..31af078efd2 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py @@ -0,0 +1,359 @@ +# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Revision History +# L. Burget 16/07/13 01:00AM - original version +# L. Burget 20/06/17 12:07AM - np.asarray replaced by .toarray() +# - minor bug fix in initializing q +# - minor bug fix in ELBO calculation +# - few more optimizations + +import numpy as np +from scipy.sparse import coo_matrix +import scipy.linalg as spl +import numexpr as ne # the dependency on this modul can be avoided by replacing + # logsumexp_ne and exp_ne with logsumexp and np.exp + +#[q sp Li] = +def VB_diarization(X, m, iE, w, V, sp=None, q=None, + maxSpeakers = 10, maxIters = 10, + epsilon = 1e-4, loopProb = 0.99, statScale = 1.0, + alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None, + plot=False, sparsityThr=0.001, llScale=1.0, minDur=1): + + """ + This a generalized version of speaker diarization described in: + + Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + Montreal, CRIM, May 2008. + + Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone + Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal + Processing, December 2010. + + The generalization introduced in this implementation lies in using an HMM + instead of the simple mixture model when modeling generation of segments + (or even frames) from speakers. 
HMM limits the probability of switching + between speakers when changing frames, which makes it possible to use + the model on frame-by-frame bases without any need to iterate between + 1) clustering speech segments and 2) re-segmentation (i.e. as it was done in + the paper above). + + Inputs: + X - T x D array, where columns are D dimensional feature vectors for T frames + m - C x D array of GMM component means + iE - C x D array of GMM component inverse covariance matrix diagonals + w - C dimensional column vector of GMM component weights + V - R x C x D array of eigenvoices + maxSpeakers - maximum number of speakers expected in the utterance + maxIters - maximum number of algorithm iterations + epsilon - stop iterating, if obj. fun. improvement is less than epsilon + loopProb - probability of not switching speakers between frames + statScale - scale sufficient statiscits collected using UBM + llScale - scale UBM likelihood (i.e. llScale < 1.0 make atribution of + frames to UBM componets more uncertain) + sparsityThr - set occupations smaller that this threshold to 0.0 (saves memory + as the posteriors are represented by sparse matrix) + alphaQInit - Dirichlet concentraion parameter for initializing q + downsample - perform diarization on input downsampled by this factor + VtiEV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when + VtiEV is None. However, it can be pre-calculated using function + precalculate_VtiEV(V) and used across calls of VB_diarization. + minDur - minimum number of frames between speaker turns imposed by linear + chains of HMM states corresponding to each speaker. All the states + in a chain share the same output distribution + ref - T dim. integer vector with reference speaker ID (0:maxSpeakers) + per frame + plot - if set to True, plot per-frame speaker posteriors. + + Outputs: + q - S x T matrix of posteriors attribution each frame to one of S possible + speakers, where S is given by opts.maxSpeakers + sp - S dimensional column vector of ML learned speaker priors. Ideally, these + should allow to estimate # of speaker in the utterance as the + probabilities of the redundant speaker should converge to zero. + Li - values of auxiliary function (and DER and frame cross-entropy between q + and reference if 'ref' is provided) over iterations. + """ + + # The references to equations corresponds to the technical report: + # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + # Montreal, CRIM, May 2008. + + D=X.shape[1] # feature dimensionality + C=len(w) # number of mixture components + R=V.shape[0] # subspace rank + nframes=X.shape[0] + + if VtiEV is None: + VtiEV = precalculate_VtiEV(V, iE) + + V = V.reshape(V.shape[0],-1) + + if sp is None: + sp = np.ones(maxSpeakers)/maxSpeakers + else: + maxSpeakers = len(sp) + + if q is None: + # initialize q from flat Dirichlet prior with concentrsaion parameter alphaQInit + q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) + q = q / q.sum(1, keepdims=True) + + # calculate UBM mixture frame posteriors (i.e. 
per-frame zero order statistics) + ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) + ll *= llScale + G = logsumexp_ne(ll, axis=1) + NN = exp_ne(ll - G[:,np.newaxis]) * statScale + NN[NN 0 and L - Li[-2][0] < epsilon: + if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') + break + + if downsample is not None: + #upsample resulting q to match number of frames in the input utterance + q = downsampler.T.dot(q) + + return q, sp, Li + + +def precalculate_VtiEV(V, iE): + tril_ind = np.tril_indices(V.shape[0]) + VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) + for c in range(V.shape[1]): + VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind] + return VtiEV + + +# Initialize q (per-frame speaker posteriors) from a reference +# (vector of per-frame zero based integer speaker IDs) +def frame_labels2posterior_mx(labels, maxSpeakers): + #initialize from reference + #pmx = np.zeros((len(labels), labels.max()+1)) + pmx = np.zeros((len(labels), maxSpeakers)) + pmx[np.arange(len(labels)), labels] = 1 + return pmx + +# Calculates Diarization Error Rate (DER) or per-frame cross-entropy between +# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame +# speaker posteriors). If expected=False, q is converted into hard labels before +# calculating DER. If expected=TRUE, posteriors in q are used to calculated +# "expected" DER. +def DER(q, ref, expected=True, xentropy=False): + from itertools import permutations + + if not expected: + # replce probabiities in q by zeros and ones + hard_labels = q.argmax(1) + q = np.zeros_like(q) + q[range(len(q)), hard_labels] = 1 + + err_mx = np.empty((ref.max()+1, q.shape[1])) + for s in range(err_mx.shape[0]): + tmpq = q[ref == s,:] + err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0) + + if err_mx.shape[0] < err_mx.shape[1]: + err_mx = err_mx.T + + # try all alignments (permutations) of reference and detected speaker + #could be written in more efficient way using dynamic programing + acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum() + for perm in permutations(range(err_mx.shape[0]))] + if xentropy: + return min(acc)/float(len(ref)) + else: + return (len(ref) - max(acc))/float(len(ref)) + + +############################################################################### +# Module private functions +############################################################################### +def logsumexp(x, axis=0): + xmax = x.max(axis) + x = xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis)) + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +# The folowing two functions are only versions optimized for speed using numexpr +# module and can be replaced by logsumexp and np.exp functions to avoid +# the dependency on the module. 
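As a standalone illustration (a sketch with made-up numbers, not part of the
patch), this is the log-sum-exp trick that logsumexp above and logsumexp_ne
below implement: subtracting the per-row maximum keeps the reduction finite
where the naive formula overflows.

    import numpy as np

    def logsumexp_stable(x, axis=0):
        # subtract the per-row max so exp() cannot overflow
        xmax = x.max(axis=axis, keepdims=True)
        return np.squeeze(xmax + np.log(np.exp(x - xmax).sum(axis=axis, keepdims=True)), axis=axis)

    x = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]])
    print(logsumexp_stable(x, axis=1))    # [1000.69314718  -999.30685282]
    print(np.log(np.exp(x).sum(axis=1)))  # [ inf -inf] -- the naive version over/underflows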
+def logsumexp_ne(x, axis=0): + xmax = np.array(x).max(axis=axis) + xmax_e = np.expand_dims(xmax, axis) + x = ne.evaluate("sum(exp(x - xmax_e), axis=%d)" % axis) + x = ne.evaluate("xmax + log(x)") + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +def exp_ne(x, out=None): + return ne.evaluate("exp(x)", out=None) + + +# Convert vector with lower-triangular coefficients into symetric matrix +def tril_to_sym(tril): + R = np.sqrt(len(tril)*2).astype(int) + tril_ind = np.tril_indices(R) + S = np.empty((R,R)) + S[tril_ind] = tril + S[tril_ind[::-1]] = tril + return S + + +def logdet(A): + return 2*np.sum(np.log(np.diag(spl.cholesky(A)))) + + +def forward_backward(lls, tr, ip): + """ + Inputs: + lls - matrix of per-frame log HMM state output probabilities + tr - transition probability matrix + ip - vector of initial state probabilities (i.e. statrting in the state) + Outputs: + sp - matrix of per-frame state occupation posteriors + tll - total (forward) log-likelihood + lfw - log forward probabilities + lfw - log backward probabilities + """ + ltr = np.log(tr) + lfw = np.empty_like(lls) + lbw = np.empty_like(lls) + lfw[:] = -np.inf + lbw[:] = -np.inf + lfw[0] = lls[0] + np.log(ip) + lbw[-1] = 0.0 + + for ii in xrange(1,len(lls)): + lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1) + + for ii in reversed(xrange(len(lls)-1)): + lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1) + + tll = logsumexp(lfw[-1]) + sp = np.exp(lfw + lbw - tll) + return sp, tll, lfw, lbw diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py new file mode 100755 index 00000000000..65b72d55a8e --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python + +# Copyright 2019 Zili Huang + +# This script is evoked by diarization/VB_resegmentation.sh. It is a wrapper +# for Variational Bayes resegmentation. It shows how to use the code from +# Brno University of Technology to do resegmentation. + +import numpy as np +import VB_diarization +import pickle +import kaldi_io +import sys +import argparse + +def get_utt_list(utt2spk_filename): + with open(utt2spk_filename, 'r') as fh: + content = fh.readlines() + utt_list = [line.split()[0] for line in content] + print("{} utterances in total".format(len(utt_list))) + return utt_list + +# prepare utt2num_frames dictionary +def get_utt2num_frames(utt2num_frames_filename): + utt2num_frames = {} + with open(utt2num_frames_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + utt2num_frames[line_split[0]] = int(line_split[1]) + return utt2num_frames + +def create_ref(uttname, utt2num_frames, full_rttm_filename): + num_frames = utt2num_frames[uttname] + + # We use 0 to denote silence frames and 1 to denote overlapping frames. 
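+    # For example, a hypothetical 6-frame utterance with speakers A and B
+    # (mapped to labels 2 and 3 below) would be encoded as
+    #   frames: [sil, A, A, A+B, B, sil]  ->  ref: [0, 2, 2, 1, 3, 0]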
+ ref = np.zeros(num_frames) + speaker_dict = {} + num_spk = 0 + + with open(full_rttm_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + uttname_line = line_split[1] + if uttname != uttname_line: + continue + start_time, duration = int(float(line_split[3]) * 100), int(float(line_split[4]) * 100) + end_time = start_time + duration + spkname = line_split[7] + if spkname not in speaker_dict.keys(): + spk_idx = num_spk + 2 + speaker_dict[spkname] = spk_idx + num_spk += 1 + + for i in range(start_time, end_time): + if i < 0: + raise ValueError("Time index less than 0") + elif i >= num_frames: + print("Time index exceeds number of frames") + break + else: + if ref[i] == 0: + ref[i] = speaker_dict[spkname] + else: + ref[i] = 1 # The overlapping speech is marked as 1. + return ref.astype(int) + +# create output rttm file +def create_rttm_output(uttname, predicted_label, output_dir, channel): + num_frames = len(predicted_label) + + start_idx = 0 + seg_list = [] + + last_label = predicted_label[0] + for i in range(num_frames): + if predicted_label[i] == last_label: # The speaker label remains the same. + continue + else: # The speaker label is different. + if last_label != 0: # Ignore the silence. + seg_list.append([start_idx, i, last_label]) + start_idx = i + last_label = predicted_label[i] + if last_label != 0: + seg_list.append([start_idx, num_frames, last_label]) + + with open("{}/{}_predict.rttm".format(output_dir, uttname), 'w') as fh: + for i in range(len(seg_list)): + start_frame = (seg_list[i])[0] + end_frame = (seg_list[i])[1] + label = (seg_list[i])[2] + duration = end_frame - start_frame + fh.write("SPEAKER {} {} {:.2f} {:.2f} {} \n".format(uttname, channel, start_frame / 100.0, duration / 100.0, label)) + return 0 + +def main(): + parser = argparse.ArgumentParser(description='VB Resegmentation Wrapper') + parser.add_argument('data_dir', type=str, help='Subset data directory') + parser.add_argument('init_rttm_filename', type=str, + help='The rttm file to initialize the VB system, usually the AHC cluster result') + parser.add_argument('output_dir', type=str, help='Output directory') + parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model') + parser.add_argument('ie_model', type=str, help='Path of the ivector extractor model') + + parser.add_argument('--true-rttm-filename', type=str, default="None", + help='The true rttm label file') + parser.add_argument('--max-speakers', type=int, default=10, + help='Maximum number of speakers expected in the utterance (default: 10)') + parser.add_argument('--max-iters', type=int, default=10, + help='Maximum number of algorithm iterations (default: 10)') + parser.add_argument('--downsample', type=int, default=25, + help='Perform diarization on input downsampled by this factor (default: 25)') + parser.add_argument('--alphaQInit', type=float, default=100.0, + help='Dirichlet concentraion parameter for initializing q') + parser.add_argument('--sparsityThr', type=float, default=0.001, + help='Set occupations smaller that this threshold to 0.0 (saves memory as \ + the posteriors are represented by sparse matrix)') + parser.add_argument('--epsilon', type=float, default=1e-6, + help='Stop iterating, if obj. fun. improvement is less than epsilon') + parser.add_argument('--minDur', type=int, default=1, + help='Minimum number of frames between speaker turns imposed by linear \ + chains of HMM states corresponding to each speaker. 
All the states \
+                             in a chain share the same output distribution')
+    parser.add_argument('--loopProb', type=float, default=0.9,
+                        help='Probability of not switching speakers between frames')
+    parser.add_argument('--statScale', type=float, default=0.2,
+                        help='Scale sufficient statistics collected using UBM')
+    parser.add_argument('--llScale', type=float, default=1.0,
+                        help='Scale UBM likelihood (i.e. llScale < 1.0 makes attribution of \
+                             frames to UBM components more uncertain)')
+    parser.add_argument('--channel', type=int, default=0,
+                        help='Channel information in the rttm file')
+    parser.add_argument('--initialize', type=int, default=1,
+                        help='Whether to initialize the speaker posterior')
+
+    args = parser.parse_args()
+    print(args)
+    init_rttm_filename = args.init_rttm_filename
+
+    utt_list = get_utt_list("{}/utt2spk".format(args.data_dir))
+    utt2num_frames = get_utt2num_frames("{}/utt2num_frames".format(args.data_dir))
+
+    # Load the diagonal UBM and i-vector extractor
+    with open(args.dubm_model, 'rb') as fh:
+        dubm_para = pickle.load(fh)
+    with open(args.ie_model, 'rb') as fh:
+        ie_para = pickle.load(fh)
+
+    assert '<WEIGHTS>' in dubm_para and '<MEANS_INVVARS>' in dubm_para and '<INV_VARS>' in dubm_para
+    DUBM_WEIGHTS, DUBM_MEANS_INVVARS, DUBM_INV_VARS = dubm_para['<WEIGHTS>'], dubm_para['<MEANS_INVVARS>'], dubm_para['<INV_VARS>']
+    assert 'M' in ie_para
+    IE_M = np.transpose(ie_para['M'], (2, 0, 1))
+
+    m = DUBM_MEANS_INVVARS / DUBM_INV_VARS
+    iE = DUBM_INV_VARS
+    w = DUBM_WEIGHTS
+    V = IE_M
+
+    # Load the MFCC features
+    feats_dict = {}
+    for key, mat in kaldi_io.read_mat_scp("{}/feats.scp".format(args.data_dir)):
+        feats_dict[key] = mat
+
+    for utt in utt_list:
+        # Get the alignments from the clustering result.
+        # In init_ref, 0 denotes the silence frames, 1 denotes the
+        # overlapping speech frames, and the speaker labels start from 2.
+        init_ref = create_ref(utt, utt2num_frames, init_rttm_filename)
+        # Ground truth of the diarization.
+        if args.true_rttm_filename != "None":
+            true_ref = create_ref(utt, utt2num_frames, args.true_rttm_filename)
+        else:
+            true_ref = None
+
+        X = feats_dict[utt]
+        X = X.astype(np.float64)
+
+        # Keep only the voiced frames (0 denotes the silence
+        # frames, 1 denotes the overlapping speech frames).
+        mask = (init_ref >= 2)
+        X_voiced = X[mask]
+        init_ref_voiced = init_ref[mask] - 2
+
+        if args.true_rttm_filename != "None":
+            true_ref_voiced = true_ref[mask] - 2
+            if np.sum(true_ref) == 0:
+                print("Warning: {} has no voiced frames in the label file".format(utt))
+                continue
+        if X_voiced.shape[0] == 0:
+            print("Warning: {} has no voiced frames in the initialization file".format(utt))
+            continue
+
+        # Initialize the posterior of each speaker based on the clustering result.
+        if args.initialize:
+            q = VB_diarization.frame_labels2posterior_mx(init_ref_voiced, args.max_speakers)
+        else:
+            q = None
+
+        # VB resegmentation
+
+        # q  - T x S matrix of posteriors attributing each frame to one of S possible
+        #      speakers, where S is given by opts.maxSpeakers
+        # sp - S dimensional column vector of ML-learned speaker priors. Ideally, these
+        #      should allow us to estimate the number of speakers in the utterance, as
+        #      the probabilities of the redundant speakers should converge to zero.
+        # Li - values of the auxiliary function (plus DER and frame cross-entropy between
+        #      q and the reference if 'ref' is provided) over the iterations.
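The routine invoked below returns the posterior matrix q described above;
turning it back into a full-utterance labelling, as the lines that follow do,
amounts to this self-contained sketch (toy shapes, not part of the patch):

    import numpy as np

    q_out = np.array([[0.9, 0.1],                # T x S posteriors (T=3 voiced frames,
                      [0.2, 0.8],                # S=2 speakers)
                      [0.6, 0.4]])
    mask = np.array([True, False, True, True])   # voiced frames in the full utterance

    predicted_voiced = q_out.argmax(axis=1) + 2  # speaker labels start at 2
    predicted = np.zeros(len(mask), dtype=int)   # non-voiced frames stay 0 (silence)
    predicted[mask] = predicted_voiced
    print(predicted)                             # [2 0 3 2]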
+        q_out, sp_out, L_out = VB_diarization.VB_diarization(X_voiced, m, iE, w, V, sp=None, q=q, maxSpeakers=args.max_speakers, maxIters=args.max_iters, VtiEV=None,
+                                                             downsample=args.downsample, alphaQInit=args.alphaQInit, sparsityThr=args.sparsityThr, epsilon=args.epsilon, minDur=args.minDur,
+                                                             loopProb=args.loopProb, statScale=args.statScale, llScale=args.llScale, ref=None, plot=False)
+        predicted_label_voiced = np.argmax(q_out, 1) + 2
+        predicted_label = (np.zeros(len(mask))).astype(int)
+        predicted_label[mask] = predicted_label_voiced
+
+        # Create the output rttm file
+        create_rttm_output(utt, predicted_label, args.output_dir, args.channel)
+    return 0
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
new file mode 100755
index 00000000000..7e448b4a580
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# Copyright 2019 Zili Huang
+
+# This script is a wrapper for Variational Bayes resegmentation.
+# It shows how to use the code from Brno University of Technology
+# to do resegmentation.
+
+# Begin configuration section.
+nj=20
+cmd=run.pl
+stage=0
+true_rttm_filename=None
+max_speakers=10
+max_iters=10
+downsample=25
+alphaQInit=100.0
+sparsityThr=0.001
+epsilon=1e-6
+minDur=1
+loopProb=0.9
+statScale=0.2
+llScale=1.0
+channel=0
+initialize=1
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 5 ]; then
+  echo "Usage: diarization/VB_resegmentation.sh <data_dir> <init_rttm_filename> <output_dir> <dubm_model> <ie_model>"
+  echo "Variational Bayes Re-segmentation"
+  echo "Options: "
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # How to run jobs."
+  echo "  --nj                                   # Number of parallel jobs to run."
+  echo "  --true-rttm-filename                   # The true rttm label file"
+  echo "  --max-speakers                         # Maximum number of speakers"
+  echo "                                         # expected in the utterance"
+  echo "                                         # (default: 10)"
+  echo "  --max-iters                            # Maximum number of algorithm"
+  echo "                                         # iterations (default: 10)"
+  echo "  --downsample                           # Perform diarization on input"
+  echo "                                         # downsampled by this factor"
+  echo "                                         # (default: 25)"
+  echo "  --alphaQInit                           # Dirichlet concentration"
+  echo "                                         # parameter for initializing q"
+  echo "  --sparsityThr                          # Set occupations smaller than"
+  echo "                                         # this threshold to 0.0 (saves"
+  echo "                                         # memory as the posteriors are"
+  echo "                                         # represented by sparse matrix)"
+  echo "  --epsilon                              # Stop iterating if obj. fun."
+  echo "                                         # improvement is less than"
+  echo "                                         # epsilon"
+  echo "  --minDur                               # Minimum number of frames"
+  echo "                                         # between speaker turns imposed"
+  echo "                                         # by linear chains of HMM"
+  echo "                                         # states corresponding to each"
+  echo "                                         # speaker. All the states in"
+  echo "                                         # a chain share the same output"
+  echo "                                         # distribution"
+  echo "  --loopProb                             # Probability of not switching"
+  echo "                                         # speakers between frames"
+  echo "  --statScale                            # Scale sufficient statistics"
+  echo "                                         # collected using UBM"
+  echo "  --llScale                              # Scale UBM likelihood (i.e."
+ echo " # llScale < 1.0 make" + echo " # attribution of frames to UBM" + echo " # componets more uncertain)" + echo " --channel # Channel information in the rttm file" + echo " --initialize # Whether to initalize the" + echo " # speaker posterior (if not)" + echo " # the speaker posterior will be" + echo " # randomly initilized" + + exit 1; +fi + +data_dir=$1 +init_rttm_filename=$2 +output_dir=$3 +dubm_model=$4 +ie_model=$5 + +mkdir -p $output_dir/tmp + +sdata=$data_dir/split$nj; +utils/split_data.sh $data_dir $nj || exit 1; + +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ + python diarization/VB_resegmentation.py --true-rttm-filename $true_rttm_filename --max-speakers $max_speakers \ + --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ + --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ + --loopProb $loopProb --statScale $statScale --llScale $llScale \ + --channel $channel --initialize $initialize \ + $sdata/JOB $init_rttm_filename $output_dir/tmp $dubm_model $ie_model || exit 1; + + cat $output_dir/tmp/* > $output_dir/rttm/VB_rttm +fi diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.py b/egs/callhome_diarization/v1/diarization/convert_VB_model.py new file mode 100755 index 00000000000..cf5da8bfa75 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# Copyright 2019 Zili Huang +# Apache 2.0 + +# This script is called by diarization/convert_VB_model.sh. +# It converts diagonal UBM and ivector extractor to numpy +# array format + +import numpy as np +import pickle +import sys + +def load_dubm(dubm_text): + para_dict = {} + with open(dubm_text, 'r') as fh: + content = fh.readlines() + state = 0 + data_array = [] + + for line in content: + line = line.strip('\n') + line_split = line.split() + if state == 0: + if len(line_split) == 1: + continue + elif len(line_split) == 2 and line_split[1] == "[": # Start of a multi-line matrix like and + para_name = line_split[0] + state = 1 + data_array = [] + elif len(line_split) >= 3 and line_split[1] == "[" and line_split[-1] == "]": # Single line vector like + para_name = line_split[0] + data_list = [] + for i in range(2, len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + para_dict[para_name] = data_list + else: + raise ValueError("Condition not defined.") + elif state == 1: + if line_split[-1] == "]": # End of a multi-line matrix like and + data_list = [] + for i in range(len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + data_array = np.array(data_array) + para_dict[para_name] = data_array + state = 0 + else: + data_list = [] + for i in range(len(line_split)): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + else: + raise ValueError("Condition not defined.") + return para_dict + +def load_ivector_extractor(ie_text): + para_dict = {} + with open(ie_text, 'r') as fh: + content = fh.readlines() + state = 0 + data_3dmatrix = [] + + for line in content: + line = line.strip('\n') + if line == " [": + break + if state == 0: + if not line.startswith(""): + continue + else: + state = 1 + data_matrix = [] + elif state == 1: + line_split = line.split() + if line_split[0] == "[": + data_matrix = [] + continue + elif line_split[-1] == "]": + data_array = [] + for i in range(len(line_split)-1): + 
data_array.append(float(line_split[i])) + data_matrix.append(data_array) + data_3dmatrix.append(data_matrix) + else: + data_array = [] + for i in range(len(line_split)): + data_array.append(float(line_split[i])) + data_matrix.append(data_array) + else: + raise ValueError("Condition not defined.") + para_dict['M'] = np.array(data_3dmatrix) + return para_dict + +def save_dict(para_dict, output_filename): + with open(output_filename, 'wb') as fh: + pickle.dump(para_dict, fh) + return 0 + +def main(): + dubm_model = sys.argv[1] + ivec_extractor_model = sys.argv[2] + output_dir = sys.argv[3] + + # the diagonal ubm parameter includes , , , + dubm_para = load_dubm(dubm_model) + save_dict(dubm_para, "{}/diag_ubm.pkl".format(output_dir)) + + # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim] + ie_para = load_ivector_extractor(ivec_extractor_model) + save_dict(ie_para, "{}/ie.pkl".format(output_dir)) + return 0 + +if __name__ == "__main__": + main() diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.sh b/egs/callhome_diarization/v1/diarization/convert_VB_model.sh new file mode 100755 index 00000000000..60a6c3687e0 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2019 Zili Huang +# Apache 2.0 + +# This script is part of VB resegmentation, it converts diagonal UBM and +# ivector extractor to numpy array format + +# begin configuration section. +stage=0 +cmd=run.pl + +# end configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1) # start script from part-way through" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes" + echo "e.g.:" + echo "$0 exp/diag_ubm_1024/final.dubm exp/extractor_diag_c1024_i128/final.ie exp/VB" + exit 1; +fi + +gmm_model=$1 +ivec_extractor=$2 +VB_dir=$3 + +if [ $stage -le 0 ]; then + # Dump the diagonal UBM model into txt format. + "$train_cmd" $VB_dir/log/convert_diag_ubm.log \ + gmm-global-copy --binary=false \ + $gmm_model \ + $VB_dir/dubm.tmp || exit 1; + + # Dump the ivector extractor model into txt format. + "$train_cmd" $VB_dir/log/convert_ie.log \ + ivector-extractor-copy --binary=false \ + $ivec_extractor \ + $VB_dir/ie.tmp || exit 1; +fi + +if [ $stage -le 1 ]; then + # Convert txt to numpy format + python diarization/convert_VB_model.py $VB_dir/dubm.tmp $VB_dir/ie.tmp $VB_dir || exit 1; + + rm $VB_dir/dubm.tmp $VB_dir/ie.tmp || exit 1; +fi diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py new file mode 100755 index 00000000000..dae5599b8f1 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/kaldi_io.py @@ -0,0 +1,627 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) +# Licensed under the Apache License, Version 2.0 (the "License") + +import numpy as np +import sys, os, re, gzip, struct + +################################################# +# Adding kaldi tools to shell path, + +# Select kaldi, +if not 'KALDI_ROOT' in os.environ: + # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' + os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' + +# Add kaldi tools to path, +os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] + + +################################################# +# Define all custom exceptions, +class UnsupportedDataType(Exception): pass +class UnknownVectorHeader(Exception): pass +class UnknownMatrixHeader(Exception): pass + +class BadSampleSize(Exception): pass +class BadInputFormat(Exception): pass + +class SubprocessFailed(Exception): pass + +################################################# +# Data-type independent helper functions, + +def open_or_fd(file, mode='rb'): + """ fd = open_or_fd(file) + Open file, gzipped file, pipe, or forward the file-descriptor. + Eventually seeks in the 'file' argument contains ':offset' suffix. + """ + offset = None + try: + # strip 'ark:' prefix from r{x,w}filename (optional), + if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): + (prefix,file) = file.split(':',1) + # separate offset from filename (optional), + if re.search(':[0-9]+$', file): + (file,offset) = file.rsplit(':',1) + # input pipe? + if file[-1] == '|': + fd = popen(file[:-1], 'rb') # custom, + # output pipe? + elif file[0] == '|': + fd = popen(file[1:], 'wb') # custom, + # is it gzipped? + elif file.split('.')[-1] == 'gz': + fd = gzip.open(file, mode) + # a normal file... + else: + fd = open(file, mode) + except TypeError: + # 'file' is opened file descriptor, + fd = file + # Eventually seek to offset, + if offset != None: fd.seek(int(offset)) + return fd + +# based on '/usr/local/lib/python3.4/os.py' +def popen(cmd, mode="rb"): + if not isinstance(cmd, str): + raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) + + import subprocess, io, threading + + # cleanup function for subprocesses, + def cleanup(proc, cmd): + ret = proc.wait() + if ret > 0: + raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) + return + + # text-mode, + if mode == "r": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdout) + elif mode == "w": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdin) + # binary, + elif mode == "rb": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdout + elif mode == "wb": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdin + # sanity, + else: + raise ValueError("invalid mode %s" % mode) + + +def read_key(fd): + """ [key] = read_key(fd) + Read the utterance-key from the opened ark/stream descriptor 'fd'. 
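+  Keys are space-terminated tokens, so a stream positioned at
+  'utt1 <binary data>utt2 <binary data>' yields 'utt1' first, then 'utt2'
+  once the value has been consumed by the matching reader; None is
+  returned at end of file.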
+ """ + key = '' + while 1: + char = fd.read(1).decode("latin1") + if char == '' : break + if char == ' ' : break + key += char + key = key.strip() + if key == '': return None # end of file, + assert(re.match('^\S+$',key) != None) # check format (no whitespace!) + return key + + +################################################# +# Integer vectors (alignments, ...), + +def read_ali_ark(file_or_fd): + """ Alias to 'read_vec_int_ark()' """ + return read_vec_int_ark(file_or_fd) + +def read_vec_int_ark(file_or_fd): + """ generator(key,vec) = read_vec_int_ark(file_or_fd) + Create generator of (key,vector) tuples, which reads from the ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_int(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_int(file_or_fd): + """ [int-vec] = read_vec_int(file_or_fd) + Read kaldi integer vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + # Elements from int32 vector are sored in tuples: (sizeof(int32), value), + vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) + assert(vec[0]['size'] == 4) # int32 size, + ans = vec[:]['value'] # values are in 2nd column, + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=int) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_int(file_or_fd, v, key=''): + """ write_vec_int(f, v, key='') + Write a binary kaldi integer vector to filename or stream. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_int(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # dim, + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) + # data, + for i in range(len(v)): + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float vectors (confidences, ivectors, ...), + +# Reading, +def read_vec_flt_scp(file_or_fd): + """ generator(key,mat) = read_vec_flt_scp(file_or_fd) + Returns generator of (key,vector) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,vec in kaldi_io.read_vec_flt_scp(file): + ... 
+ + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + vec = read_vec_flt(rxfile) + yield key, vec + finally: + if fd is not file_or_fd : fd.close() + +def read_vec_flt_ark(file_or_fd): + """ generator(key,vec) = read_vec_flt_ark(file_or_fd) + Create generator of (key,vector) tuples, reading from an ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_flt(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_flt(file_or_fd): + """ [flt-vec] = read_vec_flt(file_or_fd) + Read kaldi float vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + # Data type, + header = fd.read(3).decode() + if header == 'FV ': sample_size = 4 # floats + elif header == 'DV ': sample_size = 8 # doubles + else: raise UnknownVectorHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimension, + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + # Read whole vector, + buf = fd.read(vec_size * sample_size) + if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + return ans + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=float) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_flt(file_or_fd, v, key=''): + """ write_vec_flt(f, v, key='') + Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_flt(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if v.dtype == 'float32': fd.write('FV '.encode()) + elif v.dtype == 'float64': fd.write('DV '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) + # Dim, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim + # Data, + fd.write(v.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float matrices (features, transformations, ...), + +# Reading, +def read_mat_scp(file_or_fd): + """ generator(key,mat) = read_mat_scp(file_or_fd) + Returns generator of (key,matrix) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
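+
+  Each scp line is '<key> <rxfilename>', e.g. the (hypothetical) entry
+  'utt1 /data/feats.ark:1467' reads the matrix stored in that ark at byte
+  offset 1467 (open_or_fd handles the ':offset' suffix).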
+ + Iterate the scp: + for key,mat in kaldi_io.read_mat_scp(file): + ... + + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + mat = read_mat(rxfile) + yield key, mat + finally: + if fd is not file_or_fd : fd.close() + +def read_mat_ark(file_or_fd): + """ generator(key,mat) = read_mat_ark(file_or_fd) + Returns generator of (key,matrix) tuples, read from ark file/stream. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the ark: + for key,mat in kaldi_io.read_mat_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + mat = read_mat(fd) + yield key, mat + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_mat(file_or_fd): + """ [mat] = read_mat(file_or_fd) + Reads single kaldi matrix, supports ascii and binary. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + """ + fd = open_or_fd(file_or_fd) + try: + binary = fd.read(2).decode() + if binary == '\0B' : + mat = _read_mat_binary(fd) + else: + assert(binary == ' [') + mat = _read_mat_ascii(fd) + finally: + if fd is not file_or_fd: fd.close() + return mat + +def _read_mat_binary(fd): + # Data type + header = fd.read(3).decode() + # 'CM', 'CM2', 'CM3' are possible values, + if header.startswith('CM'): return _read_compressed_mat(fd, header) + elif header == 'FM ': sample_size = 4 # floats + elif header == 'DM ': sample_size = 8 # doubles + else: raise UnknownMatrixHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimensions + s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] + # Read whole matrix + buf = fd.read(rows * cols * sample_size) + if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + mat = np.reshape(vec,(rows,cols)) + return mat + +def _read_mat_ascii(fd): + rows = [] + while 1: + line = fd.readline().decode() + if (len(line) == 0) : raise BadInputFormat # eof, should not happen! + if len(line.strip()) == 0 : continue # skip empty line + arr = line.strip().split() + if arr[-1] != ']': + rows.append(np.array(arr,dtype='float32')) # not last line + else: + rows.append(np.array(arr[:-1],dtype='float32')) # last line + mat = np.vstack(rows) + return mat + + +def _read_compressed_mat(fd, format): + """ Read a compressed matrix, + see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h + methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), + """ + assert(format == 'CM ') # The formats CM2, CM3 are not supported... + + # Format of header 'struct', + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) + + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + + # The data is structed as [Colheader, ... , Colheader, Data, Data , .... 
] + # { cols }{ size } + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) + data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, + + mat = np.zeros((cols,rows), dtype='float32') + p0 = col_headers[:, 0].reshape(-1, 1) + p25 = col_headers[:, 1].reshape(-1, 1) + p75 = col_headers[:, 2].reshape(-1, 1) + p100 = col_headers[:, 3].reshape(-1, 1) + mask_0_64 = (data <= 64) + mask_193_255 = (data > 192) + mask_65_192 = (~(mask_0_64 | mask_193_255)) + + mat += (p0 + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32) + mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) + mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) + + return mat.T # transpose! col-major -> row-major, + + +# Writing, +def write_mat(file_or_fd, m, key=''): + """ write_mat(f, m, key='') + Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename of opened file descriptor for writing, + m : the matrix to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. + + Example of writing single matrix: + kaldi_io.write_mat(filename, mat) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,mat in dict.iteritems(): + kaldi_io.write_mat(f, mat, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if m.dtype == 'float32': fd.write('FM '.encode()) + elif m.dtype == 'float64': fd.write('DM '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) + # Dims, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols + # Data, + fd.write(m.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) +# Corresponds to: vector > > +# - outer vector: time axis +# - inner vector: records at the time +# - tuple: int = index, float = value +# + +def read_cnet_ark(file_or_fd): + """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ + return read_post_ark(file_or_fd) + +def read_post_ark(file_or_fd): + """ generator(key,vec>) = read_post_ark(file) + Returns generator of (key,posterior) tuples, read from ark file. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Iterate the ark: + for key,post in kaldi_io.read_post_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:post for key,post in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + post = read_post(fd) + yield key, post + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_post(file_or_fd): + """ [post] = read_post(file_or_fd) + Reads single kaldi 'Posterior' in binary format. 
+ + The 'Posterior' is C++ type 'vector > >', + the outer-vector is usually time axis, inner-vector are the records + at given time, and the tuple is composed of an 'index' (integer) + and a 'float-value'. The 'float-value' can represent a probability + or any other numeric value. + + Returns vector of vectors of tuples. + """ + fd = open_or_fd(file_or_fd) + ans=[] + binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + # Loop over 'outer-vector', + for i in range(outer_vec_size): + assert(fd.read(1).decode() == '\4'); # int-size + inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) + data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) + assert(data[0]['size_idx'] == 4) + assert(data[0]['size_post'] == 4) + ans.append(data[['idx','post']].tolist()) + + if fd is not file_or_fd: fd.close() + return ans + + +################################################# +# Kaldi Confusion Network bin begin/end times, +# (kaldi stores CNs time info separately from the Posterior). +# + +def read_cntime_ark(file_or_fd): + """ generator(key,vec>) = read_cntime_ark(file_or_fd) + Returns generator of (key,cntime) tuples, read from ark file. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Iterate the ark: + for key,time in kaldi_io.read_cntime_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:time for key,time in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + cntime = read_cntime(fd) + yield key, cntime + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_cntime(file_or_fd): + """ [cntime] = read_cntime(file_or_fd) + Reads single kaldi 'Confusion Network time info', in binary format: + C++ type: vector >. + (begin/end times of bins at the confusion network). + + Binary layout is ' ...' + + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Returns vector of tuples. 
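+
+  Example (hypothetical value): a 3-bin confusion network could yield
+  [(0.0, 0.42), (0.42, 1.1), (1.1, 1.57)].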
+ """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary + + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) + assert(data[0]['size_beg'] == 4) + assert(data[0]['size_end'] == 4) + ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), + + if fd is not file_or_fd : fd.close() + return ans + + +################################################# +# Segments related, +# + +# Segments as 'Bool vectors' can be handy, +# - for 'superposing' the segmentations, +# - for frame-selection in Speaker-ID experiments, +def read_segments_as_bool_vec(segments_file): + """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) + using kaldi 'segments' file for 1 wav, format : ' ' + - t-beg, t-end is in seconds, + - assumed 100 frames/second, + """ + segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) + # Sanity checks, + assert(len(segs) > 0) # empty segmentation is an error, + assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, + # Convert time to frame-indexes, + start = np.rint([100 * rec[2] for rec in segs]).astype(int) + end = np.rint([100 * rec[3] for rec in segs]).astype(int) + # Taken from 'read_lab_to_bool_vec', htk.py, + frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], + np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) + assert np.sum(end-start) == np.sum(frms) + return frms + diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh new file mode 100755 index 00000000000..61f7e4a14e0 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2014 David Snyder +# 2019 Zili Huang +# Apache 2.0. + +# This script trains the i-vector extractor for VB resegmentation. This script is very similar to +# sid/train_ivector_extractor.sh. The only difference is that the UBM is assumed to be diagonal. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=4 # each job runs this many processes, each with --num-threads threads +cmd="run.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select +ivector_dim=400 # dimension of the extracted i-vector +use_weights=false # set to true to turn on the regression of log-weights on the ivector. +num_iters=10 +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method) +cleanup=true +apply_cmn=true # If true, apply sliding window cepstral mean normalization +posterior_scale=1.0 # This scale helps to control for successve features being highly + # correlated. E.g. try 0.1 or 0.3 +sum_accs_opt= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ubm_2048_male/final.dubm data/train_male exp/extractor_male" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" + echo " # sum-accs process to nfs server." + echo " --apply-cmn # if true, apply sliding window cepstral mean" + echo " # normalization to features" + exit 1; +fi + +gmm_model=$1 +data=$2 +dir=$3 +srcdir=$(dirname $gmm_model) + +for f in $gmm_model $data/feats.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full || exit 1; + +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if [ -f $srcdir/delta_opts ]; then + cp $srcdir/delta_opts $dir/ 2>/dev/null +fi + +parallel_opts="--num-threads $[$num_threads*$num_processes]" +## Set up features. +if $apply_cmn; then + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +else + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +fi + +# Initialize the i-vector extractor using the FGMM input +if [ $stage -le -2 ]; then + cp $gmm_model $dir/final.dubm || exit 1; + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1 +fi + +# Do Gaussian selection and posterior extracion + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/gselect.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$feats" ark:- \| \ + scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; +else + if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. 
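+    # e.g. with nj=10 and num_processes=4, nj_full=40 accumulator commands are
+    # built below; queue job g then feeds Args[4*(g-1)+1 .. 4*(g-1)+4] to one
+    # in-memory summer, so each of the nj queue jobs runs num_processes
+    # accumulators at once.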
+ for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd $parallel_opts $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie + fi + x=$[$x+1] +done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie +ln -s $x.ie $dir/final.ie diff --git a/egs/callhome_diarization/v1/local/make_callhome.sh b/egs/callhome_diarization/v1/local/make_callhome.sh index caa8f679f22..21411fb6194 100755 --- a/egs/callhome_diarization/v1/local/make_callhome.sh +++ b/egs/callhome_diarization/v1/local/make_callhome.sh @@ -70,4 +70,9 @@ utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \ > $data_dir/callhome2/reco2num_spk +rm $data_dir/callhome/segments || exit 1; +awk '{print $1, $1}' $data_dir/callhome/wav.scp > $data_dir/callhome/utt2spk +utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt +utils/fix_data_dir.sh $data_dir/callhome + rm -rf $tmp_dir 2> /dev/null diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index b79717e2348..8eb2d93f600 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -19,6 +19,8 @@ vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC stage=0 nnet_dir=exp/xvector_nnet_1a/ +num_components=1024 +ivector_dim=400 # Prepare datasets if [ $stage -le 0 ]; then @@ -53,7 +55,7 @@ if [ $stage -le 1 ]; then # callhome1 and callhome2. Each partition is treated like a held-out # dataset, and used to estimate various quantities needed to perform # diarization on the other part (and vice versa). - for name in train callhome1 callhome2; do + for name in train callhome1 callhome2 callhome; do steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \ --cmd "$train_cmd" --write-utt2num-frames true \ data/$name exp/make_mfcc $mfccdir @@ -356,3 +358,50 @@ if [ $stage -le 11 ]; then # Compare to 8.69% in ../v1/run.sh echo "Using the oracle number of speakers, DER: $der%" fi + +# Variational Bayes resegmentation using the code from Brno University of Technology +# Please see https://speech.fit.vutbr.cz/software/vb-diarization-eigenvoice-and-hmm-priors +# for details +if [ $stage -le 12 ]; then + utils/subset_data_dir.sh data/train 32000 data/train_32k + # Train the diagonal UBM. 
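+  # (This stage trains the diagonal UBM and i-vector extractor below and ends
+  # with exp/VB/diag_ubm.pkl and exp/VB/ie.pkl, the numpy-format models the
+  # VB wrapper consumes in stage 13.)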
+ sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --num-threads 8 --subsample 1 --delta-order 0 --apply-cmn false \ + data/train_32k $num_components exp/diag_ubm_$num_components + + # Train the i-vector extractor. The UBM is assumed to be diagonal. + diarization/train_ivector_extractor_diag.sh \ + --cmd "$train_cmd --mem 35G" \ + --ivector-dim $ivector_dim --num-iters 5 --apply-cmn false \ + --num-threads 1 --num-processes 1 --nj 40 \ + exp/diag_ubm_$num_components/final.dubm data/train \ + exp/extractor_diag_c${num_components}_i${ivector_dim} + + # Converts diagonal UBM and ivector extractor to numpy array format + diarization/convert_VB_model.sh --cmd "$train_cmd" exp/diag_ubm_$num_components/final.dubm \ + exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie exp/VB +fi + +if [ $stage -le 13 ]; then + output_rttm_dir=exp/VB/rttm + mkdir -p $output_rttm_dir || exit 1; + cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores/rttm > $output_rttm_dir/x_vector_rttm + init_rttm_file=$output_rttm_dir/x_vector_rttm + + # VB resegmentation. In this script, I use the x-vector result to + # initialize the VB system. You can also use i-vector result or random + # initize the VB system. + diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \ + --true_rttm_filename "None" --initialize 1 \ + data/callhome $init_rttm_file exp/VB exp/VB/diag_ubm.pkl exp/VB/ie.pkl || exit 1; + + # Compute the DER after VB resegmentation + mkdir -p exp/VB/results || exit 1; + md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s $output_rttm_dir/VB_rttm 2> exp/VB/log/VB_DER.log \ + > exp/VB/results/VB_DER.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + exp/VB/results/VB_DER.txt) + # After VB resegmentation, DER: 6.48% + echo "After VB resegmentation, DER: $der%" +fi diff --git a/src/ivectorbin/Makefile b/src/ivectorbin/Makefile index 5a738352d9c..8dc3498b83b 100644 --- a/src/ivectorbin/Makefile +++ b/src/ivectorbin/Makefile @@ -4,7 +4,7 @@ all: EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -BINFILES = ivector-extractor-init ivector-extractor-acc-stats \ +BINFILES = ivector-extractor-init ivector-extractor-copy ivector-extractor-acc-stats \ ivector-extractor-sum-accs ivector-extractor-est \ ivector-extract compute-vad select-voiced-frames \ compute-vad-from-frame-likes merge-vads \ diff --git a/src/ivectorbin/ivector-extractor-copy.cc b/src/ivectorbin/ivector-extractor-copy.cc new file mode 100755 index 00000000000..858981e6164 --- /dev/null +++ b/src/ivectorbin/ivector-extractor-copy.cc @@ -0,0 +1,64 @@ +// ivectorbin/ivector-extractor-init.cc + +// Copyright 2019 Zili Huang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "ivector/ivector-extractor.h"
+
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using kaldi::int32;
+
+    const char *usage =
+        "Copy an ivector-extractor\n"
+        "Usage: ivector-extractor-copy [options] <ivector-extractor-in> <ivector-extractor-out>\n"
+        "e.g.:\n"
+        " ivector-extractor-copy --binary=false 0.ie 0_txt.ie\n";
+
+    bool binary = true;
+    IvectorExtractorOptions ivector_opts;
+    ParseOptions po(usage);
+    po.Register("binary", &binary, "Write output in binary mode");
+    ivector_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string ivector_extractor_rxfilename = po.GetArg(1),
+        ivector_extractor_wxfilename = po.GetArg(2);
+
+    IvectorExtractor extractor;
+    ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
+
+    WriteKaldiObject(extractor, ivector_extractor_wxfilename, binary);
+
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+

From 57cd7723d1bff09b8357c7df7da8698df075089a Mon Sep 17 00:00:00 2001
From: zhuang
Date: Wed, 8 May 2019 18:30:35 -0400
Subject: [PATCH 2/5] remove true_rttm_filename option

---
 .../v1/diarization/VB_diarization.py          | 19 ++++++-------------
 .../v1/diarization/VB_resegmentation.py       | 17 +++--------------
 .../v1/diarization/VB_resegmentation.sh       |  4 +---
 egs/callhome_diarization/v2/run.sh            |  2 +-
 4 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 31af078efd2..5ac7691fd86 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -203,19 +203,12 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
       Li[-1] += [DER(downsampler.T.dot(q), ref), DER(downsampler.T.dot(q), ref, xentropy=True)]
 
     if plot:
parser.add_argument('--max-speakers', type=int, default=10, help='Maximum number of speakers expected in the utterance (default: 10)') parser.add_argument('--max-iters', type=int, default=10, @@ -170,14 +168,10 @@ def main(): # 1 denotes the overlapping speech frames, the speaker # label starts from 2. init_ref = create_ref(utt, utt2num_frames, init_rttm_filename) - # Ground truth of the diarization. - if args.true_rttm_filename != "None": - true_ref = create_ref(utt, utt2num_frames, args.true_rttm_filename) - else: - true_ref = None - X = feats_dict[utt] - X = X.astype(np.float64) + # load MFCC features + X = (feats_dict[utt]).astype(np.float64) + assert len(init_ref) == len(X) # Keep only the voiced frames (0 denotes the silence # frames, 1 denotes the overlapping speech frames). @@ -185,11 +179,6 @@ def main(): X_voiced = X[mask] init_ref_voiced = init_ref[mask] - 2 - if args.true_rttm_filename != "None": - true_ref_voiced = true_ref[mask] - 2 - if np.sum(true_ref) == 0: - print("Warning: {} has no voiced frames in the label file".format(utt)) - continue if X_voiced.shape[0] == 0: print("Warning: {} has no voiced frames in the initialization file".format(utt)) continue diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh index 7e448b4a580..53ad10ddd4c 100755 --- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh @@ -10,7 +10,6 @@ nj=20 cmd=run.pl stage=0 -true_rttm_filename=None max_speakers=10 max_iters=10 downsample=25 @@ -36,7 +35,6 @@ if [ $# != 5 ]; then echo "Options: " echo " --cmd (utils/run.pl|utils/queue.pl ) # How to run jobs." echo " --nj # Number of parallel jobs to run." - echo " --true-rttm-filename # The true rttm label file" echo " --max-speakers # Maximum number of speakers" echo " # expected in the utterance" echo " # (default: 10)" @@ -91,7 +89,7 @@ utils/split_data.sh $data_dir $nj || exit 1; if [ $stage -le 0 ]; then $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ - python diarization/VB_resegmentation.py --true-rttm-filename $true_rttm_filename --max-speakers $max_speakers \ + python diarization/VB_resegmentation.py --max-speakers $max_speakers \ --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ --loopProb $loopProb --statScale $statScale --llScale $llScale \ diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 8eb2d93f600..3dfb3ccb127 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -393,7 +393,7 @@ if [ $stage -le 13 ]; then # initialize the VB system. You can also use i-vector result or random # initize the VB system. 
  diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \
-    --true_rttm_filename "None" --initialize 1 \
+    --initialize 1 \
     data/callhome $init_rttm_file exp/VB exp/VB/diag_ubm.pkl exp/VB/ie.pkl || exit 1;
 
   # Compute the DER after VB resegmentation

From d7104be5967b474373cbba25da8819a4faca0b9e Mon Sep 17 00:00:00 2001
From: zhuang
Date: Wed, 8 May 2019 20:27:50 -0400
Subject: [PATCH 3/5] add some comments

---
 .../v1/diarization/VB_resegmentation.py            | 8 +++++---
 .../v1/diarization/train_ivector_extractor_diag.sh | 4 ++--
 src/ivectorbin/ivector-extractor-copy.cc           | 4 ++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
index b90200778bc..d89a082a4b3 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
@@ -2,9 +2,11 @@
 
 # Copyright 2019 Zili Huang
 
-# This script is evoked by diarization/VB_resegmentation.sh. It is a wrapper
-# for Variational Bayes resegmentation. It shows how to use the code from
-# Brno University of Technology to do resegmentation.
+# This script is invoked by diarization/VB_resegmentation.sh. It prepares the necessary
+# inputs for the VB system and creates the output RTTM file. The inputs include the data
+# directory (data_dir), the rttm file to initialize the VB system (init_rttm_filename), the
+# directory for the output rttm prediction (output_dir), the path to the diagonal UBM model
+# (dubm_model) and the path to the i-vector extractor model (ie_model).
 
 import numpy as np
 import VB_diarization
diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh
index 61f7e4a14e0..9254012f3b0 100755
--- a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh
+++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh
@@ -5,8 +5,8 @@
 #           2019 Zili Huang
 # Apache 2.0.
 
-# This script trains the i-vector extractor for VB resegmentation. This script is very similar to
-# sid/train_ivector_extractor.sh. The only difference is that the UBM is assumed to be diagonal.
+# This script trains the i-vector extractor for VB resegmentation. It is very similar to
+# sid/train_ivector_extractor.sh except that the UBM is assumed to be diagonal in this script.
 
 # Begin configuration section.
 nj=10   # this is the number of separate queue jobs we run, but each one
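The comment above is worth unpacking for readers who have not met the model: per UBM component, the extractor being trained is one factor-loading matrix, and convert_VB_model.py (PATCH 5 below) returns exactly this as 'M', a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim]. A minimal numpy sketch with the recipe defaults num_components=1024 and ivector_dim=400, an assumed feature dimension of 24, and purely illustrative variable names:

    import numpy as np

    C, D, R = 1024, 24, 400      # UBM components, feature dim, i-vector dim
    M = np.zeros((C, D, R))      # per-component factor-loading matrices ('M')
    w = np.zeros(R)              # an i-vector for one recording
    offsets = M.dot(w)           # (C, D): the mean shift the i-vector induces per component

One i-vector thus perturbs all component means at once, which is what lets a single 400-dimensional vector summarize a speaker.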
diff --git a/src/ivectorbin/ivector-extractor-copy.cc b/src/ivectorbin/ivector-extractor-copy.cc
index 858981e6164..f04a6d20120 100755
--- a/src/ivectorbin/ivector-extractor-copy.cc
+++ b/src/ivectorbin/ivector-extractor-copy.cc
@@ -1,4 +1,4 @@
-// ivectorbin/ivector-extractor-init.cc
+// ivectorbin/ivector-extractor-copy.cc
 
 // Copyright 2019 Zili Huang
 
@@ -29,7 +29,7 @@ int main(int argc, char *argv[]) {
     using kaldi::int32;
 
     const char *usage =
-        "Copy an ivector-extractor\n"
+        "Copy the i-vector extractor to a text file\n"
         "Usage: ivector-extractor-copy [options] <ivector-extractor-in> <ivector-extractor-out>\n"
        "e.g.:\n"
        " ivector-extractor-copy --binary=false 0.ie 0_txt.ie\n";
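PATCH 4 below rewrites the UBM frame-posterior lines of VB_diarization.py. The expression they normalize is the standard diagonal-covariance GMM log-likelihood; the following self-contained numpy rendering of that 'll = ...' line (toy sizes, random data, every name local to this example) may help in checking the algebra:

    import numpy as np

    T, D, C = 5, 4, 3                     # frames, feature dim, GMM components
    rng = np.random.default_rng(0)
    X = rng.standard_normal((T, D))       # feature matrix
    m = rng.standard_normal((C, D))       # component means
    iE = np.ones((C, D))                  # inverse variances (diagonal covariances)
    w = np.full(C, 1.0 / C)               # component weights

    # same algebra as the 'll = ...' line in VB_diarization.py:
    ll = (X**2).dot(-0.5*iE.T) + X.dot((iE*m).T) \
         - 0.5*((iE*m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
    print(ll.shape)                       # (T, C) log-likelihoods, incl. log-weights

Row-normalizing ll in the log domain (the G/NN lines the hunk edits) then gives the per-frame component posteriors.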
From 7ba2444dd6f2392d19d56cf2762329230092437e Mon Sep 17 00:00:00 2001
From: zhuang
Date: Thu, 9 May 2019 11:45:31 -0400
Subject: [PATCH 4/5] fix python problem

---
 egs/callhome_diarization/v1/diarization/VB_diarization.py | 8 ++++----
 .../v1/diarization/VB_resegmentation.sh                   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 5ac7691fd86..5cc5135ec4a 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -23,8 +23,8 @@
 import numpy as np
 from scipy.sparse import coo_matrix
 import scipy.linalg as spl
-import numexpr as ne # the dependency on this modul can be avoided by replacing
-                     # logsumexp_ne and exp_ne with logsumexp and np.exp
+#import numexpr as ne # the dependency on this module can be avoided by replacing
+#                     # logsumexp_ne and exp_ne with logsumexp and np.exp
 
 #[q sp Li] =
 def VB_diarization(X, m, iE, w, V, sp=None, q=None,
@@ -115,8 +115,8 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
   # calculate UBM mixture frame posteriors (i.e. per-frame zero order statistics)
   ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
   ll *= llScale
-  G = logsumexp_ne(ll, axis=1)
-  NN = exp_ne(ll - G[:,np.newaxis]) * statScale
+  G = logsumexp(ll, axis=1)
+  NN = np.exp(ll - G[:,np.newaxis]) * statScale
   NN[NN<sparsityThr] = 0.0
   #Kx = np.sum(NN * (np.log(w) - np.log(NN)), 1)
   NN = coo_matrix(NN) # represent zero-order stats using sparse matrix
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
index 53ad10ddd4c..5be0b8a587f 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
@@ -89,7 +89,7 @@ utils/split_data.sh $data_dir $nj || exit 1;
 
 if [ $stage -le 0 ]; then
   $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \
-    python diarization/VB_resegmentation.py --max-speakers $max_speakers \
+    python2 diarization/VB_resegmentation.py --max-speakers $max_speakers \
     --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \
     --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \
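The plain logsumexp/np.exp pair introduced above is the usual numerically-stable posterior normalization; VB_diarization.py keeps its own logsumexp helper (none of its imports provide one), and the sketch below is only an equivalent reference version, not the module's code — scipy.special.logsumexp would do the same job:

    import numpy as np

    def logsumexp(x, axis=0):
        xmax = x.max(axis)                # subtract the max so exp() cannot overflow
        return xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis))

    ll = np.log(np.array([[0.2, 0.8], [0.5, 0.5]]))  # toy per-frame log-likelihoods
    G = logsumexp(ll, axis=1)                        # per-frame normalizers
    NN = np.exp(ll - G[:, np.newaxis])               # posteriors; each row sums to 1
    print(NN.sum(axis=1))                            # -> [1. 1.]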
From: zhuang
Date: Thu, 30 May 2019 22:01:10 -0400
Subject: [PATCH 5/5] fix python problem, avoid pickle conversion

---
 .../v1/diarization/VB_diarization.py          |  11 +-
 .../v1/diarization/VB_resegmentation.py       |  17 +-
 .../v1/diarization/VB_resegmentation.sh       |  21 +-
 .../v1/diarization/convert_VB_model.py        |  47 +-
 .../v1/diarization/convert_VB_model.sh        |  53 --
 .../v1/diarization/kaldi_io.py                | 627 ------------------
 egs/callhome_diarization/v2/run.sh            |  15 +-
 tools/extras/install_kaldi_io.sh              |   6 +
 8 files changed, 57 insertions(+), 740 deletions(-)
 delete mode 100755 egs/callhome_diarization/v1/diarization/convert_VB_model.sh
 delete mode 100755 egs/callhome_diarization/v1/diarization/kaldi_io.py
 create mode 100755 tools/extras/install_kaldi_io.sh

diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 5cc5135ec4a..62676d64510 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -121,7 +122,7 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
 
   #Kx = np.sum(NN * (np.log(w) - np.log(NN)), 1)
   NN = coo_matrix(NN) # represent zero-order stats using sparse matrix
-  print 'Sparsity: ', len(NN.row), float(len(NN.row))/np.prod(NN.shape)
+  print('Sparsity: ', len(NN.row), float(len(NN.row))/np.prod(NN.shape))
   LL = np.sum(G) # total log-likelihod as calculated using UBM
 
   mixture_sum = coo_matrix((np.ones(C*D), (np.repeat(range(C),D), range(C*D))), shape=(C, C*D))
@@ -141,7 +142,7 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
   if downsample is not None:
     # Downsample NN, VtiEF, G and q by summing the statistic over 'downsample' frames
     # This speeds-up diarization for the price of lowering its frame resolution
-    downsampler = coo_matrix((np.ones(nframes, dtype=np.int64), ((np.ceil(np.arange(nframes)/downsample)).astype(int), np.arange(nframes))), shape=(int(np.ceil(1.0 * nframes / downsample)), nframes))
+    downsampler = coo_matrix((np.ones(nframes, dtype=np.int64), ((np.ceil(np.arange(nframes)/downsample)).astype(int), np.arange(nframes))), shape=(int(np.ceil((nframes - 1.0) / downsample)) + 1, nframes))
     NN = downsampler.dot(NN)
     VtiEF = downsampler.dot(VtiEF)
     G = downsampler.dot(G)
@@ -210,7 +211,7 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
       matplotlib.pyplot.imshow(np.atleast_2d(ref), interpolation='none', aspect='auto',
                                cmap=matplotlib.pyplot.cm.Pastel1, extent=(0, len(ref), -0.05, 1.05))
 
-    print ii, Li[-2]
+    print(ii, Li[-2])
 
 
     if ii > 0 and L - Li[-2][0] < epsilon:
@@ -341,10 +342,10 @@ def forward_backward(lls, tr, ip):
   lfw[0] = lls[0] + np.log(ip)
   lbw[-1] = 0.0
 
-  for ii in xrange(1,len(lls)):
+  for ii in range(1,len(lls)):
     lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1)
 
-  for ii in reversed(xrange(len(lls)-1)):
+  for ii in reversed(range(len(lls)-1)):
     lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1)
 
   tll = logsumexp(lfw[-1])
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
index d89a082a4b3..1e1d7191ec9 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # Copyright 2019 Zili Huang
 
@@ -10,10 +10,9 @@
 
 import numpy as np
 import VB_diarization
-import pickle
 import kaldi_io
-import sys
 import argparse
+from convert_VB_model import load_dubm, load_ivector_extractor
 
 def get_utt_list(utt2spk_filename):
     with open(utt2spk_filename, 'r') as fh:
@@ -105,7 +104,7 @@ def main():
                         help='The rttm file to initialize the VB system, usually the AHC cluster result')
     parser.add_argument('output_dir', type=str, help='Output directory')
     parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model')
-    parser.add_argument('ie_model', type=str, help='Path of the ivector extractor model')
+    parser.add_argument('ie_model', type=str, help='Path of the i-vector extractor model')
 
     parser.add_argument('--max-speakers', type=int, default=10,
                         help='Maximum number of speakers expected in the utterance (default: 10)')
@@ -138,17 +137,15 @@ def main():
     args = parser.parse_args()
     print(args)
 
-    init_rttm_filename = args.init_rttm_filename
-
     utt_list = get_utt_list("{}/utt2spk".format(args.data_dir))
     utt2num_frames = get_utt2num_frames("{}/utt2num_frames".format(args.data_dir))
 
     # Load the diagonal UBM and i-vector extractor
-    with open(args.dubm_model, 'rb') as fh:
-        dubm_para = pickle.load(fh)
-    with open(args.ie_model, 'rb') as fh:
-        ie_para = pickle.load(fh)
+    dubm_para = load_dubm(args.dubm_model)
+    ie_para = load_ivector_extractor(args.ie_model)
+
+    # Check the diagonal UBM and i-vector extractor model
     assert '<WEIGHTS>' in dubm_para and '<MEANS_INVVARS>' in dubm_para and '<INV_VARS>' in dubm_para
     DUBM_WEIGHTS, DUBM_MEANS_INVVARS, DUBM_INV_VARS = dubm_para['<WEIGHTS>'], dubm_para['<MEANS_INVVARS>'], dubm_para['<INV_VARS>']
     assert 'M' in ie_para
 
@@ -169,7 +166,7 @@ def main():
         # In init_ref, 0 denotes the silence frames
         # 1 denotes the overlapping speech frames, the speaker
         # label starts from 2.
-        init_ref = create_ref(utt, utt2num_frames, init_rttm_filename)
+        init_ref = create_ref(utt, utt2num_frames, args.init_rttm_filename)
 
         # load MFCC features
         X = (feats_dict[utt]).astype(np.float64)
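The hunks above rely on create_ref()'s frame-label convention — 0 for silence, 1 for overlapped speech, speaker labels starting from 2 — which is why subtracting 2 after masking out non-speech frames yields 0-based speaker indices. A toy illustration with made-up labels (the mask construction itself lies outside these hunks; keeping only labels >= 2 is what the subtraction assumes):

    import numpy as np

    init_ref = np.array([0, 0, 2, 2, 1, 3, 3, 0])  # hypothetical per-frame labels
    mask = init_ref >= 2                           # drop silence (0) and overlap (1)
    init_ref_voiced = init_ref[mask] - 2           # 0-based speaker ids for the VB code
    print(init_ref_voiced)                         # -> [0 0 1 1]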
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
index 5be0b8a587f..765c4eee8b8 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
@@ -88,13 +88,28 @@ sdata=$data_dir/split$nj;
 utils/split_data.sh $data_dir $nj || exit 1;
 
 if [ $stage -le 0 ]; then
+  # Dump the diagonal UBM model into txt format.
+  $cmd $output_dir/log/convert_diag_ubm.log \
+    gmm-global-copy --binary=false \
+    $dubm_model \
+    $output_dir/tmp/dubm.tmp || exit 1;
+
+  # Dump the i-vector extractor model into txt format.
+  $cmd $output_dir/log/convert_ie.log \
+    ivector-extractor-copy --binary=false \
+    $ie_model \
+    $output_dir/tmp/ie.tmp || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  # VB resegmentation
   $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \
-    python2 diarization/VB_resegmentation.py --max-speakers $max_speakers \
+    python3 diarization/VB_resegmentation.py --max-speakers $max_speakers \
     --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \
     --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \
     --loopProb $loopProb --statScale $statScale --llScale $llScale \
     --channel $channel --initialize $initialize \
-    $sdata/JOB $init_rttm_filename $output_dir/tmp $dubm_model $ie_model || exit 1;
+    $sdata/JOB $init_rttm_filename $output_dir/tmp $output_dir/tmp/dubm.tmp $output_dir/tmp/ie.tmp || exit 1;
 
-  cat $output_dir/tmp/* > $output_dir/rttm/VB_rttm
+  cat $output_dir/tmp/*.rttm > $output_dir/rttm/VB_rttm
 fi
diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.py b/egs/callhome_diarization/v1/diarization/convert_VB_model.py
index cf5da8bfa75..b1f25b0dbfd 100755
--- a/egs/callhome_diarization/v1/diarization/convert_VB_model.py
+++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.py
@@ -3,21 +3,21 @@
 # Copyright 2019 Zili Huang
 # Apache 2.0
 
-# This script is called by diarization/convert_VB_model.sh.
-# It converts diagonal UBM and ivector extractor to numpy
-# array format
+# This script loads the diagonal UBM and i-vector extractor from text files.
 
+import os
 import numpy as np
-import pickle
-import sys
 
 def load_dubm(dubm_text):
+    assert os.path.exists(dubm_text)
+
     para_dict = {}
-    with open(dubm_text, 'r') as fh:
-        content = fh.readlines()
     state = 0
     data_array = []
+    with open(dubm_text, 'r') as fh:
+        content = fh.readlines()
+
     for line in content:
         line = line.strip('\n')
         line_split = line.split()
@@ -55,15 +55,18 @@ def load_dubm(dubm_text):
                 data_array.append(data_list)
             else:
                 raise ValueError("Condition not defined.")
-    return para_dict
+    return para_dict # the diagonal ubm parameter includes <GCONSTS>, <WEIGHTS>, <MEANS_INVVARS>, <INV_VARS>
 
 def load_ivector_extractor(ie_text):
+    assert os.path.exists(ie_text)
+
     para_dict = {}
-    with open(ie_text, 'r') as fh:
-        content = fh.readlines()
     state = 0
     data_3dmatrix = []
+    with open(ie_text, 'r') as fh:
+        content = fh.readlines()
+
     for line in content:
         line = line.strip('\n')
         if line == " [":
@@ -93,26 +96,4 @@ def load_ivector_extractor(ie_text):
         else:
             raise ValueError("Condition not defined.")
     para_dict['M'] = np.array(data_3dmatrix)
-    return para_dict
-
-def save_dict(para_dict, output_filename):
-    with open(output_filename, 'wb') as fh:
-        pickle.dump(para_dict, fh)
-    return 0
-
-def main():
-    dubm_model = sys.argv[1]
-    ivec_extractor_model = sys.argv[2]
-    output_dir = sys.argv[3]
-
-    # the diagonal ubm parameter includes <GCONSTS>, <WEIGHTS>, <MEANS_INVVARS>, <INV_VARS>
-    dubm_para = load_dubm(dubm_model)
-    save_dict(dubm_para, "{}/diag_ubm.pkl".format(output_dir))
-
-    # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim]
-    ie_para = load_ivector_extractor(ivec_extractor_model)
-    save_dict(ie_para, "{}/ie.pkl".format(output_dir))
-    return 0
-
-if __name__ == "__main__":
-    main()
+    return para_dict # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim]
diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.sh b/egs/callhome_diarization/v1/diarization/convert_VB_model.sh
deleted file mode 100755
index 60a6c3687e0..00000000000
---
a/egs/callhome_diarization/v1/diarization/convert_VB_model.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Zili Huang -# Apache 2.0 - -# This script is part of VB resegmentation, it converts diagonal UBM and -# ivector extractor to numpy array format - -# begin configuration section. -stage=0 -cmd=run.pl - -# end configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [options] " - echo " Options:" - echo " --stage (0|1) # start script from part-way through" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes" - echo "e.g.:" - echo "$0 exp/diag_ubm_1024/final.dubm exp/extractor_diag_c1024_i128/final.ie exp/VB" - exit 1; -fi - -gmm_model=$1 -ivec_extractor=$2 -VB_dir=$3 - -if [ $stage -le 0 ]; then - # Dump the diagonal UBM model into txt format. - "$train_cmd" $VB_dir/log/convert_diag_ubm.log \ - gmm-global-copy --binary=false \ - $gmm_model \ - $VB_dir/dubm.tmp || exit 1; - - # Dump the ivector extractor model into txt format. - "$train_cmd" $VB_dir/log/convert_ie.log \ - ivector-extractor-copy --binary=false \ - $ivec_extractor \ - $VB_dir/ie.tmp || exit 1; -fi - -if [ $stage -le 1 ]; then - # Convert txt to numpy format - python diarization/convert_VB_model.py $VB_dir/dubm.tmp $VB_dir/ie.tmp $VB_dir || exit 1; - - rm $VB_dir/dubm.tmp $VB_dir/ie.tmp || exit 1; -fi diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py deleted file mode 100755 index dae5599b8f1..00000000000 --- a/egs/callhome_diarization/v1/diarization/kaldi_io.py +++ /dev/null @@ -1,627 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. 
- """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. - """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
- - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! - if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.zeros((cols,rows), dtype='float32') - p0 = col_headers[:, 0].reshape(-1, 1) - p25 = col_headers[:, 1].reshape(-1, 1) - p75 = col_headers[:, 2].reshape(-1, 1) - p100 = col_headers[:, 3].reshape(-1, 1) - mask_0_64 = (data <= 64) - mask_193_255 = (data > 192) - mask_65_192 = (~(mask_0_64 | mask_193_255)) - - mat += (p0 + (p25 - p0) / 64. 
* data) * mask_0_64.astype(np.float32) - mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) - mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) - - return mat.T # transpose! col-major -> row-major, - - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. - - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. 
- """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). -# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. 
- """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 3dfb3ccb127..4ec3aa4f740 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -19,8 +19,8 @@ vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC stage=0 nnet_dir=exp/xvector_nnet_1a/ -num_components=1024 -ivector_dim=400 +num_components=1024 # the number of UBM components (used for VB resegmentation) +ivector_dim=400 # the dimension of i-vector (used for VB resegmentation) # Prepare datasets if [ $stage -le 0 ]; then @@ -376,10 +376,6 @@ if [ $stage -le 12 ]; then --num-threads 1 --num-processes 1 --nj 40 \ exp/diag_ubm_$num_components/final.dubm data/train \ exp/extractor_diag_c${num_components}_i${ivector_dim} - - # Converts diagonal UBM and ivector extractor to numpy array format - diarization/convert_VB_model.sh --cmd "$train_cmd" exp/diag_ubm_$num_components/final.dubm \ - exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie exp/VB fi if [ $stage -le 13 ]; then @@ -391,10 +387,11 @@ if [ $stage -le 13 ]; then # VB resegmentation. In this script, I use the x-vector result to # initialize the VB system. You can also use i-vector result or random - # initize the VB system. + # initize the VB system. The following script uses kaldi_io. 
+  # You could use `sh ../../../tools/extras/install_kaldi_io.sh` to install it.
   diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \
-    --initialize 1 \
-    data/callhome $init_rttm_file exp/VB exp/VB/diag_ubm.pkl exp/VB/ie.pkl || exit 1;
+    --initialize 1 data/callhome $init_rttm_file exp/VB \
+    exp/diag_ubm_$num_components/final.dubm exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie || exit 1;
 
   # Compute the DER after VB resegmentation
   mkdir -p exp/VB/results || exit 1;
diff --git a/tools/extras/install_kaldi_io.sh b/tools/extras/install_kaldi_io.sh
new file mode 100755
index 00000000000..e3192be78a8
--- /dev/null
+++ b/tools/extras/install_kaldi_io.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Install kaldi_io. Please refer to https://github.com/vesis84/kaldi-io-for-python
+# for details.
+
+python3 -m pip install --user kaldi_io
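Once kaldi_io is installed, a quick sanity check is to read a feature scp the way the resegmentation wrapper fills its feats_dict with utterance-to-matrix pairs; the scp path here is only an example:

    import kaldi_io

    # iterate (utterance-id, feature-matrix) pairs from a Kaldi script file
    for utt, mat in kaldi_io.read_mat_scp('data/callhome/feats.scp'):
        print(utt, mat.shape)   # (num-frames, feat-dim)
        break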