From bb411a143c242551c896e4a8b3fd7fe0f3d065b5 Mon Sep 17 00:00:00 2001 From: zhuang Date: Wed, 8 May 2019 17:52:06 -0400 Subject: [PATCH 1/5] add VB for callhome v2 --- .../v1/diarization/VB_diarization.py | 359 ++++++++++ .../v1/diarization/VB_resegmentation.py | 224 +++++++ .../v1/diarization/VB_resegmentation.sh | 102 +++ .../v1/diarization/convert_VB_model.py | 118 ++++ .../v1/diarization/convert_VB_model.sh | 53 ++ .../v1/diarization/kaldi_io.py | 627 ++++++++++++++++++ .../train_ivector_extractor_diag.sh | 151 +++++ .../v1/local/make_callhome.sh | 5 + egs/callhome_diarization/v2/run.sh | 51 +- src/ivectorbin/Makefile | 2 +- src/ivectorbin/ivector-extractor-copy.cc | 64 ++ 11 files changed, 1754 insertions(+), 2 deletions(-) create mode 100755 egs/callhome_diarization/v1/diarization/VB_diarization.py create mode 100755 egs/callhome_diarization/v1/diarization/VB_resegmentation.py create mode 100755 egs/callhome_diarization/v1/diarization/VB_resegmentation.sh create mode 100755 egs/callhome_diarization/v1/diarization/convert_VB_model.py create mode 100755 egs/callhome_diarization/v1/diarization/convert_VB_model.sh create mode 100755 egs/callhome_diarization/v1/diarization/kaldi_io.py create mode 100755 egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh create mode 100755 src/ivectorbin/ivector-extractor-copy.cc diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py new file mode 100755 index 00000000000..31af078efd2 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py @@ -0,0 +1,359 @@ +# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Revision History +# L. Burget 16/07/13 01:00AM - original version +# L. Burget 20/06/17 12:07AM - np.asarray replaced by .toarray() +# - minor bug fix in initializing q +# - minor bug fix in ELBO calculation +# - few more optimizations + +import numpy as np +from scipy.sparse import coo_matrix +import scipy.linalg as spl +import numexpr as ne # the dependency on this modul can be avoided by replacing + # logsumexp_ne and exp_ne with logsumexp and np.exp + +#[q sp Li] = +def VB_diarization(X, m, iE, w, V, sp=None, q=None, + maxSpeakers = 10, maxIters = 10, + epsilon = 1e-4, loopProb = 0.99, statScale = 1.0, + alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None, + plot=False, sparsityThr=0.001, llScale=1.0, minDur=1): + + """ + This a generalized version of speaker diarization described in: + + Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + Montreal, CRIM, May 2008. + + Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone + Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal + Processing, December 2010. + + The generalization introduced in this implementation lies in using an HMM + instead of the simple mixture model when modeling generation of segments + (or even frames) from speakers. 
HMM limits the probability of switching + between speakers when changing frames, which makes it possible to use + the model on frame-by-frame bases without any need to iterate between + 1) clustering speech segments and 2) re-segmentation (i.e. as it was done in + the paper above). + + Inputs: + X - T x D array, where columns are D dimensional feature vectors for T frames + m - C x D array of GMM component means + iE - C x D array of GMM component inverse covariance matrix diagonals + w - C dimensional column vector of GMM component weights + V - R x C x D array of eigenvoices + maxSpeakers - maximum number of speakers expected in the utterance + maxIters - maximum number of algorithm iterations + epsilon - stop iterating, if obj. fun. improvement is less than epsilon + loopProb - probability of not switching speakers between frames + statScale - scale sufficient statiscits collected using UBM + llScale - scale UBM likelihood (i.e. llScale < 1.0 make atribution of + frames to UBM componets more uncertain) + sparsityThr - set occupations smaller that this threshold to 0.0 (saves memory + as the posteriors are represented by sparse matrix) + alphaQInit - Dirichlet concentraion parameter for initializing q + downsample - perform diarization on input downsampled by this factor + VtiEV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when + VtiEV is None. However, it can be pre-calculated using function + precalculate_VtiEV(V) and used across calls of VB_diarization. + minDur - minimum number of frames between speaker turns imposed by linear + chains of HMM states corresponding to each speaker. All the states + in a chain share the same output distribution + ref - T dim. integer vector with reference speaker ID (0:maxSpeakers) + per frame + plot - if set to True, plot per-frame speaker posteriors. + + Outputs: + q - S x T matrix of posteriors attribution each frame to one of S possible + speakers, where S is given by opts.maxSpeakers + sp - S dimensional column vector of ML learned speaker priors. Ideally, these + should allow to estimate # of speaker in the utterance as the + probabilities of the redundant speaker should converge to zero. + Li - values of auxiliary function (and DER and frame cross-entropy between q + and reference if 'ref' is provided) over iterations. + """ + + # The references to equations corresponds to the technical report: + # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + # Montreal, CRIM, May 2008. + + D=X.shape[1] # feature dimensionality + C=len(w) # number of mixture components + R=V.shape[0] # subspace rank + nframes=X.shape[0] + + if VtiEV is None: + VtiEV = precalculate_VtiEV(V, iE) + + V = V.reshape(V.shape[0],-1) + + if sp is None: + sp = np.ones(maxSpeakers)/maxSpeakers + else: + maxSpeakers = len(sp) + + if q is None: + # initialize q from flat Dirichlet prior with concentrsaion parameter alphaQInit + q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) + q = q / q.sum(1, keepdims=True) + + # calculate UBM mixture frame posteriors (i.e. 
per-frame zero order statistics) + ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) + ll *= llScale + G = logsumexp_ne(ll, axis=1) + NN = exp_ne(ll - G[:,np.newaxis]) * statScale + NN[NN 0 and L - Li[-2][0] < epsilon: + if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') + break + + if downsample is not None: + #upsample resulting q to match number of frames in the input utterance + q = downsampler.T.dot(q) + + return q, sp, Li + + +def precalculate_VtiEV(V, iE): + tril_ind = np.tril_indices(V.shape[0]) + VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) + for c in range(V.shape[1]): + VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind] + return VtiEV + + +# Initialize q (per-frame speaker posteriors) from a reference +# (vector of per-frame zero based integer speaker IDs) +def frame_labels2posterior_mx(labels, maxSpeakers): + #initialize from reference + #pmx = np.zeros((len(labels), labels.max()+1)) + pmx = np.zeros((len(labels), maxSpeakers)) + pmx[np.arange(len(labels)), labels] = 1 + return pmx + +# Calculates Diarization Error Rate (DER) or per-frame cross-entropy between +# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame +# speaker posteriors). If expected=False, q is converted into hard labels before +# calculating DER. If expected=TRUE, posteriors in q are used to calculated +# "expected" DER. +def DER(q, ref, expected=True, xentropy=False): + from itertools import permutations + + if not expected: + # replce probabiities in q by zeros and ones + hard_labels = q.argmax(1) + q = np.zeros_like(q) + q[range(len(q)), hard_labels] = 1 + + err_mx = np.empty((ref.max()+1, q.shape[1])) + for s in range(err_mx.shape[0]): + tmpq = q[ref == s,:] + err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0) + + if err_mx.shape[0] < err_mx.shape[1]: + err_mx = err_mx.T + + # try all alignments (permutations) of reference and detected speaker + #could be written in more efficient way using dynamic programing + acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum() + for perm in permutations(range(err_mx.shape[0]))] + if xentropy: + return min(acc)/float(len(ref)) + else: + return (len(ref) - max(acc))/float(len(ref)) + + +############################################################################### +# Module private functions +############################################################################### +def logsumexp(x, axis=0): + xmax = x.max(axis) + x = xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis)) + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +# The folowing two functions are only versions optimized for speed using numexpr +# module and can be replaced by logsumexp and np.exp functions to avoid +# the dependency on the module. 
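As a standalone illustration (a sketch with made-up numbers, not part of the
patch), this is the log-sum-exp trick that logsumexp above and logsumexp_ne
below implement: subtracting the per-row maximum keeps the reduction finite
where the naive formula overflows.

    import numpy as np

    def logsumexp_stable(x, axis=0):
        # subtract the per-row max so exp() cannot overflow
        xmax = x.max(axis=axis, keepdims=True)
        return np.squeeze(xmax + np.log(np.exp(x - xmax).sum(axis=axis, keepdims=True)), axis=axis)

    x = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]])
    print(logsumexp_stable(x, axis=1))    # [1000.69314718  -999.30685282]
    print(np.log(np.exp(x).sum(axis=1)))  # [ inf -inf] -- the naive version over/underflows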
+def logsumexp_ne(x, axis=0): + xmax = np.array(x).max(axis=axis) + xmax_e = np.expand_dims(xmax, axis) + x = ne.evaluate("sum(exp(x - xmax_e), axis=%d)" % axis) + x = ne.evaluate("xmax + log(x)") + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +def exp_ne(x, out=None): + return ne.evaluate("exp(x)", out=None) + + +# Convert vector with lower-triangular coefficients into symetric matrix +def tril_to_sym(tril): + R = np.sqrt(len(tril)*2).astype(int) + tril_ind = np.tril_indices(R) + S = np.empty((R,R)) + S[tril_ind] = tril + S[tril_ind[::-1]] = tril + return S + + +def logdet(A): + return 2*np.sum(np.log(np.diag(spl.cholesky(A)))) + + +def forward_backward(lls, tr, ip): + """ + Inputs: + lls - matrix of per-frame log HMM state output probabilities + tr - transition probability matrix + ip - vector of initial state probabilities (i.e. statrting in the state) + Outputs: + sp - matrix of per-frame state occupation posteriors + tll - total (forward) log-likelihood + lfw - log forward probabilities + lfw - log backward probabilities + """ + ltr = np.log(tr) + lfw = np.empty_like(lls) + lbw = np.empty_like(lls) + lfw[:] = -np.inf + lbw[:] = -np.inf + lfw[0] = lls[0] + np.log(ip) + lbw[-1] = 0.0 + + for ii in xrange(1,len(lls)): + lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1) + + for ii in reversed(xrange(len(lls)-1)): + lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1) + + tll = logsumexp(lfw[-1]) + sp = np.exp(lfw + lbw - tll) + return sp, tll, lfw, lbw diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py new file mode 100755 index 00000000000..65b72d55a8e --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python + +# Copyright 2019 Zili Huang + +# This script is evoked by diarization/VB_resegmentation.sh. It is a wrapper +# for Variational Bayes resegmentation. It shows how to use the code from +# Brno University of Technology to do resegmentation. + +import numpy as np +import VB_diarization +import pickle +import kaldi_io +import sys +import argparse + +def get_utt_list(utt2spk_filename): + with open(utt2spk_filename, 'r') as fh: + content = fh.readlines() + utt_list = [line.split()[0] for line in content] + print("{} utterances in total".format(len(utt_list))) + return utt_list + +# prepare utt2num_frames dictionary +def get_utt2num_frames(utt2num_frames_filename): + utt2num_frames = {} + with open(utt2num_frames_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + utt2num_frames[line_split[0]] = int(line_split[1]) + return utt2num_frames + +def create_ref(uttname, utt2num_frames, full_rttm_filename): + num_frames = utt2num_frames[uttname] + + # We use 0 to denote silence frames and 1 to denote overlapping frames. 
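+    # For example, a hypothetical 6-frame utterance with speakers A and B
+    # (mapped to labels 2 and 3 below) would be encoded as
+    #   frames: [sil, A, A, A+B, B, sil]  ->  ref: [0, 2, 2, 1, 3, 0]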
+ ref = np.zeros(num_frames) + speaker_dict = {} + num_spk = 0 + + with open(full_rttm_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + uttname_line = line_split[1] + if uttname != uttname_line: + continue + start_time, duration = int(float(line_split[3]) * 100), int(float(line_split[4]) * 100) + end_time = start_time + duration + spkname = line_split[7] + if spkname not in speaker_dict.keys(): + spk_idx = num_spk + 2 + speaker_dict[spkname] = spk_idx + num_spk += 1 + + for i in range(start_time, end_time): + if i < 0: + raise ValueError("Time index less than 0") + elif i >= num_frames: + print("Time index exceeds number of frames") + break + else: + if ref[i] == 0: + ref[i] = speaker_dict[spkname] + else: + ref[i] = 1 # The overlapping speech is marked as 1. + return ref.astype(int) + +# create output rttm file +def create_rttm_output(uttname, predicted_label, output_dir, channel): + num_frames = len(predicted_label) + + start_idx = 0 + seg_list = [] + + last_label = predicted_label[0] + for i in range(num_frames): + if predicted_label[i] == last_label: # The speaker label remains the same. + continue + else: # The speaker label is different. + if last_label != 0: # Ignore the silence. + seg_list.append([start_idx, i, last_label]) + start_idx = i + last_label = predicted_label[i] + if last_label != 0: + seg_list.append([start_idx, num_frames, last_label]) + + with open("{}/{}_predict.rttm".format(output_dir, uttname), 'w') as fh: + for i in range(len(seg_list)): + start_frame = (seg_list[i])[0] + end_frame = (seg_list[i])[1] + label = (seg_list[i])[2] + duration = end_frame - start_frame + fh.write("SPEAKER {} {} {:.2f} {:.2f} {} \n".format(uttname, channel, start_frame / 100.0, duration / 100.0, label)) + return 0 + +def main(): + parser = argparse.ArgumentParser(description='VB Resegmentation Wrapper') + parser.add_argument('data_dir', type=str, help='Subset data directory') + parser.add_argument('init_rttm_filename', type=str, + help='The rttm file to initialize the VB system, usually the AHC cluster result') + parser.add_argument('output_dir', type=str, help='Output directory') + parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model') + parser.add_argument('ie_model', type=str, help='Path of the ivector extractor model') + + parser.add_argument('--true-rttm-filename', type=str, default="None", + help='The true rttm label file') + parser.add_argument('--max-speakers', type=int, default=10, + help='Maximum number of speakers expected in the utterance (default: 10)') + parser.add_argument('--max-iters', type=int, default=10, + help='Maximum number of algorithm iterations (default: 10)') + parser.add_argument('--downsample', type=int, default=25, + help='Perform diarization on input downsampled by this factor (default: 25)') + parser.add_argument('--alphaQInit', type=float, default=100.0, + help='Dirichlet concentraion parameter for initializing q') + parser.add_argument('--sparsityThr', type=float, default=0.001, + help='Set occupations smaller that this threshold to 0.0 (saves memory as \ + the posteriors are represented by sparse matrix)') + parser.add_argument('--epsilon', type=float, default=1e-6, + help='Stop iterating, if obj. fun. improvement is less than epsilon') + parser.add_argument('--minDur', type=int, default=1, + help='Minimum number of frames between speaker turns imposed by linear \ + chains of HMM states corresponding to each speaker. 
All the states \
+                             in a chain share the same output distribution')
+    parser.add_argument('--loopProb', type=float, default=0.9,
+                        help='Probability of not switching speakers between frames')
+    parser.add_argument('--statScale', type=float, default=0.2,
+                        help='Scale sufficient statistics collected using UBM')
+    parser.add_argument('--llScale', type=float, default=1.0,
+                        help='Scale UBM likelihood (i.e. llScale < 1.0 makes attribution of \
+                             frames to UBM components more uncertain)')
+    parser.add_argument('--channel', type=int, default=0,
+                        help='Channel information in the rttm file')
+    parser.add_argument('--initialize', type=int, default=1,
+                        help='Whether to initialize the speaker posterior')
+
+    args = parser.parse_args()
+    print(args)
+    init_rttm_filename = args.init_rttm_filename
+
+    utt_list = get_utt_list("{}/utt2spk".format(args.data_dir))
+    utt2num_frames = get_utt2num_frames("{}/utt2num_frames".format(args.data_dir))
+
+    # Load the diagonal UBM and i-vector extractor
+    with open(args.dubm_model, 'rb') as fh:
+        dubm_para = pickle.load(fh)
+    with open(args.ie_model, 'rb') as fh:
+        ie_para = pickle.load(fh)
+
+    assert '<WEIGHTS>' in dubm_para and '<MEANS_INVVARS>' in dubm_para and '<INV_VARS>' in dubm_para
+    DUBM_WEIGHTS, DUBM_MEANS_INVVARS, DUBM_INV_VARS = dubm_para['<WEIGHTS>'], dubm_para['<MEANS_INVVARS>'], dubm_para['<INV_VARS>']
+    assert 'M' in ie_para
+    IE_M = np.transpose(ie_para['M'], (2, 0, 1))
+
+    m = DUBM_MEANS_INVVARS / DUBM_INV_VARS
+    iE = DUBM_INV_VARS
+    w = DUBM_WEIGHTS
+    V = IE_M
+
+    # Load the MFCC features
+    feats_dict = {}
+    for key, mat in kaldi_io.read_mat_scp("{}/feats.scp".format(args.data_dir)):
+        feats_dict[key] = mat
+
+    for utt in utt_list:
+        # Get the alignments from the clustering result.
+        # In init_ref, 0 denotes the silence frames, 1 denotes the
+        # overlapping speech frames, and the speaker labels start from 2.
+        init_ref = create_ref(utt, utt2num_frames, init_rttm_filename)
+        # Ground truth of the diarization.
+        if args.true_rttm_filename != "None":
+            true_ref = create_ref(utt, utt2num_frames, args.true_rttm_filename)
+        else:
+            true_ref = None
+
+        X = feats_dict[utt]
+        X = X.astype(np.float64)
+
+        # Keep only the voiced frames (0 denotes the silence
+        # frames, 1 denotes the overlapping speech frames).
+        mask = (init_ref >= 2)
+        X_voiced = X[mask]
+        init_ref_voiced = init_ref[mask] - 2
+
+        if args.true_rttm_filename != "None":
+            true_ref_voiced = true_ref[mask] - 2
+            if np.sum(true_ref) == 0:
+                print("Warning: {} has no voiced frames in the label file".format(utt))
+                continue
+        if X_voiced.shape[0] == 0:
+            print("Warning: {} has no voiced frames in the initialization file".format(utt))
+            continue
+
+        # Initialize the posterior of each speaker based on the clustering result.
+        if args.initialize:
+            q = VB_diarization.frame_labels2posterior_mx(init_ref_voiced, args.max_speakers)
+        else:
+            q = None
+
+        # VB resegmentation
+
+        # q  - T x S matrix of posteriors attributing each frame to one of S possible
+        #      speakers, where S is given by opts.maxSpeakers
+        # sp - S dimensional column vector of ML-learned speaker priors. Ideally, these
+        #      should allow us to estimate the number of speakers in the utterance, as
+        #      the probabilities of the redundant speakers should converge to zero.
+        # Li - values of the auxiliary function (plus DER and frame cross-entropy between
+        #      q and the reference if 'ref' is provided) over the iterations.
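The routine invoked below returns the posterior matrix q described above;
turning it back into a full-utterance labelling, as the lines that follow do,
amounts to this self-contained sketch (toy shapes, not part of the patch):

    import numpy as np

    q_out = np.array([[0.9, 0.1],                # T x S posteriors (T=3 voiced frames,
                      [0.2, 0.8],                # S=2 speakers)
                      [0.6, 0.4]])
    mask = np.array([True, False, True, True])   # voiced frames in the full utterance

    predicted_voiced = q_out.argmax(axis=1) + 2  # speaker labels start at 2
    predicted = np.zeros(len(mask), dtype=int)   # non-voiced frames stay 0 (silence)
    predicted[mask] = predicted_voiced
    print(predicted)                             # [2 0 3 2]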
+        q_out, sp_out, L_out = VB_diarization.VB_diarization(X_voiced, m, iE, w, V, sp=None, q=q, maxSpeakers=args.max_speakers, maxIters=args.max_iters, VtiEV=None,
+                                                             downsample=args.downsample, alphaQInit=args.alphaQInit, sparsityThr=args.sparsityThr, epsilon=args.epsilon, minDur=args.minDur,
+                                                             loopProb=args.loopProb, statScale=args.statScale, llScale=args.llScale, ref=None, plot=False)
+        predicted_label_voiced = np.argmax(q_out, 1) + 2
+        predicted_label = (np.zeros(len(mask))).astype(int)
+        predicted_label[mask] = predicted_label_voiced
+
+        # Create the output rttm file
+        create_rttm_output(utt, predicted_label, args.output_dir, args.channel)
+    return 0
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
new file mode 100755
index 00000000000..7e448b4a580
--- /dev/null
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# Copyright 2019 Zili Huang
+
+# This script is a wrapper for Variational Bayes resegmentation.
+# It shows how to use the code from Brno University of Technology
+# to do resegmentation.
+
+# Begin configuration section.
+nj=20
+cmd=run.pl
+stage=0
+true_rttm_filename=None
+max_speakers=10
+max_iters=10
+downsample=25
+alphaQInit=100.0
+sparsityThr=0.001
+epsilon=1e-6
+minDur=1
+loopProb=0.9
+statScale=0.2
+llScale=1.0
+channel=0
+initialize=1
+# End configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 5 ]; then
+  echo "Usage: diarization/VB_resegmentation.sh <data_dir> <init_rttm_filename> <output_dir> <dubm_model> <ie_model>"
+  echo "Variational Bayes Re-segmentation"
+  echo "Options: "
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # How to run jobs."
+  echo "  --nj                                   # Number of parallel jobs to run."
+  echo "  --true-rttm-filename                   # The true rttm label file"
+  echo "  --max-speakers                         # Maximum number of speakers"
+  echo "                                         # expected in the utterance"
+  echo "                                         # (default: 10)"
+  echo "  --max-iters                            # Maximum number of algorithm"
+  echo "                                         # iterations (default: 10)"
+  echo "  --downsample                           # Perform diarization on input"
+  echo "                                         # downsampled by this factor"
+  echo "                                         # (default: 25)"
+  echo "  --alphaQInit                           # Dirichlet concentration"
+  echo "                                         # parameter for initializing q"
+  echo "  --sparsityThr                          # Set occupations smaller than"
+  echo "                                         # this threshold to 0.0 (saves"
+  echo "                                         # memory as the posteriors are"
+  echo "                                         # represented by sparse matrix)"
+  echo "  --epsilon                              # Stop iterating if obj. fun."
+  echo "                                         # improvement is less than"
+  echo "                                         # epsilon"
+  echo "  --minDur                               # Minimum number of frames"
+  echo "                                         # between speaker turns imposed"
+  echo "                                         # by linear chains of HMM"
+  echo "                                         # states corresponding to each"
+  echo "                                         # speaker. All the states in"
+  echo "                                         # a chain share the same output"
+  echo "                                         # distribution"
+  echo "  --loopProb                             # Probability of not switching"
+  echo "                                         # speakers between frames"
+  echo "  --statScale                            # Scale sufficient statistics"
+  echo "                                         # collected using UBM"
+  echo "  --llScale                              # Scale UBM likelihood (i.e."
+ echo " # llScale < 1.0 make" + echo " # attribution of frames to UBM" + echo " # componets more uncertain)" + echo " --channel # Channel information in the rttm file" + echo " --initialize # Whether to initalize the" + echo " # speaker posterior (if not)" + echo " # the speaker posterior will be" + echo " # randomly initilized" + + exit 1; +fi + +data_dir=$1 +init_rttm_filename=$2 +output_dir=$3 +dubm_model=$4 +ie_model=$5 + +mkdir -p $output_dir/tmp + +sdata=$data_dir/split$nj; +utils/split_data.sh $data_dir $nj || exit 1; + +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ + python diarization/VB_resegmentation.py --true-rttm-filename $true_rttm_filename --max-speakers $max_speakers \ + --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ + --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ + --loopProb $loopProb --statScale $statScale --llScale $llScale \ + --channel $channel --initialize $initialize \ + $sdata/JOB $init_rttm_filename $output_dir/tmp $dubm_model $ie_model || exit 1; + + cat $output_dir/tmp/* > $output_dir/rttm/VB_rttm +fi diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.py b/egs/callhome_diarization/v1/diarization/convert_VB_model.py new file mode 100755 index 00000000000..cf5da8bfa75 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# Copyright 2019 Zili Huang +# Apache 2.0 + +# This script is called by diarization/convert_VB_model.sh. +# It converts diagonal UBM and ivector extractor to numpy +# array format + +import numpy as np +import pickle +import sys + +def load_dubm(dubm_text): + para_dict = {} + with open(dubm_text, 'r') as fh: + content = fh.readlines() + state = 0 + data_array = [] + + for line in content: + line = line.strip('\n') + line_split = line.split() + if state == 0: + if len(line_split) == 1: + continue + elif len(line_split) == 2 and line_split[1] == "[": # Start of a multi-line matrix like and + para_name = line_split[0] + state = 1 + data_array = [] + elif len(line_split) >= 3 and line_split[1] == "[" and line_split[-1] == "]": # Single line vector like + para_name = line_split[0] + data_list = [] + for i in range(2, len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + para_dict[para_name] = data_list + else: + raise ValueError("Condition not defined.") + elif state == 1: + if line_split[-1] == "]": # End of a multi-line matrix like and + data_list = [] + for i in range(len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + data_array = np.array(data_array) + para_dict[para_name] = data_array + state = 0 + else: + data_list = [] + for i in range(len(line_split)): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + else: + raise ValueError("Condition not defined.") + return para_dict + +def load_ivector_extractor(ie_text): + para_dict = {} + with open(ie_text, 'r') as fh: + content = fh.readlines() + state = 0 + data_3dmatrix = [] + + for line in content: + line = line.strip('\n') + if line == " [": + break + if state == 0: + if not line.startswith(""): + continue + else: + state = 1 + data_matrix = [] + elif state == 1: + line_split = line.split() + if line_split[0] == "[": + data_matrix = [] + continue + elif line_split[-1] == "]": + data_array = [] + for i in range(len(line_split)-1): + 
data_array.append(float(line_split[i])) + data_matrix.append(data_array) + data_3dmatrix.append(data_matrix) + else: + data_array = [] + for i in range(len(line_split)): + data_array.append(float(line_split[i])) + data_matrix.append(data_array) + else: + raise ValueError("Condition not defined.") + para_dict['M'] = np.array(data_3dmatrix) + return para_dict + +def save_dict(para_dict, output_filename): + with open(output_filename, 'wb') as fh: + pickle.dump(para_dict, fh) + return 0 + +def main(): + dubm_model = sys.argv[1] + ivec_extractor_model = sys.argv[2] + output_dir = sys.argv[3] + + # the diagonal ubm parameter includes , , , + dubm_para = load_dubm(dubm_model) + save_dict(dubm_para, "{}/diag_ubm.pkl".format(output_dir)) + + # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim] + ie_para = load_ivector_extractor(ivec_extractor_model) + save_dict(ie_para, "{}/ie.pkl".format(output_dir)) + return 0 + +if __name__ == "__main__": + main() diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.sh b/egs/callhome_diarization/v1/diarization/convert_VB_model.sh new file mode 100755 index 00000000000..60a6c3687e0 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2019 Zili Huang +# Apache 2.0 + +# This script is part of VB resegmentation, it converts diagonal UBM and +# ivector extractor to numpy array format + +# begin configuration section. +stage=0 +cmd=run.pl + +# end configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1) # start script from part-way through" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes" + echo "e.g.:" + echo "$0 exp/diag_ubm_1024/final.dubm exp/extractor_diag_c1024_i128/final.ie exp/VB" + exit 1; +fi + +gmm_model=$1 +ivec_extractor=$2 +VB_dir=$3 + +if [ $stage -le 0 ]; then + # Dump the diagonal UBM model into txt format. + "$train_cmd" $VB_dir/log/convert_diag_ubm.log \ + gmm-global-copy --binary=false \ + $gmm_model \ + $VB_dir/dubm.tmp || exit 1; + + # Dump the ivector extractor model into txt format. + "$train_cmd" $VB_dir/log/convert_ie.log \ + ivector-extractor-copy --binary=false \ + $ivec_extractor \ + $VB_dir/ie.tmp || exit 1; +fi + +if [ $stage -le 1 ]; then + # Convert txt to numpy format + python diarization/convert_VB_model.py $VB_dir/dubm.tmp $VB_dir/ie.tmp $VB_dir || exit 1; + + rm $VB_dir/dubm.tmp $VB_dir/ie.tmp || exit 1; +fi diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py new file mode 100755 index 00000000000..dae5599b8f1 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/kaldi_io.py @@ -0,0 +1,627 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) +# Licensed under the Apache License, Version 2.0 (the "License") + +import numpy as np +import sys, os, re, gzip, struct + +################################################# +# Adding kaldi tools to shell path, + +# Select kaldi, +if not 'KALDI_ROOT' in os.environ: + # Default! 
To change run python with 'export KALDI_ROOT=/some_dir python' + os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' + +# Add kaldi tools to path, +os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] + + +################################################# +# Define all custom exceptions, +class UnsupportedDataType(Exception): pass +class UnknownVectorHeader(Exception): pass +class UnknownMatrixHeader(Exception): pass + +class BadSampleSize(Exception): pass +class BadInputFormat(Exception): pass + +class SubprocessFailed(Exception): pass + +################################################# +# Data-type independent helper functions, + +def open_or_fd(file, mode='rb'): + """ fd = open_or_fd(file) + Open file, gzipped file, pipe, or forward the file-descriptor. + Eventually seeks in the 'file' argument contains ':offset' suffix. + """ + offset = None + try: + # strip 'ark:' prefix from r{x,w}filename (optional), + if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): + (prefix,file) = file.split(':',1) + # separate offset from filename (optional), + if re.search(':[0-9]+$', file): + (file,offset) = file.rsplit(':',1) + # input pipe? + if file[-1] == '|': + fd = popen(file[:-1], 'rb') # custom, + # output pipe? + elif file[0] == '|': + fd = popen(file[1:], 'wb') # custom, + # is it gzipped? + elif file.split('.')[-1] == 'gz': + fd = gzip.open(file, mode) + # a normal file... + else: + fd = open(file, mode) + except TypeError: + # 'file' is opened file descriptor, + fd = file + # Eventually seek to offset, + if offset != None: fd.seek(int(offset)) + return fd + +# based on '/usr/local/lib/python3.4/os.py' +def popen(cmd, mode="rb"): + if not isinstance(cmd, str): + raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) + + import subprocess, io, threading + + # cleanup function for subprocesses, + def cleanup(proc, cmd): + ret = proc.wait() + if ret > 0: + raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) + return + + # text-mode, + if mode == "r": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdout) + elif mode == "w": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return io.TextIOWrapper(proc.stdin) + # binary, + elif mode == "rb": + proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdout + elif mode == "wb": + proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) + threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, + return proc.stdin + # sanity, + else: + raise ValueError("invalid mode %s" % mode) + + +def read_key(fd): + """ [key] = read_key(fd) + Read the utterance-key from the opened ark/stream descriptor 'fd'. 
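+  Keys are space-terminated tokens, so a stream positioned at
+  'utt1 <binary data>utt2 <binary data>' yields 'utt1' first, then 'utt2'
+  once the value has been consumed by the matching reader; None is
+  returned at end of file.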
+ """ + key = '' + while 1: + char = fd.read(1).decode("latin1") + if char == '' : break + if char == ' ' : break + key += char + key = key.strip() + if key == '': return None # end of file, + assert(re.match('^\S+$',key) != None) # check format (no whitespace!) + return key + + +################################################# +# Integer vectors (alignments, ...), + +def read_ali_ark(file_or_fd): + """ Alias to 'read_vec_int_ark()' """ + return read_vec_int_ark(file_or_fd) + +def read_vec_int_ark(file_or_fd): + """ generator(key,vec) = read_vec_int_ark(file_or_fd) + Create generator of (key,vector) tuples, which reads from the ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_int(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_int(file_or_fd): + """ [int-vec] = read_vec_int(file_or_fd) + Read kaldi integer vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + # Elements from int32 vector are sored in tuples: (sizeof(int32), value), + vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) + assert(vec[0]['size'] == 4) # int32 size, + ans = vec[:]['value'] # values are in 2nd column, + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=int) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_int(file_or_fd, v, key=''): + """ write_vec_int(f, v, key='') + Write a binary kaldi integer vector to filename or stream. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_int(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # dim, + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) + # data, + for i in range(len(v)): + fd.write('\4'.encode()) # int32 type, + fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float vectors (confidences, ivectors, ...), + +# Reading, +def read_vec_flt_scp(file_or_fd): + """ generator(key,mat) = read_vec_flt_scp(file_or_fd) + Returns generator of (key,vector) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the scp: + for key,vec in kaldi_io.read_vec_flt_scp(file): + ... 
+ + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + vec = read_vec_flt(rxfile) + yield key, vec + finally: + if fd is not file_or_fd : fd.close() + +def read_vec_flt_ark(file_or_fd): + """ generator(key,vec) = read_vec_flt_ark(file_or_fd) + Create generator of (key,vector) tuples, reading from an ark file/stream. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Read ark to a 'dictionary': + d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + ali = read_vec_flt(fd) + yield key, ali + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_vec_flt(file_or_fd): + """ [flt-vec] = read_vec_flt(file_or_fd) + Read kaldi float vector, ascii or binary input, + """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode() + if binary == '\0B': # binary flag + # Data type, + header = fd.read(3).decode() + if header == 'FV ': sample_size = 4 # floats + elif header == 'DV ': sample_size = 8 # doubles + else: raise UnknownVectorHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimension, + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim + # Read whole vector, + buf = fd.read(vec_size * sample_size) + if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + return ans + else: # ascii, + arr = (binary + fd.readline().decode()).strip().split() + try: + arr.remove('['); arr.remove(']') # optionally + except ValueError: + pass + ans = np.array(arr, dtype=float) + if fd is not file_or_fd : fd.close() # cleanup + return ans + +# Writing, +def write_vec_flt(file_or_fd, v, key=''): + """ write_vec_flt(f, v, key='') + Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename or opened file descriptor for writing, + v : the vector to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the vector. + + Example of writing single vector: + kaldi_io.write_vec_flt(filename, vec) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,vec in dict.iteritems(): + kaldi_io.write_vec_flt(f, vec, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if v.dtype == 'float32': fd.write('FV '.encode()) + elif v.dtype == 'float64': fd.write('DV '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) + # Dim, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim + # Data, + fd.write(v.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# Float matrices (features, transformations, ...), + +# Reading, +def read_mat_scp(file_or_fd): + """ generator(key,mat) = read_mat_scp(file_or_fd) + Returns generator of (key,matrix) tuples, read according to kaldi scp. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
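+
+  Each scp line is '<key> <rxfilename>', e.g. the (hypothetical) entry
+  'utt1 /data/feats.ark:1467' reads the matrix stored in that ark at byte
+  offset 1467 (open_or_fd handles the ':offset' suffix).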
+ + Iterate the scp: + for key,mat in kaldi_io.read_mat_scp(file): + ... + + Read scp to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } + """ + fd = open_or_fd(file_or_fd) + try: + for line in fd: + (key,rxfile) = line.decode().split(' ') + mat = read_mat(rxfile) + yield key, mat + finally: + if fd is not file_or_fd : fd.close() + +def read_mat_ark(file_or_fd): + """ generator(key,mat) = read_mat_ark(file_or_fd) + Returns generator of (key,matrix) tuples, read from ark file/stream. + file_or_fd : scp, gzipped scp, pipe or opened file descriptor. + + Iterate the ark: + for key,mat in kaldi_io.read_mat_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + mat = read_mat(fd) + yield key, mat + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_mat(file_or_fd): + """ [mat] = read_mat(file_or_fd) + Reads single kaldi matrix, supports ascii and binary. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + """ + fd = open_or_fd(file_or_fd) + try: + binary = fd.read(2).decode() + if binary == '\0B' : + mat = _read_mat_binary(fd) + else: + assert(binary == ' [') + mat = _read_mat_ascii(fd) + finally: + if fd is not file_or_fd: fd.close() + return mat + +def _read_mat_binary(fd): + # Data type + header = fd.read(3).decode() + # 'CM', 'CM2', 'CM3' are possible values, + if header.startswith('CM'): return _read_compressed_mat(fd, header) + elif header == 'FM ': sample_size = 4 # floats + elif header == 'DM ': sample_size = 8 # doubles + else: raise UnknownMatrixHeader("The header contained '%s'" % header) + assert(sample_size > 0) + # Dimensions + s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] + # Read whole matrix + buf = fd.read(rows * cols * sample_size) + if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') + elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') + else : raise BadSampleSize + mat = np.reshape(vec,(rows,cols)) + return mat + +def _read_mat_ascii(fd): + rows = [] + while 1: + line = fd.readline().decode() + if (len(line) == 0) : raise BadInputFormat # eof, should not happen! + if len(line.strip()) == 0 : continue # skip empty line + arr = line.strip().split() + if arr[-1] != ']': + rows.append(np.array(arr,dtype='float32')) # not last line + else: + rows.append(np.array(arr[:-1],dtype='float32')) # last line + mat = np.vstack(rows) + return mat + + +def _read_compressed_mat(fd, format): + """ Read a compressed matrix, + see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h + methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), + """ + assert(format == 'CM ') # The formats CM2, CM3 are not supported... + + # Format of header 'struct', + global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, + per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) + + # Read global header, + globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] + + # The data is structed as [Colheader, ... , Colheader, Data, Data , .... 
] + # { cols }{ size } + col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) + col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) + data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, + + mat = np.zeros((cols,rows), dtype='float32') + p0 = col_headers[:, 0].reshape(-1, 1) + p25 = col_headers[:, 1].reshape(-1, 1) + p75 = col_headers[:, 2].reshape(-1, 1) + p100 = col_headers[:, 3].reshape(-1, 1) + mask_0_64 = (data <= 64) + mask_193_255 = (data > 192) + mask_65_192 = (~(mask_0_64 | mask_193_255)) + + mat += (p0 + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32) + mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) + mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) + + return mat.T # transpose! col-major -> row-major, + + +# Writing, +def write_mat(file_or_fd, m, key=''): + """ write_mat(f, m, key='') + Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. + Arguments: + file_or_fd : filename of opened file descriptor for writing, + m : the matrix to be stored, + key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. + + Example of writing single matrix: + kaldi_io.write_mat(filename, mat) + + Example of writing arkfile: + with open(ark_file,'w') as f: + for key,mat in dict.iteritems(): + kaldi_io.write_mat(f, mat, key=key) + """ + fd = open_or_fd(file_or_fd, mode='wb') + if sys.version_info[0] == 3: assert(fd.mode == 'wb') + try: + if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), + fd.write('\0B'.encode()) # we write binary! + # Data-type, + if m.dtype == 'float32': fd.write('FM '.encode()) + elif m.dtype == 'float64': fd.write('DM '.encode()) + else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) + # Dims, + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows + fd.write('\04'.encode()) + fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols + # Data, + fd.write(m.tobytes()) + finally: + if fd is not file_or_fd : fd.close() + + +################################################# +# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) +# Corresponds to: vector > > +# - outer vector: time axis +# - inner vector: records at the time +# - tuple: int = index, float = value +# + +def read_cnet_ark(file_or_fd): + """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ + return read_post_ark(file_or_fd) + +def read_post_ark(file_or_fd): + """ generator(key,vec>) = read_post_ark(file) + Returns generator of (key,posterior) tuples, read from ark file. + file_or_fd : ark, gzipped ark, pipe or opened file descriptor. + + Iterate the ark: + for key,post in kaldi_io.read_post_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:post for key,post in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + post = read_post(fd) + yield key, post + key = read_key(fd) + finally: + if fd is not file_or_fd: fd.close() + +def read_post(file_or_fd): + """ [post] = read_post(file_or_fd) + Reads single kaldi 'Posterior' in binary format. 
+ + The 'Posterior' is C++ type 'vector > >', + the outer-vector is usually time axis, inner-vector are the records + at given time, and the tuple is composed of an 'index' (integer) + and a 'float-value'. The 'float-value' can represent a probability + or any other numeric value. + + Returns vector of vectors of tuples. + """ + fd = open_or_fd(file_or_fd) + ans=[] + binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag + assert(fd.read(1).decode() == '\4'); # int-size + outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + # Loop over 'outer-vector', + for i in range(outer_vec_size): + assert(fd.read(1).decode() == '\4'); # int-size + inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) + data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) + assert(data[0]['size_idx'] == 4) + assert(data[0]['size_post'] == 4) + ans.append(data[['idx','post']].tolist()) + + if fd is not file_or_fd: fd.close() + return ans + + +################################################# +# Kaldi Confusion Network bin begin/end times, +# (kaldi stores CNs time info separately from the Posterior). +# + +def read_cntime_ark(file_or_fd): + """ generator(key,vec>) = read_cntime_ark(file_or_fd) + Returns generator of (key,cntime) tuples, read from ark file. + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Iterate the ark: + for key,time in kaldi_io.read_cntime_ark(file): + ... + + Read ark to a 'dictionary': + d = { key:time for key,time in kaldi_io.read_post_ark(file) } + """ + fd = open_or_fd(file_or_fd) + try: + key = read_key(fd) + while key: + cntime = read_cntime(fd) + yield key, cntime + key = read_key(fd) + finally: + if fd is not file_or_fd : fd.close() + +def read_cntime(file_or_fd): + """ [cntime] = read_cntime(file_or_fd) + Reads single kaldi 'Confusion Network time info', in binary format: + C++ type: vector >. + (begin/end times of bins at the confusion network). + + Binary layout is ' ...' + + file_or_fd : file, gzipped file, pipe or opened file descriptor. + + Returns vector of tuples. 
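+
+  Example (hypothetical value): a 3-bin confusion network could yield
+  [(0.0, 0.42), (0.42, 1.1), (1.1, 1.57)].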
+ """ + fd = open_or_fd(file_or_fd) + binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary + + assert(fd.read(1).decode() == '\4'); # int-size + vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) + + data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) + assert(data[0]['size_beg'] == 4) + assert(data[0]['size_end'] == 4) + ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), + + if fd is not file_or_fd : fd.close() + return ans + + +################################################# +# Segments related, +# + +# Segments as 'Bool vectors' can be handy, +# - for 'superposing' the segmentations, +# - for frame-selection in Speaker-ID experiments, +def read_segments_as_bool_vec(segments_file): + """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) + using kaldi 'segments' file for 1 wav, format : ' ' + - t-beg, t-end is in seconds, + - assumed 100 frames/second, + """ + segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) + # Sanity checks, + assert(len(segs) > 0) # empty segmentation is an error, + assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, + # Convert time to frame-indexes, + start = np.rint([100 * rec[2] for rec in segs]).astype(int) + end = np.rint([100 * rec[3] for rec in segs]).astype(int) + # Taken from 'read_lab_to_bool_vec', htk.py, + frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], + np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) + assert np.sum(end-start) == np.sum(frms) + return frms + diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh new file mode 100755 index 00000000000..61f7e4a14e0 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2014 David Snyder +# 2019 Zili Huang +# Apache 2.0. + +# This script trains the i-vector extractor for VB resegmentation. This script is very similar to +# sid/train_ivector_extractor.sh. The only difference is that the UBM is assumed to be diagonal. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=4 # each job runs this many processes, each with --num-threads threads +cmd="run.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select +ivector_dim=400 # dimension of the extracted i-vector +use_weights=false # set to true to turn on the regression of log-weights on the ivector. +num_iters=10 +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method) +cleanup=true +apply_cmn=true # If true, apply sliding window cepstral mean normalization +posterior_scale=1.0 # This scale helps to control for successve features being highly + # correlated. E.g. try 0.1 or 0.3 +sum_accs_opt= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ubm_2048_male/final.dubm data/train_male exp/extractor_male" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" + echo " # sum-accs process to nfs server." + echo " --apply-cmn # if true, apply sliding window cepstral mean" + echo " # normalization to features" + exit 1; +fi + +gmm_model=$1 +data=$2 +dir=$3 +srcdir=$(dirname $gmm_model) + +for f in $gmm_model $data/feats.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full || exit 1; + +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if [ -f $srcdir/delta_opts ]; then + cp $srcdir/delta_opts $dir/ 2>/dev/null +fi + +parallel_opts="--num-threads $[$num_threads*$num_processes]" +## Set up features. +if $apply_cmn; then + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +else + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +fi + +# Initialize the i-vector extractor using the FGMM input +if [ $stage -le -2 ]; then + cp $gmm_model $dir/final.dubm || exit 1; + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1 +fi + +# Do Gaussian selection and posterior extracion + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/gselect.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$feats" ark:- \| \ + scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; +else + if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. 
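+    # e.g. with nj=10 and num_processes=4, nj_full=40 accumulator commands are
+    # built below; queue job g then feeds Args[4*(g-1)+1 .. 4*(g-1)+4] to one
+    # in-memory summer, so each of the nj queue jobs runs num_processes
+    # accumulators at once.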
+ for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd $parallel_opts $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie + fi + x=$[$x+1] +done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie +ln -s $x.ie $dir/final.ie diff --git a/egs/callhome_diarization/v1/local/make_callhome.sh b/egs/callhome_diarization/v1/local/make_callhome.sh index caa8f679f22..21411fb6194 100755 --- a/egs/callhome_diarization/v1/local/make_callhome.sh +++ b/egs/callhome_diarization/v1/local/make_callhome.sh @@ -70,4 +70,9 @@ utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \ > $data_dir/callhome2/reco2num_spk +rm $data_dir/callhome/segments || exit 1; +awk '{print $1, $1}' $data_dir/callhome/wav.scp > $data_dir/callhome/utt2spk +utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt +utils/fix_data_dir.sh $data_dir/callhome + rm -rf $tmp_dir 2> /dev/null diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index b79717e2348..8eb2d93f600 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -19,6 +19,8 @@ vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC stage=0 nnet_dir=exp/xvector_nnet_1a/ +num_components=1024 +ivector_dim=400 # Prepare datasets if [ $stage -le 0 ]; then @@ -53,7 +55,7 @@ if [ $stage -le 1 ]; then # callhome1 and callhome2. Each partition is treated like a held-out # dataset, and used to estimate various quantities needed to perform # diarization on the other part (and vice versa). - for name in train callhome1 callhome2; do + for name in train callhome1 callhome2 callhome; do steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \ --cmd "$train_cmd" --write-utt2num-frames true \ data/$name exp/make_mfcc $mfccdir @@ -356,3 +358,50 @@ if [ $stage -le 11 ]; then # Compare to 8.69% in ../v1/run.sh echo "Using the oracle number of speakers, DER: $der%" fi + +# Variational Bayes resegmentation using the code from Brno University of Technology +# Please see https://speech.fit.vutbr.cz/software/vb-diarization-eigenvoice-and-hmm-priors +# for details +if [ $stage -le 12 ]; then + utils/subset_data_dir.sh data/train 32000 data/train_32k + # Train the diagonal UBM. 
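+  # (This stage trains the diagonal UBM and i-vector extractor below and ends
+  # with exp/VB/diag_ubm.pkl and exp/VB/ie.pkl, the numpy-format models the
+  # VB wrapper consumes in stage 13.)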
+ sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --num-threads 8 --subsample 1 --delta-order 0 --apply-cmn false \ + data/train_32k $num_components exp/diag_ubm_$num_components + + # Train the i-vector extractor. The UBM is assumed to be diagonal. + diarization/train_ivector_extractor_diag.sh \ + --cmd "$train_cmd --mem 35G" \ + --ivector-dim $ivector_dim --num-iters 5 --apply-cmn false \ + --num-threads 1 --num-processes 1 --nj 40 \ + exp/diag_ubm_$num_components/final.dubm data/train \ + exp/extractor_diag_c${num_components}_i${ivector_dim} + + # Converts diagonal UBM and ivector extractor to numpy array format + diarization/convert_VB_model.sh --cmd "$train_cmd" exp/diag_ubm_$num_components/final.dubm \ + exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie exp/VB +fi + +if [ $stage -le 13 ]; then + output_rttm_dir=exp/VB/rttm + mkdir -p $output_rttm_dir || exit 1; + cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores/rttm > $output_rttm_dir/x_vector_rttm + init_rttm_file=$output_rttm_dir/x_vector_rttm + + # VB resegmentation. In this script, I use the x-vector result to + # initialize the VB system. You can also use i-vector result or random + # initize the VB system. + diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \ + --true_rttm_filename "None" --initialize 1 \ + data/callhome $init_rttm_file exp/VB exp/VB/diag_ubm.pkl exp/VB/ie.pkl || exit 1; + + # Compute the DER after VB resegmentation + mkdir -p exp/VB/results || exit 1; + md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s $output_rttm_dir/VB_rttm 2> exp/VB/log/VB_DER.log \ + > exp/VB/results/VB_DER.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + exp/VB/results/VB_DER.txt) + # After VB resegmentation, DER: 6.48% + echo "After VB resegmentation, DER: $der%" +fi diff --git a/src/ivectorbin/Makefile b/src/ivectorbin/Makefile index 5a738352d9c..8dc3498b83b 100644 --- a/src/ivectorbin/Makefile +++ b/src/ivectorbin/Makefile @@ -4,7 +4,7 @@ all: EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -BINFILES = ivector-extractor-init ivector-extractor-acc-stats \ +BINFILES = ivector-extractor-init ivector-extractor-copy ivector-extractor-acc-stats \ ivector-extractor-sum-accs ivector-extractor-est \ ivector-extract compute-vad select-voiced-frames \ compute-vad-from-frame-likes merge-vads \ diff --git a/src/ivectorbin/ivector-extractor-copy.cc b/src/ivectorbin/ivector-extractor-copy.cc new file mode 100755 index 00000000000..858981e6164 --- /dev/null +++ b/src/ivectorbin/ivector-extractor-copy.cc @@ -0,0 +1,64 @@ +// ivectorbin/ivector-extractor-init.cc + +// Copyright 2019 Zili Huang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "ivector/ivector-extractor.h"
+
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using kaldi::int32;
+
+    const char *usage =
+        "Copy an ivector-extractor\n"
+        "Usage: ivector-extractor-copy [options] <ivector-extractor-in> <ivector-extractor-out>\n"
+        "e.g.:\n"
+        " ivector-extractor-copy --binary=false 0.ie 0_txt.ie\n";
+
+    bool binary = true;
+    IvectorExtractorOptions ivector_opts;
+    ParseOptions po(usage);
+    po.Register("binary", &binary, "Write output in binary mode");
+    ivector_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string ivector_extractor_rxfilename = po.GetArg(1),
+        ivector_extractor_wxfilename = po.GetArg(2);
+
+    IvectorExtractor extractor;
+    ReadKaldiObject(ivector_extractor_rxfilename, &extractor);
+
+    WriteKaldiObject(extractor, ivector_extractor_wxfilename, binary);
+
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+

From 57cd7723d1bff09b8357c7df7da8698df075089a Mon Sep 17 00:00:00 2001
From: zhuang
Date: Wed, 8 May 2019 18:30:35 -0400
Subject: [PATCH 2/5] remove true_rttm_filename option

---
 .../v1/diarization/VB_diarization.py          | 19 ++++++-------------
 .../v1/diarization/VB_resegmentation.py       | 17 +++--------------
 .../v1/diarization/VB_resegmentation.sh       |  4 +---
 egs/callhome_diarization/v2/run.sh            |  2 +-
 4 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 31af078efd2..5ac7691fd86 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -203,19 +203,12 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
       Li[-1] += [DER(downsampler.T.dot(q), ref), DER(downsampler.T.dot(q), ref, xentropy=True)]
 
     if plot:
parser.add_argument('--max-speakers', type=int, default=10, help='Maximum number of speakers expected in the utterance (default: 10)') parser.add_argument('--max-iters', type=int, default=10, @@ -170,14 +168,10 @@ def main(): # 1 denotes the overlapping speech frames, the speaker # label starts from 2. init_ref = create_ref(utt, utt2num_frames, init_rttm_filename) - # Ground truth of the diarization. - if args.true_rttm_filename != "None": - true_ref = create_ref(utt, utt2num_frames, args.true_rttm_filename) - else: - true_ref = None - X = feats_dict[utt] - X = X.astype(np.float64) + # load MFCC features + X = (feats_dict[utt]).astype(np.float64) + assert len(init_ref) == len(X) # Keep only the voiced frames (0 denotes the silence # frames, 1 denotes the overlapping speech frames). @@ -185,11 +179,6 @@ def main(): X_voiced = X[mask] init_ref_voiced = init_ref[mask] - 2 - if args.true_rttm_filename != "None": - true_ref_voiced = true_ref[mask] - 2 - if np.sum(true_ref) == 0: - print("Warning: {} has no voiced frames in the label file".format(utt)) - continue if X_voiced.shape[0] == 0: print("Warning: {} has no voiced frames in the initialization file".format(utt)) continue diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh index 7e448b4a580..53ad10ddd4c 100755 --- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh @@ -10,7 +10,6 @@ nj=20 cmd=run.pl stage=0 -true_rttm_filename=None max_speakers=10 max_iters=10 downsample=25 @@ -36,7 +35,6 @@ if [ $# != 5 ]; then echo "Options: " echo " --cmd (utils/run.pl|utils/queue.pl ) # How to run jobs." echo " --nj # Number of parallel jobs to run." - echo " --true-rttm-filename # The true rttm label file" echo " --max-speakers # Maximum number of speakers" echo " # expected in the utterance" echo " # (default: 10)" @@ -91,7 +89,7 @@ utils/split_data.sh $data_dir $nj || exit 1; if [ $stage -le 0 ]; then $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ - python diarization/VB_resegmentation.py --true-rttm-filename $true_rttm_filename --max-speakers $max_speakers \ + python diarization/VB_resegmentation.py --max-speakers $max_speakers \ --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ --loopProb $loopProb --statScale $statScale --llScale $llScale \ diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 8eb2d93f600..3dfb3ccb127 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -393,7 +393,7 @@ if [ $stage -le 13 ]; then # initialize the VB system. You can also use i-vector result or random # initize the VB system. 
  diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \
-    --true_rttm_filename "None" --initialize 1 \
+    --initialize 1 \
     data/callhome $init_rttm_file exp/VB exp/VB/diag_ubm.pkl exp/VB/ie.pkl || exit 1;
 
   # Compute the DER after VB resegmentation

From d7104be5967b474373cbba25da8819a4faca0b9e Mon Sep 17 00:00:00 2001
From: zhuang
Date: Wed, 8 May 2019 20:27:50 -0400
Subject: [PATCH 3/5] add some comments

---
 .../v1/diarization/VB_resegmentation.py            | 8 +++++---
 .../v1/diarization/train_ivector_extractor_diag.sh | 4 ++--
 src/ivectorbin/ivector-extractor-copy.cc           | 4 ++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
index b90200778bc..d89a082a4b3 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
@@ -2,9 +2,11 @@
 
 # Copyright 2019 Zili Huang
 
-# This script is evoked by diarization/VB_resegmentation.sh. It is a wrapper
-# for Variational Bayes resegmentation. It shows how to use the code from
-# Brno University of Technology to do resegmentation.
+# This script is invoked by diarization/VB_resegmentation.sh. It prepares the necessary
+# inputs for the VB system and creates the output RTTM file. The inputs include the data
+# directory (data_dir), the rttm file to initialize the VB system (init_rttm_filename), the
+# directory for the output rttm prediction (output_dir), the path to the diagonal UBM model
+# (dubm_model) and the path to the i-vector extractor model (ie_model).
 
 import numpy as np
 import VB_diarization
diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh
index 61f7e4a14e0..9254012f3b0 100755
--- a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh
+++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh
@@ -5,8 +5,8 @@
 #           2019 Zili Huang
 # Apache 2.0.
 
-# This script trains the i-vector extractor for VB resegmentation. This script is very similar to
-# sid/train_ivector_extractor.sh. The only difference is that the UBM is assumed to be diagonal.
+# This script trains the i-vector extractor for VB resegmentation. It is very similar to
+# sid/train_ivector_extractor.sh except that the UBM is assumed to be diagonal in this script.
 
 # Begin configuration section.
 nj=10   # this is the number of separate queue jobs we run, but each one
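The comment above is worth unpacking for readers who have not met the model: per UBM component, the extractor being trained is one factor-loading matrix, and convert_VB_model.py (PATCH 5 below) returns exactly this as 'M', a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim]. A minimal numpy sketch with the recipe defaults num_components=1024 and ivector_dim=400, an assumed feature dimension of 24, and purely illustrative variable names:

    import numpy as np

    C, D, R = 1024, 24, 400      # UBM components, feature dim, i-vector dim
    M = np.zeros((C, D, R))      # per-component factor-loading matrices ('M')
    w = np.zeros(R)              # an i-vector for one recording
    offsets = M.dot(w)           # (C, D): the mean shift the i-vector induces per component

One i-vector thus perturbs all component means at once, which is what lets a single 400-dimensional vector summarize a speaker.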
diff --git a/src/ivectorbin/ivector-extractor-copy.cc b/src/ivectorbin/ivector-extractor-copy.cc
index 858981e6164..f04a6d20120 100755
--- a/src/ivectorbin/ivector-extractor-copy.cc
+++ b/src/ivectorbin/ivector-extractor-copy.cc
@@ -1,4 +1,4 @@
-// ivectorbin/ivector-extractor-init.cc
+// ivectorbin/ivector-extractor-copy.cc
 
 // Copyright 2019 Zili Huang
 
@@ -29,7 +29,7 @@ int main(int argc, char *argv[]) {
     using kaldi::int32;
 
     const char *usage =
-        "Copy an ivector-extractor\n"
+        "Copy the i-vector extractor to a text file\n"
         "Usage: ivector-extractor-copy [options] <ivector-extractor-in> <ivector-extractor-out>\n"
        "e.g.:\n"
        " ivector-extractor-copy --binary=false 0.ie 0_txt.ie\n";
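PATCH 4 below rewrites the UBM frame-posterior lines of VB_diarization.py. The expression they normalize is the standard diagonal-covariance GMM log-likelihood; the following self-contained numpy rendering of that 'll = ...' line (toy sizes, random data, every name local to this example) may help in checking the algebra:

    import numpy as np

    T, D, C = 5, 4, 3                     # frames, feature dim, GMM components
    rng = np.random.default_rng(0)
    X = rng.standard_normal((T, D))       # feature matrix
    m = rng.standard_normal((C, D))       # component means
    iE = np.ones((C, D))                  # inverse variances (diagonal covariances)
    w = np.full(C, 1.0 / C)               # component weights

    # same algebra as the 'll = ...' line in VB_diarization.py:
    ll = (X**2).dot(-0.5*iE.T) + X.dot((iE*m).T) \
         - 0.5*((iE*m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
    print(ll.shape)                       # (T, C) log-likelihoods, incl. log-weights

Row-normalizing ll in the log domain (the G/NN lines the hunk edits) then gives the per-frame component posteriors.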
From 7ba2444dd6f2392d19d56cf2762329230092437e Mon Sep 17 00:00:00 2001
From: zhuang
Date: Thu, 9 May 2019 11:45:31 -0400
Subject: [PATCH 4/5] fix python problem

---
 egs/callhome_diarization/v1/diarization/VB_diarization.py | 8 ++++----
 .../v1/diarization/VB_resegmentation.sh                   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 5ac7691fd86..5cc5135ec4a 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -23,8 +23,8 @@
 import numpy as np
 from scipy.sparse import coo_matrix
 import scipy.linalg as spl
-import numexpr as ne # the dependency on this modul can be avoided by replacing
-                     # logsumexp_ne and exp_ne with logsumexp and np.exp
+#import numexpr as ne # the dependency on this module can be avoided by replacing
+#                     # logsumexp_ne and exp_ne with logsumexp and np.exp
 
 #[q sp Li] =
 def VB_diarization(X, m, iE, w, V, sp=None, q=None,
@@ -115,8 +115,8 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
   # calculate UBM mixture frame posteriors (i.e. per-frame zero order statistics)
   ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi))
   ll *= llScale
-  G = logsumexp_ne(ll, axis=1)
-  NN = exp_ne(ll - G[:,np.newaxis]) * statScale
+  G = logsumexp(ll, axis=1)
+  NN = np.exp(ll - G[:,np.newaxis]) * statScale
   NN[NN<sparsityThr] = 0.0
   #Kx = np.sum(NN * (np.log(w) - np.log(NN)), 1)
   NN = coo_matrix(NN) # represent zero-order stats using sparse matrix
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
index 53ad10ddd4c..5be0b8a587f 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
@@ -89,7 +89,7 @@ utils/split_data.sh $data_dir $nj || exit 1;
 
 if [ $stage -le 0 ]; then
   $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \
-    python diarization/VB_resegmentation.py --max-speakers $max_speakers \
+    python2 diarization/VB_resegmentation.py --max-speakers $max_speakers \
     --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \
     --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \
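The plain logsumexp/np.exp pair introduced above is the usual numerically-stable posterior normalization; VB_diarization.py keeps its own logsumexp helper (none of its imports provide one), and the sketch below is only an equivalent reference version, not the module's code — scipy.special.logsumexp would do the same job:

    import numpy as np

    def logsumexp(x, axis=0):
        xmax = x.max(axis)                # subtract the max so exp() cannot overflow
        return xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis))

    ll = np.log(np.array([[0.2, 0.8], [0.5, 0.5]]))  # toy per-frame log-likelihoods
    G = logsumexp(ll, axis=1)                        # per-frame normalizers
    NN = np.exp(ll - G[:, np.newaxis])               # posteriors; each row sums to 1
    print(NN.sum(axis=1))                            # -> [1. 1.]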
From: zhuang
Date: Thu, 30 May 2019 22:01:10 -0400
Subject: [PATCH 5/5] fix python problem, avoid pickle conversion

---
 .../v1/diarization/VB_diarization.py          |  11 +-
 .../v1/diarization/VB_resegmentation.py       |  17 +-
 .../v1/diarization/VB_resegmentation.sh       |  21 +-
 .../v1/diarization/convert_VB_model.py        |  47 +-
 .../v1/diarization/convert_VB_model.sh        |  53 --
 .../v1/diarization/kaldi_io.py                | 627 ------------------
 egs/callhome_diarization/v2/run.sh            |  15 +-
 tools/extras/install_kaldi_io.sh              |   6 +
 8 files changed, 57 insertions(+), 740 deletions(-)
 delete mode 100755 egs/callhome_diarization/v1/diarization/convert_VB_model.sh
 delete mode 100755 egs/callhome_diarization/v1/diarization/kaldi_io.py
 create mode 100755 tools/extras/install_kaldi_io.sh

diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py
index 5cc5135ec4a..62676d64510 100755
--- a/egs/callhome_diarization/v1/diarization/VB_diarization.py
+++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -121,7 +122,7 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
 
   #Kx = np.sum(NN * (np.log(w) - np.log(NN)), 1)
   NN = coo_matrix(NN) # represent zero-order stats using sparse matrix
-  print 'Sparsity: ', len(NN.row), float(len(NN.row))/np.prod(NN.shape)
+  print('Sparsity: ', len(NN.row), float(len(NN.row))/np.prod(NN.shape))
   LL = np.sum(G) # total log-likelihod as calculated using UBM
 
   mixture_sum = coo_matrix((np.ones(C*D), (np.repeat(range(C),D), range(C*D))), shape=(C, C*D))
@@ -141,7 +142,7 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
   if downsample is not None:
     # Downsample NN, VtiEF, G and q by summing the statistic over 'downsample' frames
     # This speeds-up diarization for the price of lowering its frame resolution
-    downsampler = coo_matrix((np.ones(nframes, dtype=np.int64), ((np.ceil(np.arange(nframes)/downsample)).astype(int), np.arange(nframes))), shape=(int(np.ceil(1.0 * nframes / downsample)), nframes))
+    downsampler = coo_matrix((np.ones(nframes, dtype=np.int64), ((np.ceil(np.arange(nframes)/downsample)).astype(int), np.arange(nframes))), shape=(int(np.ceil((nframes - 1.0) / downsample)) + 1, nframes))
     NN = downsampler.dot(NN)
     VtiEF = downsampler.dot(VtiEF)
     G = downsampler.dot(G)
@@ -210,7 +211,7 @@ def VB_diarization(X, m, iE, w, V, sp=None, q=None,
       matplotlib.pyplot.imshow(np.atleast_2d(ref), interpolation='none', aspect='auto',
                                cmap=matplotlib.pyplot.cm.Pastel1, extent=(0, len(ref), -0.05, 1.05))
 
-    print ii, Li[-2]
+    print(ii, Li[-2])
 
 
     if ii > 0 and L - Li[-2][0] < epsilon:
@@ -341,10 +342,10 @@ def forward_backward(lls, tr, ip):
   lfw[0] = lls[0] + np.log(ip)
   lbw[-1] = 0.0
 
-  for ii in xrange(1,len(lls)):
+  for ii in range(1,len(lls)):
     lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1)
 
-  for ii in reversed(xrange(len(lls)-1)):
+  for ii in reversed(range(len(lls)-1)):
     lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1)
 
   tll = logsumexp(lfw[-1])
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
index d89a082a4b3..1e1d7191ec9 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # Copyright 2019 Zili Huang
 
@@ -10,10 +10,9 @@
 
 import numpy as np
 import VB_diarization
-import pickle
 import kaldi_io
-import sys
 import argparse
+from convert_VB_model import load_dubm, load_ivector_extractor
 
 def get_utt_list(utt2spk_filename):
     with open(utt2spk_filename, 'r') as fh:
@@ -105,7 +104,7 @@ def main():
                         help='The rttm file to initialize the VB system, usually the AHC cluster result')
     parser.add_argument('output_dir', type=str, help='Output directory')
     parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model')
-    parser.add_argument('ie_model', type=str, help='Path of the ivector extractor model')
+    parser.add_argument('ie_model', type=str, help='Path of the i-vector extractor model')
 
     parser.add_argument('--max-speakers', type=int, default=10,
                         help='Maximum number of speakers expected in the utterance (default: 10)')
@@ -138,17 +137,15 @@ def main():
     args = parser.parse_args()
     print(args)
 
-    init_rttm_filename = args.init_rttm_filename
-
     utt_list = get_utt_list("{}/utt2spk".format(args.data_dir))
     utt2num_frames = get_utt2num_frames("{}/utt2num_frames".format(args.data_dir))
 
     # Load the diagonal UBM and i-vector extractor
-    with open(args.dubm_model, 'rb') as fh:
-        dubm_para = pickle.load(fh)
-    with open(args.ie_model, 'rb') as fh:
-        ie_para = pickle.load(fh)
+    dubm_para = load_dubm(args.dubm_model)
+    ie_para = load_ivector_extractor(args.ie_model)
+
+    # Check the diagonal UBM and i-vector extractor model
     assert '<WEIGHTS>' in dubm_para and '<MEANS_INVVARS>' in dubm_para and '<INV_VARS>' in dubm_para
     DUBM_WEIGHTS, DUBM_MEANS_INVVARS, DUBM_INV_VARS = dubm_para['<WEIGHTS>'], dubm_para['<MEANS_INVVARS>'], dubm_para['<INV_VARS>']
     assert 'M' in ie_para
 
@@ -169,7 +166,7 @@ def main():
         # In init_ref, 0 denotes the silence frames
         # 1 denotes the overlapping speech frames, the speaker
         # label starts from 2.
-        init_ref = create_ref(utt, utt2num_frames, init_rttm_filename)
+        init_ref = create_ref(utt, utt2num_frames, args.init_rttm_filename)
 
         # load MFCC features
         X = (feats_dict[utt]).astype(np.float64)
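The hunks above rely on create_ref()'s frame-label convention — 0 for silence, 1 for overlapped speech, speaker labels starting from 2 — which is why subtracting 2 after masking out non-speech frames yields 0-based speaker indices. A toy illustration with made-up labels (the mask construction itself lies outside these hunks; keeping only labels >= 2 is what the subtraction assumes):

    import numpy as np

    init_ref = np.array([0, 0, 2, 2, 1, 3, 3, 0])  # hypothetical per-frame labels
    mask = init_ref >= 2                           # drop silence (0) and overlap (1)
    init_ref_voiced = init_ref[mask] - 2           # 0-based speaker ids for the VB code
    print(init_ref_voiced)                         # -> [0 0 1 1]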
diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
index 5be0b8a587f..765c4eee8b8 100755
--- a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
+++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh
@@ -88,13 +88,28 @@ sdata=$data_dir/split$nj;
 utils/split_data.sh $data_dir $nj || exit 1;
 
 if [ $stage -le 0 ]; then
+  # Dump the diagonal UBM model into txt format.
+  $cmd $output_dir/log/convert_diag_ubm.log \
+    gmm-global-copy --binary=false \
+    $dubm_model \
+    $output_dir/tmp/dubm.tmp || exit 1;
+
+  # Dump the i-vector extractor model into txt format.
+  $cmd $output_dir/log/convert_ie.log \
+    ivector-extractor-copy --binary=false \
+    $ie_model \
+    $output_dir/tmp/ie.tmp || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  # VB resegmentation
   $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \
-    python2 diarization/VB_resegmentation.py --max-speakers $max_speakers \
+    python3 diarization/VB_resegmentation.py --max-speakers $max_speakers \
     --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \
     --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \
     --loopProb $loopProb --statScale $statScale --llScale $llScale \
     --channel $channel --initialize $initialize \
-    $sdata/JOB $init_rttm_filename $output_dir/tmp $dubm_model $ie_model || exit 1;
+    $sdata/JOB $init_rttm_filename $output_dir/tmp $output_dir/tmp/dubm.tmp $output_dir/tmp/ie.tmp || exit 1;
 
-  cat $output_dir/tmp/* > $output_dir/rttm/VB_rttm
+  cat $output_dir/tmp/*.rttm > $output_dir/rttm/VB_rttm
 fi
diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.py b/egs/callhome_diarization/v1/diarization/convert_VB_model.py
index cf5da8bfa75..b1f25b0dbfd 100755
--- a/egs/callhome_diarization/v1/diarization/convert_VB_model.py
+++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.py
@@ -3,21 +3,21 @@
 # Copyright 2019 Zili Huang
 # Apache 2.0
 
-# This script is called by diarization/convert_VB_model.sh.
-# It converts diagonal UBM and ivector extractor to numpy
-# array format
+# This script loads the diagonal UBM and i-vector extractor from text files.
 
+import os
 import numpy as np
-import pickle
-import sys
 
 def load_dubm(dubm_text):
+    assert os.path.exists(dubm_text)
+
     para_dict = {}
-    with open(dubm_text, 'r') as fh:
-        content = fh.readlines()
     state = 0
     data_array = []
+    with open(dubm_text, 'r') as fh:
+        content = fh.readlines()
+
     for line in content:
         line = line.strip('\n')
         line_split = line.split()
@@ -55,15 +55,18 @@ def load_dubm(dubm_text):
                 data_array.append(data_list)
             else:
                 raise ValueError("Condition not defined.")
-    return para_dict
+    return para_dict # the diagonal ubm parameter includes <GCONSTS>, <WEIGHTS>, <MEANS_INVVARS>, <INV_VARS>
 
 def load_ivector_extractor(ie_text):
+    assert os.path.exists(ie_text)
+
     para_dict = {}
-    with open(ie_text, 'r') as fh:
-        content = fh.readlines()
     state = 0
     data_3dmatrix = []
+    with open(ie_text, 'r') as fh:
+        content = fh.readlines()
+
     for line in content:
         line = line.strip('\n')
         if line == " [":
@@ -93,26 +96,4 @@ def load_ivector_extractor(ie_text):
         else:
             raise ValueError("Condition not defined.")
     para_dict['M'] = np.array(data_3dmatrix)
-    return para_dict
-
-def save_dict(para_dict, output_filename):
-    with open(output_filename, 'wb') as fh:
-        pickle.dump(para_dict, fh)
-    return 0
-
-def main():
-    dubm_model = sys.argv[1]
-    ivec_extractor_model = sys.argv[2]
-    output_dir = sys.argv[3]
-
-    # the diagonal ubm parameter includes <GCONSTS>, <WEIGHTS>, <MEANS_INVVARS>, <INV_VARS>
-    dubm_para = load_dubm(dubm_model)
-    save_dict(dubm_para, "{}/diag_ubm.pkl".format(output_dir))
-
-    # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim]
-    ie_para = load_ivector_extractor(ivec_extractor_model)
-    save_dict(ie_para, "{}/ie.pkl".format(output_dir))
-    return 0
-
-if __name__ == "__main__":
-    main()
+    return para_dict # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim]
diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.sh b/egs/callhome_diarization/v1/diarization/convert_VB_model.sh
deleted file mode 100755
index 60a6c3687e0..00000000000
---
a/egs/callhome_diarization/v1/diarization/convert_VB_model.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Zili Huang -# Apache 2.0 - -# This script is part of VB resegmentation, it converts diagonal UBM and -# ivector extractor to numpy array format - -# begin configuration section. -stage=0 -cmd=run.pl - -# end configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [options] " - echo " Options:" - echo " --stage (0|1) # start script from part-way through" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes" - echo "e.g.:" - echo "$0 exp/diag_ubm_1024/final.dubm exp/extractor_diag_c1024_i128/final.ie exp/VB" - exit 1; -fi - -gmm_model=$1 -ivec_extractor=$2 -VB_dir=$3 - -if [ $stage -le 0 ]; then - # Dump the diagonal UBM model into txt format. - "$train_cmd" $VB_dir/log/convert_diag_ubm.log \ - gmm-global-copy --binary=false \ - $gmm_model \ - $VB_dir/dubm.tmp || exit 1; - - # Dump the ivector extractor model into txt format. - "$train_cmd" $VB_dir/log/convert_ie.log \ - ivector-extractor-copy --binary=false \ - $ivec_extractor \ - $VB_dir/ie.tmp || exit 1; -fi - -if [ $stage -le 1 ]; then - # Convert txt to numpy format - python diarization/convert_VB_model.py $VB_dir/dubm.tmp $VB_dir/ie.tmp $VB_dir || exit 1; - - rm $VB_dir/dubm.tmp $VB_dir/ie.tmp || exit 1; -fi diff --git a/egs/callhome_diarization/v1/diarization/kaldi_io.py b/egs/callhome_diarization/v1/diarization/kaldi_io.py deleted file mode 100755 index dae5599b8f1..00000000000 --- a/egs/callhome_diarization/v1/diarization/kaldi_io.py +++ /dev/null @@ -1,627 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright 2014-2016 Brno University of Technology (author: Karel Vesely) -# Licensed under the Apache License, Version 2.0 (the "License") - -import numpy as np -import sys, os, re, gzip, struct - -################################################# -# Adding kaldi tools to shell path, - -# Select kaldi, -if not 'KALDI_ROOT' in os.environ: - # Default! To change run python with 'export KALDI_ROOT=/some_dir python' - os.environ['KALDI_ROOT']='/mnt/matylda5/iveselyk/Tools/kaldi-trunk' - -# Add kaldi tools to path, -os.environ['PATH'] = os.popen('echo $KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/').readline().strip() + ':' + os.environ['PATH'] - - -################################################# -# Define all custom exceptions, -class UnsupportedDataType(Exception): pass -class UnknownVectorHeader(Exception): pass -class UnknownMatrixHeader(Exception): pass - -class BadSampleSize(Exception): pass -class BadInputFormat(Exception): pass - -class SubprocessFailed(Exception): pass - -################################################# -# Data-type independent helper functions, - -def open_or_fd(file, mode='rb'): - """ fd = open_or_fd(file) - Open file, gzipped file, pipe, or forward the file-descriptor. - Eventually seeks in the 'file' argument contains ':offset' suffix. 
- """ - offset = None - try: - # strip 'ark:' prefix from r{x,w}filename (optional), - if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file): - (prefix,file) = file.split(':',1) - # separate offset from filename (optional), - if re.search(':[0-9]+$', file): - (file,offset) = file.rsplit(':',1) - # input pipe? - if file[-1] == '|': - fd = popen(file[:-1], 'rb') # custom, - # output pipe? - elif file[0] == '|': - fd = popen(file[1:], 'wb') # custom, - # is it gzipped? - elif file.split('.')[-1] == 'gz': - fd = gzip.open(file, mode) - # a normal file... - else: - fd = open(file, mode) - except TypeError: - # 'file' is opened file descriptor, - fd = file - # Eventually seek to offset, - if offset != None: fd.seek(int(offset)) - return fd - -# based on '/usr/local/lib/python3.4/os.py' -def popen(cmd, mode="rb"): - if not isinstance(cmd, str): - raise TypeError("invalid cmd type (%s, expected string)" % type(cmd)) - - import subprocess, io, threading - - # cleanup function for subprocesses, - def cleanup(proc, cmd): - ret = proc.wait() - if ret > 0: - raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret)) - return - - # text-mode, - if mode == "r": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdout) - elif mode == "w": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return io.TextIOWrapper(proc.stdin) - # binary, - elif mode == "rb": - proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdout - elif mode == "wb": - proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE) - threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread, - return proc.stdin - # sanity, - else: - raise ValueError("invalid mode %s" % mode) - - -def read_key(fd): - """ [key] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. - """ - key = '' - while 1: - char = fd.read(1).decode("latin1") - if char == '' : break - if char == ' ' : break - key += char - key = key.strip() - if key == '': return None # end of file, - assert(re.match('^\S+$',key) != None) # check format (no whitespace!) - return key - - -################################################# -# Integer vectors (alignments, ...), - -def read_ali_ark(file_or_fd): - """ Alias to 'read_vec_int_ark()' """ - return read_vec_int_ark(file_or_fd) - -def read_vec_int_ark(file_or_fd): - """ generator(key,vec) = read_vec_int_ark(file_or_fd) - Create generator of (key,vector) tuples, which reads from the ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_int_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_int(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_int(file_or_fd): - """ [int-vec] = read_vec_int(file_or_fd) - Read kaldi integer vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Elements from int32 vector are sored in tuples: (sizeof(int32), value), - vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size) - assert(vec[0]['size'] == 4) # int32 size, - ans = vec[:]['value'] # values are in 2nd column, - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=int) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_int(file_or_fd, v, key=''): - """ write_vec_int(f, v, key='') - Write a binary kaldi integer vector to filename or stream. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_int(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # dim, - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v.shape[0])) - # data, - for i in range(len(v)): - fd.write('\4'.encode()) # int32 type, - fd.write(struct.pack(np.dtype('int32').char, v[i])) # binary, - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float vectors (confidences, ivectors, ...), - -# Reading, -def read_vec_flt_scp(file_or_fd): - """ generator(key,mat) = read_vec_flt_scp(file_or_fd) - Returns generator of (key,vector) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,vec in kaldi_io.read_vec_flt_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - vec = read_vec_flt(rxfile) - yield key, vec - finally: - if fd is not file_or_fd : fd.close() - -def read_vec_flt_ark(file_or_fd): - """ generator(key,vec) = read_vec_flt_ark(file_or_fd) - Create generator of (key,vector) tuples, reading from an ark file/stream. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. 
- - Read ark to a 'dictionary': - d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - ali = read_vec_flt(fd) - yield key, ali - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_vec_flt(file_or_fd): - """ [flt-vec] = read_vec_flt(file_or_fd) - Read kaldi float vector, ascii or binary input, - """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode() - if binary == '\0B': # binary flag - # Data type, - header = fd.read(3).decode() - if header == 'FV ': sample_size = 4 # floats - elif header == 'DV ': sample_size = 8 # doubles - else: raise UnknownVectorHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimension, - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim - # Read whole vector, - buf = fd.read(vec_size * sample_size) - if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - return ans - else: # ascii, - arr = (binary + fd.readline().decode()).strip().split() - try: - arr.remove('['); arr.remove(']') # optionally - except ValueError: - pass - ans = np.array(arr, dtype=float) - if fd is not file_or_fd : fd.close() # cleanup - return ans - -# Writing, -def write_vec_flt(file_or_fd, v, key=''): - """ write_vec_flt(f, v, key='') - Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename or opened file descriptor for writing, - v : the vector to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the vector. - - Example of writing single vector: - kaldi_io.write_vec_flt(filename, vec) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,vec in dict.iteritems(): - kaldi_io.write_vec_flt(f, vec, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if v.dtype == 'float32': fd.write('FV '.encode()) - elif v.dtype == 'float64': fd.write('DV '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype) - # Dim, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, v.shape[0])) # dim - # Data, - fd.write(v.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# Float matrices (features, transformations, ...), - -# Reading, -def read_mat_scp(file_or_fd): - """ generator(key,mat) = read_mat_scp(file_or_fd) - Returns generator of (key,matrix) tuples, read according to kaldi scp. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. - - Iterate the scp: - for key,mat in kaldi_io.read_mat_scp(file): - ... - - Read scp to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_scp(file) } - """ - fd = open_or_fd(file_or_fd) - try: - for line in fd: - (key,rxfile) = line.decode().split(' ') - mat = read_mat(rxfile) - yield key, mat - finally: - if fd is not file_or_fd : fd.close() - -def read_mat_ark(file_or_fd): - """ generator(key,mat) = read_mat_ark(file_or_fd) - Returns generator of (key,matrix) tuples, read from ark file/stream. - file_or_fd : scp, gzipped scp, pipe or opened file descriptor. 
- - Iterate the ark: - for key,mat in kaldi_io.read_mat_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:mat for key,mat in kaldi_io.read_mat_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - mat = read_mat(fd) - yield key, mat - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_mat(file_or_fd): - """ [mat] = read_mat(file_or_fd) - Reads single kaldi matrix, supports ascii and binary. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - """ - fd = open_or_fd(file_or_fd) - try: - binary = fd.read(2).decode() - if binary == '\0B' : - mat = _read_mat_binary(fd) - else: - assert(binary == ' [') - mat = _read_mat_ascii(fd) - finally: - if fd is not file_or_fd: fd.close() - return mat - -def _read_mat_binary(fd): - # Data type - header = fd.read(3).decode() - # 'CM', 'CM2', 'CM3' are possible values, - if header.startswith('CM'): return _read_compressed_mat(fd, header) - elif header == 'FM ': sample_size = 4 # floats - elif header == 'DM ': sample_size = 8 # doubles - else: raise UnknownMatrixHeader("The header contained '%s'" % header) - assert(sample_size > 0) - # Dimensions - s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0] - # Read whole matrix - buf = fd.read(rows * cols * sample_size) - if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32') - elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64') - else : raise BadSampleSize - mat = np.reshape(vec,(rows,cols)) - return mat - -def _read_mat_ascii(fd): - rows = [] - while 1: - line = fd.readline().decode() - if (len(line) == 0) : raise BadInputFormat # eof, should not happen! - if len(line.strip()) == 0 : continue # skip empty line - arr = line.strip().split() - if arr[-1] != ']': - rows.append(np.array(arr,dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1],dtype='float32')) # last line - mat = np.vstack(rows) - return mat - - -def _read_compressed_mat(fd, format): - """ Read a compressed matrix, - see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h - methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...), - """ - assert(format == 'CM ') # The formats CM2, CM3 are not supported... - - # Format of header 'struct', - global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written, - per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')]) - - # Read global header, - globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0] - - # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ] - # { cols }{ size } - col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols) - col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32) - data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major, - - mat = np.zeros((cols,rows), dtype='float32') - p0 = col_headers[:, 0].reshape(-1, 1) - p25 = col_headers[:, 1].reshape(-1, 1) - p75 = col_headers[:, 2].reshape(-1, 1) - p100 = col_headers[:, 3].reshape(-1, 1) - mask_0_64 = (data <= 64) - mask_193_255 = (data > 192) - mask_65_192 = (~(mask_0_64 | mask_193_255)) - - mat += (p0 + (p25 - p0) / 64. 
* data) * mask_0_64.astype(np.float32) - mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32) - mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32) - - return mat.T # transpose! col-major -> row-major, - - -# Writing, -def write_mat(file_or_fd, m, key=''): - """ write_mat(f, m, key='') - Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats. - Arguments: - file_or_fd : filename of opened file descriptor for writing, - m : the matrix to be stored, - key (optional) : used for writing ark-file, the utterance-id gets written before the matrix. - - Example of writing single matrix: - kaldi_io.write_mat(filename, mat) - - Example of writing arkfile: - with open(ark_file,'w') as f: - for key,mat in dict.iteritems(): - kaldi_io.write_mat(f, mat, key=key) - """ - fd = open_or_fd(file_or_fd, mode='wb') - if sys.version_info[0] == 3: assert(fd.mode == 'wb') - try: - if key != '' : fd.write((key+' ').encode("latin1")) # ark-files have keys (utterance-id), - fd.write('\0B'.encode()) # we write binary! - # Data-type, - if m.dtype == 'float32': fd.write('FM '.encode()) - elif m.dtype == 'float64': fd.write('DM '.encode()) - else: raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype) - # Dims, - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[0])) # rows - fd.write('\04'.encode()) - fd.write(struct.pack(np.dtype('uint32').char, m.shape[1])) # cols - # Data, - fd.write(m.tobytes()) - finally: - if fd is not file_or_fd : fd.close() - - -################################################# -# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...) -# Corresponds to: vector > > -# - outer vector: time axis -# - inner vector: records at the time -# - tuple: int = index, float = value -# - -def read_cnet_ark(file_or_fd): - """ Alias of function 'read_post_ark()', 'cnet' = confusion network """ - return read_post_ark(file_or_fd) - -def read_post_ark(file_or_fd): - """ generator(key,vec>) = read_post_ark(file) - Returns generator of (key,posterior) tuples, read from ark file. - file_or_fd : ark, gzipped ark, pipe or opened file descriptor. - - Iterate the ark: - for key,post in kaldi_io.read_post_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:post for key,post in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - post = read_post(fd) - yield key, post - key = read_key(fd) - finally: - if fd is not file_or_fd: fd.close() - -def read_post(file_or_fd): - """ [post] = read_post(file_or_fd) - Reads single kaldi 'Posterior' in binary format. - - The 'Posterior' is C++ type 'vector > >', - the outer-vector is usually time axis, inner-vector are the records - at given time, and the tuple is composed of an 'index' (integer) - and a 'float-value'. The 'float-value' can represent a probability - or any other numeric value. - - Returns vector of vectors of tuples. 
- """ - fd = open_or_fd(file_or_fd) - ans=[] - binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag - assert(fd.read(1).decode() == '\4'); # int-size - outer_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - # Loop over 'outer-vector', - for i in range(outer_vec_size): - assert(fd.read(1).decode() == '\4'); # int-size - inner_vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of records for frame (or bin) - data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size) - assert(data[0]['size_idx'] == 4) - assert(data[0]['size_post'] == 4) - ans.append(data[['idx','post']].tolist()) - - if fd is not file_or_fd: fd.close() - return ans - - -################################################# -# Kaldi Confusion Network bin begin/end times, -# (kaldi stores CNs time info separately from the Posterior). -# - -def read_cntime_ark(file_or_fd): - """ generator(key,vec>) = read_cntime_ark(file_or_fd) - Returns generator of (key,cntime) tuples, read from ark file. - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Iterate the ark: - for key,time in kaldi_io.read_cntime_ark(file): - ... - - Read ark to a 'dictionary': - d = { key:time for key,time in kaldi_io.read_post_ark(file) } - """ - fd = open_or_fd(file_or_fd) - try: - key = read_key(fd) - while key: - cntime = read_cntime(fd) - yield key, cntime - key = read_key(fd) - finally: - if fd is not file_or_fd : fd.close() - -def read_cntime(file_or_fd): - """ [cntime] = read_cntime(file_or_fd) - Reads single kaldi 'Confusion Network time info', in binary format: - C++ type: vector >. - (begin/end times of bins at the confusion network). - - Binary layout is ' ...' - - file_or_fd : file, gzipped file, pipe or opened file descriptor. - - Returns vector of tuples. 
- """ - fd = open_or_fd(file_or_fd) - binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary - - assert(fd.read(1).decode() == '\4'); # int-size - vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # number of frames (or bins) - - data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size) - assert(data[0]['size_beg'] == 4) - assert(data[0]['size_end'] == 4) - ans = data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end), - - if fd is not file_or_fd : fd.close() - return ans - - -################################################# -# Segments related, -# - -# Segments as 'Bool vectors' can be handy, -# - for 'superposing' the segmentations, -# - for frame-selection in Speaker-ID experiments, -def read_segments_as_bool_vec(segments_file): - """ [ bool_vec ] = read_segments_as_bool_vec(segments_file) - using kaldi 'segments' file for 1 wav, format : ' ' - - t-beg, t-end is in seconds, - - assumed 100 frames/second, - """ - segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1) - # Sanity checks, - assert(len(segs) > 0) # empty segmentation is an error, - assert(len(np.unique([rec[1] for rec in segs ])) == 1) # segments with only 1 wav-file, - # Convert time to frame-indexes, - start = np.rint([100 * rec[2] for rec in segs]).astype(int) - end = np.rint([100 * rec[3] for rec in segs]).astype(int) - # Taken from 'read_lab_to_bool_vec', htk.py, - frms = np.repeat(np.r_[np.tile([False,True], len(end)), False], - np.r_[np.c_[start - np.r_[0, end[:-1]], end-start].flat, 0]) - assert np.sum(end-start) == np.sum(frms) - return frms - diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 3dfb3ccb127..4ec3aa4f740 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -19,8 +19,8 @@ vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC stage=0 nnet_dir=exp/xvector_nnet_1a/ -num_components=1024 -ivector_dim=400 +num_components=1024 # the number of UBM components (used for VB resegmentation) +ivector_dim=400 # the dimension of i-vector (used for VB resegmentation) # Prepare datasets if [ $stage -le 0 ]; then @@ -376,10 +376,6 @@ if [ $stage -le 12 ]; then --num-threads 1 --num-processes 1 --nj 40 \ exp/diag_ubm_$num_components/final.dubm data/train \ exp/extractor_diag_c${num_components}_i${ivector_dim} - - # Converts diagonal UBM and ivector extractor to numpy array format - diarization/convert_VB_model.sh --cmd "$train_cmd" exp/diag_ubm_$num_components/final.dubm \ - exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie exp/VB fi if [ $stage -le 13 ]; then @@ -391,10 +387,11 @@ if [ $stage -le 13 ]; then # VB resegmentation. In this script, I use the x-vector result to # initialize the VB system. You can also use i-vector result or random - # initize the VB system. + # initize the VB system. The following script uses kaldi_io. 
+  # You could use `sh ../../../tools/extras/install_kaldi_io.sh` to install it.
   diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \
-    --initialize 1 \
-    data/callhome $init_rttm_file exp/VB exp/VB/diag_ubm.pkl exp/VB/ie.pkl || exit 1;
+    --initialize 1 data/callhome $init_rttm_file exp/VB \
+    exp/diag_ubm_$num_components/final.dubm exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie || exit 1;
 
   # Compute the DER after VB resegmentation
   mkdir -p exp/VB/results || exit 1;
diff --git a/tools/extras/install_kaldi_io.sh b/tools/extras/install_kaldi_io.sh
new file mode 100755
index 00000000000..e3192be78a8
--- /dev/null
+++ b/tools/extras/install_kaldi_io.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Install kaldi_io. Please refer to https://github.com/vesis84/kaldi-io-for-python
+# for details.
+
+python3 -m pip install --user kaldi_io
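Once kaldi_io is installed, a quick sanity check is to read a feature scp the way the resegmentation wrapper fills its feats_dict with utterance-to-matrix pairs; the scp path here is only an example:

    import kaldi_io

    # iterate (utterance-id, feature-matrix) pairs from a Kaldi script file
    for utt, mat in kaldi_io.read_mat_scp('data/callhome/feats.scp'):
        print(utt, mat.shape)   # (num-frames, feat-dim)
        break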