# kaldi_io.py

# Copyright 2014    Yajie Miao    Carnegie Mellon University
#           2015    Yun Wang      Carnegie Mellon University

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import sys
# Print numpy arrays in full (no "..." summarisation) and without line
# wrapping. NaN is rejected as a threshold by recent numpy releases, so the
# documented "no summarisation" value sys.maxsize is used instead.
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf)
import gzip
import feat
import scipy.io.wavfile as wav
import struct
from collections import OrderedDict
import os
from shutil import copyfile

def read_alignments(filename):
	"""Read a gzip-compressed, text-format alignment file generated by Kaldi.

	Every non-blank line holds a key followed by whitespace-separated
	integers. Blank lines are skipped.

	Args:
		filename: path to the gzipped alignment file

	Returns:
		a dict mapping the first token of each line to a numpy array that
		holds the remaining tokens converted to int

	Raises:
		ValueError: if a token after the key cannot be converted to int
	"""
	alignments = {}
	with gzip.open(filename, 'rb') as f:
		for line in f:
			# gzip yields bytes on Python 3; decode so the keys are native
			# strings on both Python 2 and Python 3
			if not isinstance(line, str):
				line = line.decode('utf-8')

			# split on any whitespace so that a missing trailing space or
			# repeated separators do not break the integer conversion
			fields = line.split()
			if not fields:
				continue

			# build a real list: on Python 3 map() is lazy and np.asarray
			# would wrap the iterator in a 0-d object array
			alignments[fields[0]] = np.asarray([int(x) for x in fields[1:]])
	return alignments

def read_segments(filename):
	"""Read a segments file as used in Kaldi.

	Each line is split on single spaces into: segment, utterance, begin, end.

	Args:
		filename: path to the segments file

	Returns:
		an OrderedDict (in order of first appearance) mapping each utterance
		to a list of (segment, begin, end) tuples, begin and end being floats
	"""
	segments = OrderedDict()
	with open(filename) as segments_file:
		for raw_line in segments_file:
			fields = raw_line.replace('\n', '').split(' ')
			utt_id = fields[1]
			entry = (fields[0], float(fields[2]), float(fields[3]))
			# group the segments per utterance, keeping file order
			segments.setdefault(utt_id, []).append(entry)
	return segments

def read_wavfiles(filename):
	"""Read a wav.scp file as used in Kaldi.

	A line with exactly two space-separated fields is treated as
	"utterance filename". Any other non-blank line is treated as an extended
	filename: everything after the first field and its separator is kept
	verbatim. Blank lines are skipped.

	Args:
		filename: path to the wav.scp file

	Returns:
		an OrderedDict mapping each utterance to a tuple
		(filename or extended filename, flag that is True when extended)
	"""
	wavfiles = OrderedDict()
	with open(filename) as f:
		for line in f:
			# strip only the line terminator; slicing off the last character
			# unconditionally would eat a real character when the final line
			# has no trailing newline
			line = line.rstrip('\r\n')
			if not line:
				continue

			data = line.split(' ')
			if len(data) == 2: #wav.scp contains filenames
				wavfiles[data[0]] = (data[1], False)
			else: #wav.scp contains extended filenames
				wavfiles[data[0]] = (line[len(data[0])+1:], True)
	return wavfiles
	
def read_utt2spk(filename):
	"""Read a utt2spk file as used in Kaldi.

	Each line is split on single spaces; the first field is used as the
	utterance and the second field as its speaker.

	Args:
		filename: path to the utt2spk file

	Returns:
		a dict mapping every utterance to its speaker
	"""
	with open(filename) as utt2spk_file:
		rows = [entry.replace('\n', '').split(' ') for entry in utt2spk_file]

	# later lines overwrite earlier ones for a repeated utterance
	return dict((row[0], row[1]) for row in rows)

def _format_kaldi_vector(values):
	"""Return the values as space-separated floats, e.g. '1.0 0.0 0.0'."""
	return ' '.join(repr(float(value)) for value in values)


def create_dummy(gmm_dir, nnet_dir, feat_dir, num_labels):
	"""Create a dummy neural net for Kaldi with the transition model of a gmm.

	The gmm in gmm_dir/final.mdl is converted to text with the Kaldi tool
	gmm-copy. Its lines up to and including the closing TransitionModel tag
	are copied into a text nnet, followed by a single affine component with
	an identity weight matrix and a zero bias, and a trailing vector of ones.
	The text nnet is converted to nnet_dir/final.mdl with nnet-am-copy, the
	temporary text files are removed, and the tree, utt2spk, spk2utt, text
	and wav.scp files are copied into nnet_dir together with an empty
	cmvn_opts file.

	Args:
		gmm_dir: directory of the gmm model
		nnet_dir: directory where the nnet should end up
		feat_dir: directory of the testing features
		num_labels: size of the (square) identity weight matrix

	Raises:
		ValueError: if num_labels is smaller than 1
	"""
	if num_labels < 1:
		raise ValueError('num_labels must be at least 1, got %s' % num_labels)

	gmm_filename = gmm_dir + '/final.mdl'
	gmm_textname = gmm_dir + '/final.txt'
	nnet_filename = nnet_dir + '/final.mdl'
	nnet_textname = nnet_dir + '/final.txt'

	#copy the gmm to text format using kaldi
	# NOTE(review): the exit status is not checked and the paths are not
	# quoted, so paths containing spaces or shell characters will not work
	os.system("gmm-copy --binary=false %s %s" % (gmm_filename, gmm_textname))

	idmat = np.identity(num_labels, np.float32)

	with open(gmm_textname, 'r') as gmm, open(nnet_textname, 'w') as nnet:
		#read the transition model from the gmm model and write it in the nnet model
		for line in gmm:
			nnet.write(line)
			if line == '</TransitionModel> \n':
				break

		#create an identity weight matrix and write it in nnet model
		# rows are formatted explicitly so the output does not depend on the
		# global numpy print options (truncation / line wrapping)
		nnet.write('<Nnet> <NumComponents> 1 <Components>\n')
		nnet.write('<AffineComponentPreconditionedOnline> <LearningRate> 0 <LinearParams>  [\n')
		for i in range(num_labels):
			# the last row closes the matrix; every row, including the last
			# one, is taken from its own index in the identity matrix
			terminator = ' ]\n' if i == num_labels - 1 else ' \n'
			nnet.write(' ' + _format_kaldi_vector(idmat[i,:]) + terminator)

		#add zero bias params
		nnet.write('<BiasParams>  [ ' + _format_kaldi_vector(np.zeros([num_labels,], np.float32)) + ' ]\n')
		nnet.write('<RankIn> 20 <RankOut> 80 <UpdatePeriod> 4 <NumSamplesHistory> 2000 <Alpha> 4 <MaxChangePerSample> 0.075 </AffineComponentPreconditionedOnline>\n')

		# trailing vector of ones (presumably the priors of the acoustic model)
		nnet.write('</Components> </Nnet>  [ ' + _format_kaldi_vector(np.ones([num_labels,], np.float32)) + ' ]')

	#copy the nnet to binary format using kaldi
	os.system("nnet-am-copy --binary=true %s %s" % (nnet_textname, nnet_filename))

	#delete the text gmm and nnet
	os.remove(gmm_textname)
	os.remove(nnet_textname)

	#copy some kaldi files to nnet dir
	copyfile(gmm_dir + '/tree', nnet_dir + '/tree')
	copyfile(feat_dir + '/utt2spk', nnet_dir + '/utt2spk')
	copyfile(feat_dir + '/spk2utt', nnet_dir + '/spk2utt')
	copyfile(feat_dir + '/text', nnet_dir + '/text')
	copyfile(feat_dir + '/wav.scp', nnet_dir + '/wav.scp')

	#create an empty cmvn_opts file
	with open(nnet_dir + '/cmvn_opts', 'w+'):
		pass
	
	
# Class to read Kaldi features. Each time, it reads one line of the .scp file
# and reads in the corresponding features into a numpy matrix. It only supports
# binary-formatted .ark files. Text and compressed .ark files are not supported.
# this function has been adapted from pdnn toolkit (see licence at the top of this file)(https://github.com/yajiemiao/pdnn)
class KaldiReadIn(object):
	"""Reader for float matrices stored in binary Kaldi .ark files.

	The reader keeps two parallel lists, utt_ids and scp_data (one
	(ark path, byte offset) pair per utterance), and a position scp_position
	that points at the next utterance to be read.
	"""

	def __init__(self, scp_path, scp_data = None, utt_ids = None):
		"""Create a reader from an .scp file or from already parsed data.

		Args:
			scp_path: path to the .scp file; only read when both scp_data
				and utt_ids are None
			scp_data: optional list of (ark path, offset) pairs
			utt_ids: optional list of utterance ids matching scp_data
		"""
		self.scp_position = 0
		if scp_data is None and utt_ids is None:
			self.utt_ids = []
			self.scp_data = []
			with open(scp_path, "r") as fin:
				for line in fin:
					line = line.replace('\n','')
					if not line:
						continue
					utt_id, path_pos = line.split(' ')
					# split on the last colon only, so a colon inside the
					# path does not break the parsing
					path, pos = path_pos.rsplit(':', 1)
					self.utt_ids.append(utt_id)
					self.scp_data.append((path, pos))
		else:
			self.scp_data = scp_data
			self.utt_ids = utt_ids

	def read_utt_data(self, index):
		"""Read the matrix of the utterance at the given index.

		Args:
			index: index into scp_data

		Returns:
			a (rows, cols) float32 numpy matrix

		Raises:
			SystemExit: with status 1 (after printing a message) when the
				header is not a binary header or marks a compressed matrix
		"""
		with open(self.scp_data[index][0], 'rb') as ark_read_buffer:
			ark_read_buffer.seek(int(self.scp_data[index][1]),0)

			# header: one pad byte followed by four characters; bytes
			# literals compare correctly on both Python 2 and Python 3
			header = struct.unpack('<xcccc', ark_read_buffer.read(5))
			if header[0] != b"B":
				print("Input .ark file is not binary")
				raise SystemExit(1)
			if header[1] == b"C":
				print("Input .ark file is compressed")
				raise SystemExit(1)

			# each dimension is stored as a size byte followed by an int32
			_, rows = struct.unpack('<bi', ark_read_buffer.read(5))
			_, cols = struct.unpack('<bi', ark_read_buffer.read(5))

			tmp_mat = np.frombuffer(ark_read_buffer.read(rows * cols * 4), dtype=np.float32)

		return np.reshape(tmp_mat, (rows, cols))

	def read_next_utt(self):
		"""Read the next utterance and advance the position.

		Returns:
			a tuple (utterance id, matrix, looped) where looped is True when
			the position was at the end and wrapped around to the first
			utterance; (None, None, True) when the reader holds no data
		"""
		if len(self.scp_data) == 0:
			return None , None, True

		if self.scp_position >= len(self.scp_data): #if at end of file loop around
			looped = True
			self.scp_position = 0
		else:
			looped = False

		self.scp_position += 1

		return self.utt_ids[self.scp_position-1], self.read_utt_data(self.scp_position-1), looped

	def read_next_scp(self):
		"""Advance the position without reading data.

		Returns:
			the id of the utterance that was skipped
		"""
		if self.scp_position >= len(self.scp_data): #if at end of file loop around
			self.scp_position = 0

		self.scp_position += 1

		return self.utt_ids[self.scp_position-1]

	def read_previous_scp(self):
		"""Move the position one utterance back without reading data.

		Returns:
			the id of the utterance the position pointed at before the move
		"""
		if self.scp_position < 0: #if at beginning of file loop around
			self.scp_position = len(self.scp_data) - 1

		# a position equal to the number of utterances (reached right after
		# the last utterance was read) is treated as the first utterance, the
		# same wrap-around read_next_scp applies; indexing with it directly
		# would raise an IndexError
		current = self.scp_position
		if current >= len(self.utt_ids):
			current = 0

		self.scp_position -= 1

		return self.utt_ids[current]

	def read_utt(self, utt_id):
		"""Read the matrix of the utterance with the given id.

		Raises:
			ValueError: if utt_id is not in utt_ids
		"""
		return self.read_utt_data(self.utt_ids.index(utt_id))

	def split(self, num_utt):
		"""Split off the first num_utt utterances into a new reader.

		The utterances are removed from this reader; scp_position of this
		reader is left untouched.

		Args:
			num_utt: number of utterances to move to the new reader

		Returns:
			a KaldiReadIn holding the first num_utt utterances
		"""
		reader = KaldiReadIn(None, self.scp_data[0:num_utt], self.utt_ids[0:num_utt])
		self.scp_data = self.scp_data[num_utt:len(self.scp_data)]
		self.utt_ids = self.utt_ids[num_utt:len(self.utt_ids)]
		return reader
	
		
        
# Class to write numpy matrices into Kaldi .ark file and create the corresponding .scp file. 
# It only supports binary-formatted .ark files. Text and compressed .ark files are not supported.
# the inspiration of this function came from pdnn toolkit (see licence at the top of this file)(https://github.com/yajiemiao/pdnn)
class KaldiWriteOut(object):
	"""Writer for float32 matrices in binary Kaldi .ark format plus an .scp index."""

	def __init__(self, scp_path):
		"""Open (and truncate) the .scp file that will index the written data.

		Args:
			scp_path: path to the .scp file to create
		"""
		self.scp_path = scp_path
		self.scp_file_write = open(self.scp_path,"w")

	def write_next_utt(self, ark_path, utt_id, utt_mat):
		"""Append one utterance to an .ark file and index it in the .scp file.

		The .ark file is opened in append mode on every call, so data is
		added to whatever the file already contains.

		Args:
			ark_path: path to the .ark file to append to
			utt_id: id of the utterance
			utt_mat: 2-dimensional array-like with the data of the utterance,
				converted to float32 before writing

		Raises:
			ValueError: if utt_mat is not 2-dimensional
		"""
		utt_mat = np.asarray(utt_mat, dtype=np.float32)
		rows, cols = utt_mat.shape

		# the archive is binary: encode the key when it is a text string
		# (bytes and str are the same type on Python 2)
		key = utt_id if isinstance(utt_id, bytes) else utt_id.encode('utf-8')

		with open(ark_path,"ab") as ark_file_write:
			# a Kaldi archive separates the key from the object with a space;
			# the scp offset points just behind it, at the binary header
			ark_file_write.write(key + b' ')
			pos = ark_file_write.tell()
			ark_file_write.write(struct.pack('<xcccc', b'B', b'F', b'M', b' '))
			ark_file_write.write(struct.pack('<bi', 4, rows))
			ark_file_write.write(struct.pack('<bi', 4, cols))
			# tobytes() always emits row-major data, also for non-contiguous input
			ark_file_write.write(utt_mat.tobytes())

		self.scp_file_write.write('%s %s:%s\n' % (utt_id, ark_path, pos))

	def close(self):
		"""Close the .scp file."""
		self.scp_file_write.close()