Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions egs/iam/s5/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances 'queue.pl' to run.pl (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export cmd="queue.pl"
export train_cmd="queue.pl"
export decode_cmd="queue.pl --mem 4G"
1 change: 1 addition & 0 deletions egs/iam/s5/image
288 changes: 288 additions & 0 deletions egs/iam/s5/local/augment_and_make_feature_vect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
#!/usr/bin/env python
import random
import argparse
import os
import sys
import scipy.io as sio
import numpy as np
from scipy import misc
from scipy.ndimage.interpolation import affine_transform
import math
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

parser = argparse.ArgumentParser(
description="""Generates and saves the feature vectors""")
parser.add_argument(
'dir', type=str, help='directory of images.scp and is also output directory')
parser.add_argument('--seg', type=str, default='1',
help='JOB number of images.JOB.scp if run in parallel mode')
parser.add_argument('--out-ark', type=str, default='-',
help='where to write the output feature file')
parser.add_argument('--scale-size', type=int, default=40,
help='size to scale the height of all images')
parser.add_argument('--padding', type=int, default=5,
help='size to scale the height of all images')
parser.add_argument('--vertical-shift', type=int, default=10,
help='total number of padding pixel per column')
args = parser.parse_args()


def write_kaldi_matrix(file_handle, matrix, key):
file_handle.write(key + " [ ")
num_rows = len(matrix)
if num_rows == 0:
raise Exception("Matrix is empty")
num_cols = len(matrix[0])

for row_index in range(len(matrix)):
if num_cols != len(matrix[row_index]):
raise Exception("All the rows of a matrix are expected to "
"have the same length")
file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index])))
if row_index != num_rows - 1:
file_handle.write("\n")
file_handle.write(" ]\n")


def get_scaled_image(im):
scale_size = args.scale_size
sx = im.shape[1] # width
sy = im.shape[0] # height
scale = (1.0 * scale_size) / sy
nx = int(scale_size)
ny = int(scale * sx)
im = misc.imresize(im, (nx, ny))
padding_x = max(5, int((args.padding / 100) * im.shape[1]))
padding_y = im.shape[0]
im_pad = np.concatenate(
(255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1)
im_pad1 = np.concatenate(
(im_pad, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1)
return im_pad1


def contrast_normalization(im, low_pct, high_pct):
element_number = im.size
rows = im.shape[0]
cols = im.shape[1]
im_contrast = np.zeros(shape=im.shape)
low_index = int(low_pct * element_number)
high_index = int(high_pct * element_number)
sorted_im = np.sort(im, axis=None)
low_thred = sorted_im[low_index]
high_thred = sorted_im[high_index]
for i in range(rows):
for j in range(cols):
if im[i, j] > high_thred:
im_contrast[i, j] = 255 # lightest to white
elif im[i, j] < low_thred:
im_contrast[i, j] = 0 # darkest to black
else:
# linear normalization
im_contrast[i, j] = (im[i, j] - low_thred) * \
255 / (high_thred - low_thred)
return im_contrast


def geometric_moment(frame, p, q):
m = 0
for i in range(frame.shape[1]):
for j in range(frame.shape[0]):
m += (i ** p) * (j ** q) * frame[i][i]
return m


def central_moment(frame, p, q):
u = 0
x_bar = geometric_moment(frame, 1, 0) / \
geometric_moment(frame, 0, 0) # m10/m00
y_bar = geometric_moment(frame, 0, 1) / \
geometric_moment(frame, 0, 0) # m01/m00
for i in range(frame.shape[1]):
for j in range(frame.shape[0]):
u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j]
return u


def height_normalization(frame, w, h):
frame_normalized = np.zeros(shape=(h, w))
alpha = 4
x_bar = geometric_moment(frame, 1, 0) / \
geometric_moment(frame, 0, 0) # m10/m00
y_bar = geometric_moment(frame, 0, 1) / \
geometric_moment(frame, 0, 0) # m01/m00
sigma_x = (alpha * ((central_moment(frame, 2, 0) /
geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00)
sigma_y = (alpha * ((central_moment(frame, 0, 2) /
geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00)
for x in range(w):
for y in range(h):
i = int((x / w - 0.5) * sigma_x + x_bar)
j = int((y / h - 0.5) * sigma_y + y_bar)
frame_normalized[x][y] = frame[i][j]
return frame_normalized


def find_slant(im):
rows = im.shape[0]
cols = im.shape[1]
sum_max = 0
slant_degree = 0
for shear_degree in range(-45, 45, 5):
sum = 0
shear_rad = shear_degree / 360.0 * 2 * math.pi
shear_matrix = np.array([[1, 0],
[np.tan(shear_rad), 1]])
sheared_im = affine_transform(im, shear_matrix, cval=255.0)
for j in range(cols):
foreground = (sheared_im[:, j] < 100)
number = np.sum(foreground)
# print(number)
if number != 0:
start_point = -1
end_point = -1
start_point = 0
for i in range(rows):
if foreground[i] == 1:
start_point = i
break
for i in range(rows - 1, -1, -1):
if foreground[i] == 1:
end_point = i
break
length = end_point - start_point + 1
#print(number, length)
if length == number:
sum = sum + number * number
#print(shear_degree, sum)
if sum > sum_max:
sum_max = sum
slant_degree = shear_degree
return slant_degree


def deslant(im, shear):
padding_x = int(abs(np.tan(shear)) * im.shape[0])
padding_y = im.shape[0]
if shear > 0:
im_pad = np.concatenate(
(255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1)
else:
im_pad = np.concatenate(
(im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1)

shear_matrix = np.array([[1, 0],
[np.tan(shear), 1]])
# sheared_im = affine_transform(image, shear_matrix, output_shape=(
# im.shape[0], im.shape[1] + abs(int(im.shape[0] * np.tan(shear)))), cval=128.0)
sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0)
return sheared_im


def vertical_shift(im, mode='mid'):
total = args.vertical_shift
if mode == 'mid':
top = total / 2
bottom = total - top
elif mode == 'top': # more padding on top
top = random.randint(total / 2, total)
bottom = total - top
elif mode == 'bottom': # more padding on bottom
top = random.randint(0, total / 2)
bottom = total - top
width = im.shape[1]
im_pad = np.concatenate(
(255 * np.ones((top, width), dtype=int), im), axis=0)
im_pad = np.concatenate(
(im_pad, 255 * np.ones((bottom, width), dtype=int)), axis=0)
return im_pad


def image_augment(im, out_fh, image_id):
random.seed(1)
shift_setting = ['mid', 'top', 'bottom']
image_shift_id = []
for i in range(3):
image_shift_id.append(image_id + '_shift' + str(i + 1))
im_shift = vertical_shift(im, shift_setting[i])
im_scaled = get_scaled_image(im_shift)
data = np.transpose(im_scaled, (1, 0))
data = np.divide(data, 255.0)
new_scp_list.append(image_id + '_shift' + str(i + 1))
write_kaldi_matrix(out_fh, data, image_shift_id[i])


# main #
new_scp_list = list()
text_file = os.path.join(args.dir, 'backup', 'text')
text_dict = dict() # stores imageID and text

with open(text_file) as text_fh:
for uttID_text in text_fh:
uttID_text = uttID_text.strip()
uttID_text_vect = uttID_text.split(" ")
uttID = uttID_text_vect[0]
imageID = uttID.split("_")[1]
text_vect = uttID_text_vect[1:]
text = " ".join(text_vect)
text_dict[imageID] = text

utt2spk_file = os.path.join(args.dir, 'backup', 'utt2spk')
uttID_spk_dict = dict() # stores imageID and speaker

with open(utt2spk_file) as utt2spk_fh:
for uttID_spk in utt2spk_fh:
uttID_spk = uttID_spk.strip()
uttID_spk_vect = uttID_spk.split(" ")
uttID = uttID_spk_vect[0]
imageID = uttID.split("_")[1]
spk = uttID_spk_vect[1]
uttID_spk_dict[imageID] = spk

image_file = os.path.join(args.dir, 'backup', 'images.scp')
uttID_path_dict = dict() # stores imageID and image path

with open(image_file) as image_fh:
for uttID_path in image_fh:
uttID_path = uttID_path.strip()
uttID_path_vect = uttID_path.split(" ")
uttID = uttID_path_vect[0]
imageID = uttID.split("_")[1]
path = uttID_path_vect[1]
uttID_path_dict[imageID] = path

scp_name = 'images.scp'
data_list_path = os.path.join(args.dir, 'backup', scp_name)

if args.out_ark == '-':
out_fh = sys.stdout
else:
out_fh = open(args.out_ark, 'wb')

text_file = os.path.join(args.dir, 'text')
text_fh = open(text_file, 'w+')

utt2spk_file = os.path.join(args.dir, 'utt2spk')
utt2spk_fh = open(utt2spk_file, 'w+')

image_file = os.path.join(args.dir, 'images.scp')
image_fh = open(image_file, 'w+')

with open(data_list_path) as f:
for line in f:
line = line.strip()
line_vect = line.split(' ')
image_id = line_vect[0]
image_path = line_vect[1]
im = misc.imread(image_path)
#im_contrast = contrast_normalization(im, 0.05, 0.2)
#shear = (find_slant(im_contrast) / 360.0) * 2 * math.pi
im_scaled = get_scaled_image(im)
image_augment(im_scaled, out_fh, image_id)

for uttID in new_scp_list:
imageID = uttID.split("_")[1]
text_fh.write(uttID + ' ' + text_dict[imageID] + '\n')
utt2spk_fh.write(uttID + ' ' + uttID_spk_dict[imageID] + '\n')
image_fh.write(uttID + ' ' + uttID_path_dict[imageID] + '\n')
Loading