From 5fe6cb224592cfd9590f6bd20ba3c5c2114a510c Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 14 Sep 2018 17:05:34 -0400 Subject: [PATCH 01/67] minor change --- .../v1/local/chain/run_cnn_e2eali_1b.sh | 2 +- .../v1/local/chain/run_flatstart_cnn1a.sh | 2 +- egs/madcat_ar/v1/local/extract_features.sh | 4 + egs/madcat_ar/v1/local/process_data.py | 72 +-- egs/madcat_ar/v1/local/tl/augment_data.sh | 34 ++ ...eate_line_image_from_page_image.py.augment | 528 ++++++++++++++++++ .../v1/local/tl/imp/make_features.py | 170 ++++++ egs/madcat_ar/v1/local/tl/imp/process_data.py | 215 +++++++ .../v1/local/tl/imp/process_waldo_data.py | 62 ++ .../v1/local/tl/not_much_imp/prepare_data.sh | 49 ++ .../tl/not_much_imp/run_cnn_e2eali_1b.sh | 246 ++++++++ .../v1/local/tl/not_much_imp/run_end2end.sh | 125 +++++ .../tl/not_much_imp/run_flatstart_cnn1a.sh | 168 ++++++ .../v1/local/tl/not_much_imp/score.sh | 6 + .../v1/local/tl/run_textlocalization.sh | 128 +++++ egs/madcat_ar/v1/run_end2end.sh | 5 +- egs/wsj/s5/utils/lang/make_lexicon_fst.py | 2 +- 17 files changed, 1770 insertions(+), 48 deletions(-) create mode 100755 egs/madcat_ar/v1/local/tl/augment_data.sh create mode 100755 egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment create mode 100755 egs/madcat_ar/v1/local/tl/imp/make_features.py create mode 100755 egs/madcat_ar/v1/local/tl/imp/process_data.py create mode 100755 egs/madcat_ar/v1/local/tl/imp/process_waldo_data.py create mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/prepare_data.sh create mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh create mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh create mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh create mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/score.sh create mode 100755 egs/madcat_ar/v1/local/tl/run_textlocalization.sh diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh index 75c246f5ffe..55df0cad4b7 100755 --- a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh @@ -193,7 +193,7 @@ if [ $stage -le 5 ]; then --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh index 2c85e982ce6..4eea10a8441 100755 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh @@ -33,7 +33,7 @@ num_jobs_final=16 minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 common_egs_dir= l2_regularize=0.00005 -frames_per_iter=1000000 +frames_per_iter=2000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train lang_test=lang_test diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 70c5498626c..56a8443e328 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -1,7 +1,11 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. 
+ nj=4 cmd=run.pl feat_dim=40 diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index b57500cf2fa..920cb6f700b 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -42,6 +42,8 @@ help='Path to the downloaded (and extracted) writing conditions file 2') parser.add_argument('writing_condition3', type=str, help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -97,50 +99,40 @@ def check_writing_condition(wc_dict): Returns: (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True - return True - -def get_word_line_mapping(madcat_file_path): +def read_text(madcat_file_path): """ Maps every word in the page image to a corresponding line. Args: - madcat_file_path (string): complete path and name of the madcat xml file + madcat_file_path (string): complete path and name of the madcat xml file corresponding to the page image. Returns: + dict: Mapping every word in the page image to a corresponding line. """ + + word_line_dict = dict() doc = minidom.parse(madcat_file_path) zone = doc.getElementsByTagName('zone') for node in zone: line_id = node.getAttribute('id') - line_word_dict[line_id] = list() word_image = node.getElementsByTagName('token-image') for tnode in word_image: word_id = tnode.getAttribute('id') - line_word_dict[line_id].append(word_id) word_line_dict[word_id] = line_id - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. 
- """ text_line_word_dict = dict() - doc = minidom.parse(madcat_file_path) segment = doc.getElementsByTagName('segment') for node in segment: token = node.getElementsByTagName('token') for tnode in token: ref_word_id = tnode.getAttribute('ref_id') word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - word = unicodedata.normalize('NFKC',word) ref_line_id = word_line_dict[ref_word_id] if ref_line_id not in text_line_word_dict: text_line_word_dict[ref_line_id] = list() @@ -160,7 +152,6 @@ def get_line_image_location(): ### main ### - print("Processing '{}' data...".format(args.out_dir)) text_file = os.path.join(args.out_dir, 'text') @@ -188,24 +179,19 @@ def get_line_image_location(): madcat_xml_path, image_file_path, wc_dict = check_file_location() if wc_dict is None or not check_writing_condition(wc_dict): continue - if madcat_xml_path is not None: - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - line_word_dict = dict() - word_line_dict = dict() - get_word_line_mapping(madcat_xml_path) - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path) - base_name, b = base_name.split('.tif') - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for lineID in sorted(text_line_word_dict): + updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment b/egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment new file mode 100755 index 00000000000..da2b0f0a62f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment @@ -0,0 +1,528 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. +""" + +import sys +import argparse +import os +import xml.dom.minidom as minidom +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple +import random +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage +parser = argparse.ArgumentParser(description="Creates line images from page image", + epilog="E.g. 
" + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded madcat data directory 1') +parser.add_argument('database_path2', type=str, + help='Path to the downloaded madcat data directory 2') +parser.add_argument('database_path3', type=str, + help='Path to the downloaded madcat data directory 3') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files') +parser.add_argument('writing_condition1', type=str, + help='Path to the downloaded (and extracted) writing conditions file 1') +parser.add_argument('writing_condition2', type=str, + help='Path to the downloaded (and extracted) writing conditions file 2') +parser.add_argument('writing_condition3', type=str, + help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument('--padding', type=int, default=400, + help='padding across horizontal/verticle direction') +parser.add_argument('--pixel-scaling', type=int, default=30, + help='padding across horizontal/verticle direction') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. + Returns + ------- + (float, float): unit vector + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0]) / dis_0_to_1, \ + (pt1[1] - pt0[1]) / dis_0_to_1 + + +def orthogonal_vector(vector): + """ Given a vector, returns a orthogonal/perpendicular vector of equal length. + Returns + ------ + (float, float): A vector that points in the direction orthogonal to vector. + """ + return -1 * vector[1], vector[0] + + +def bounding_area(index, hull): + """ Given index location in an array and convex hull, it gets two points + hull[index] and hull[index+1]. From these two points, it returns a named + tuple that mainly contains area of the box that bounds the hull. This + bounding box orintation is same as the orientation of the lines formed + by the point hull[index] and hull[index+1]. + Returns + ------- + a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
+ (it's orthogonal vector can be found with the orthogonal_vector function) + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'unit_vector': unit_vector_p, + } + + +def to_xy_coordinates(unit_vector_angle, point): + """ Given angle from horizontal axis and a point from origin, + returns converted unit vector coordinates in x, y coordinates. + angle of unit vector should be in radians. + Returns + ------ + (float, float): converted x,y coordinate of the unit vector. + """ + angle_orthogonal = unit_vector_angle + pi / 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + input + ----- + center_of_rotation (float, float): angle of unit vector to be in radians. + angle (float): angle of rotation to be in radians. + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Returns + ------ + [(float, float)]: Rotated points around center of rotation by angle + """ + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination, returns the corner + locations of the rectangle. + Returns + ------ + [(float, float)]: 4 corner points of rectangle. + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
RADIANS + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle)) + ) + + +def get_center(im): + """ Given image, returns the location of center pixel + Returns + ------- + (int, int): center of the image + """ + center_x = im.size[0] / 2 + center_y = im.size[1] / 2 + return int(center_x), int(center_y) + + +def get_horizontal_angle(unit_vector_angle): + """ Given an angle in radians, returns angle of the unit vector in + first or fourth quadrant. + Returns + ------ + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + """ + if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + + +def get_smaller_angle(bounding_box): + """ Given a rectangle, returns its smallest absolute angle from horizontal axis. + Returns + ------ + (float): smallest angle of the rectangle to be in radians. + """ + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + + +def rotated_points(bounding_box, center): + """ Given the rectangle, returns corner points of rotated rectangle. + It rotates the rectangle around the center by its smallest angle. + Returns + ------- + [(int, int)]: 4 corner points of rectangle. 
+ """ + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + + +def pad_image(image): + """ Given an image, returns a padded image around the border. + This routine save the code from crashing if bounding boxes that are + slightly outside the page boundary. + Returns + ------- + image: page image + """ + offset = int(args.padding // 2) + padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") + padded_image.paste(im = image, box = (offset, offset)) + return padded_image + + +def update_minimum_bounding_box_input(bounding_box_input): + """ Given list of 2D points, returns list of 2D points shifted by an offset. + Returns + ------ + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + updated_minimum_bounding_box_input = [] + offset = int(args.padding // 2) + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + + +def dilate_polygon(points, amount_increase): + """ Increases size of polygon given as a list of tuples. Assumes points in polygon are given in CCW + """ + expanded_points = [] + for index, point in enumerate(points): + prev_point = points[(index - 1) % len(points)] + next_point = points[(index + 1) % len(points)] + prev_edge = np.subtract(point, prev_point) + next_edge = np.subtract(next_point, point) + + prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) + prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) + next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) + next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) + + bisect = np.add(prev_normal, next_normal) + bisect = np.divide(bisect, np.linalg.norm(bisect)) + + cos_theta = np.dot(next_normal, bisect) + hyp = amount_increase / cos_theta + + new_point = np.around(point + hyp * bisect) + new_point = new_point.astype(int) + new_point = new_point.tolist() + new_point = tuple(new_point) + expanded_points.append(new_point) + return expanded_points + + +def set_line_image_data(image, line_id, image_file_name, image_fh): + """ Given an image, saves a flipped line image. Line image file name + is formed by appending the line id at the end page image name. 
+ """ + + base_name = os.path.splitext(os.path.basename(image_file_name))[0] + line_id = '_' + line_id.zfill(4) + line_image_file_name = base_name + line_id + '.png' + image_path = os.path.join(args.out_dir, line_image_file_name) + imgray = image.convert('L') + imgray_rev_arr = np.fliplr(imgray) + imgray_rev = toimage(imgray_rev_arr) + imgray_rev.save(image_path) + image_fh.write(image_path + '\n') + + +def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh): + """ Given a page image, extracts the line images from it. + Input + ----- + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + im_wo_pad = Image.open(image_file_name) + im = pad_image(im_wo_pad) + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + id = node.getAttribute('id') + token_image = node.getElementsByTagName('token-image') + minimum_bounding_box_input = [] + for token_node in token_image: + word_point = token_node.getElementsByTagName('point') + for word_node in word_point: + word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) + minimum_bounding_box_input.append(word_coordinate) + updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) + points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + + +def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): + """ Returns the complete path of the page image and corresponding + xml file. + Returns + ------- + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. 
+ """ + madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') + madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') + madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') + image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') + image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + if os.path.exists(madcat_file_path2): + return madcat_file_path2, image_file_path2, wc_dict2 + + if os.path.exists(madcat_file_path3): + return madcat_file_path3, image_file_path3, wc_dict3 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Given writing condition file path, returns a dictionary which have writing condition + of each page image. + Returns + ------ + (dict): dictionary with key as page image name and value as writing condition. + """ + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict, base_name): + """ Given writing condition dictionary, checks if a page image is writing + in a specifed writing condition. + It is used to create subset of dataset based on writing condition. + Returns + (bool): True if writing condition matches. + """ + #return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +### main ### + +def main(): + + wc_dict1 = parse_writing_conditions(args.writing_condition1) + wc_dict2 = parse_writing_conditions(args.writing_condition2) + wc_dict3 = parse_writing_conditions(args.writing_condition3) + output_directory = args.out_dir + image_file = os.path.join(output_directory, 'images.scp') + image_fh = open(image_file, 'w', encoding='utf-8') + + splits_handle = open(args.data_splits, 'r') + splits_data = splits_handle.read().strip().split('\n') + prev_base_name = '' + for line in splits_data: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) + if wc_dict is None or not check_writing_condition(wc_dict, base_name): + continue + if madcat_file_path is not None: + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + + +if __name__ == '__main__': + main() + diff --git a/egs/madcat_ar/v1/local/tl/imp/make_features.py b/egs/madcat_ar/v1/local/tl/imp/make_features.py new file mode 100755 index 00000000000..e9d10ecc87e --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/imp/make_features.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). 
It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + + eg. local/make_features.py data/train --feat-dim 40 +""" +import random +import argparse +import os +import sys +import numpy as np +from scipy import misc +import math + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +parser.add_argument('--vertical-shift', type=int, default=16, + help='total number of padding pixel per column') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] # width + sy = im.shape[0] # height + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + return im + + +def horizontal_pad(im, allowed_lengths = None): + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] # width + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = int(padding // 2) + right_padding = padding - left_padding + dim_y = im.shape[0] # height + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +def vertical_shift(im, mode='mid'): + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'mid': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + 
im_pad = np.concatenate(
+        (255 * np.ones((top, width), dtype=int) -
+         np.random.normal(2, 1, (top, width)).astype(int), im), axis=0)
+    im_pad = np.concatenate(
+        (im_pad, 255 * np.ones((bottom, width), dtype=int) -
+         np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0)
+    return im_pad
+
+### main ###
+random.seed(1)
+data_list_path = args.images_scp_path
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    out_fh = open(args.out_ark,'w')
+
+allowed_lengths = None
+allowed_len_handle = args.allowed_len_file_path
+if os.path.isfile(allowed_len_handle):
+    print("Found 'allowed_lengths.txt' file...", file=sys.stderr)
+    allowed_lengths = []
+    with open(allowed_len_handle) as f:
+        for line in f:
+            allowed_lengths.append(int(line.strip()))
+    print("Read {} allowed lengths and will apply them to the "
+          "features.".format(len(allowed_lengths)), file=sys.stderr)
+
+num_fail = 0
+num_ok = 0
+aug_setting = ['mid', 'notmid']
+with open(data_list_path) as f:
+    for line in f:
+        line = line.strip()
+        line_vect = line.split(' ')
+        image_id = line_vect[0]
+        image_path = line_vect[1]
+        im = misc.imread(image_path)
+        im_scaled = get_scaled_image(im)
+        im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths)
+        if im_horizontal_padded is None:
+            num_fail += 1
+            continue
+        if args.augment:
+            im_shift = vertical_shift(im_horizontal_padded, aug_setting[1])
+        else:
+            im_shift = vertical_shift(im_horizontal_padded, aug_setting[0])
+        data = np.transpose(im_shift, (1, 0))
+        data = np.divide(data, 255.0)
+        num_ok += 1
+        write_kaldi_matrix(out_fh, data, image_id)
+
+print('Generated features for {} images. Failed for {} (image too '
+      'long).'.format(num_ok, num_fail), file=sys.stderr)
diff --git a/egs/madcat_ar/v1/local/tl/imp/process_data.py b/egs/madcat_ar/v1/local/tl/imp/process_data.py
new file mode 100755
index 00000000000..c21beb1be70
--- /dev/null
+++ b/egs/madcat_ar/v1/local/tl/imp/process_data.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+# Copyright 2018 Ashish Arora
+
+""" This script reads MADCAT files and creates the following files (for the
+    data subset selected via --dataset) :text, utt2spk, images.scp.
+    Eg. local/process_data.py data/local /export/corpora/LDC/LDC2012T15 /export/corpora/LDC/LDC2013T09
+    /export/corpora/LDC/LDC2013T15 data/download/data_splits/madcat.train.raw.lineid
+    data/dev data/local/lines/images.scp
+    Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 وجه وعقل غارق حتّى النخاع
+    utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
+    images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0
+    data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif
+"""
+
+import argparse
+import os
+import sys
+import xml.dom.minidom as minidom
+import unicodedata
+
+parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files",
+                                 epilog="E.g. 
" + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('database_path2', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('database_path3', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +parser.add_argument('images_scp_path', type=str, + help='Path of input images.scp file(maps line image and location)') +parser.add_argument('writing_condition1', type=str, + help='Path to the downloaded (and extracted) writing conditions file 1') +parser.add_argument('writing_condition2', type=str, + help='Path to the downloaded (and extracted) writing conditions file 2') +parser.add_argument('writing_condition3', type=str, + help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") +args = parser.parse_args() + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') + madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') + madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') + image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') + image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + if os.path.exists(madcat_file_path2): + return madcat_file_path2, image_file_path2, wc_dict2 + + if os.path.exists(madcat_file_path3): + return madcat_file_path3, image_file_path3, wc_dict3 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + Returns: + (dict): dictionary with key as page image name and value as writing condition. + """ + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. + Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. 
+    Returns:
+        (bool): True if writing condition matches.
+    """
+    if not args.subset:
+        return True
+    writing_condition = wc_dict[base_name].strip()
+    if writing_condition != 'IUC':
+        return False
+    return True
+
+
+def read_text(madcat_file_path):
+    """ Maps every word in the page image to a corresponding line.
+    Args:
+        madcat_file_path (string): complete path and name of the madcat xml file
+                                   corresponding to the page image.
+    Returns:
+        dict: Mapping every word in the page image to a corresponding line.
+    """
+
+    word_line_dict = dict()
+    doc = minidom.parse(madcat_file_path)
+    zone = doc.getElementsByTagName('zone')
+    for node in zone:
+        line_id = node.getAttribute('id')
+        word_image = node.getElementsByTagName('token-image')
+        for tnode in word_image:
+            word_id = tnode.getAttribute('id')
+            word_line_dict[word_id] = line_id
+
+    text_line_word_dict = dict()
+    segment = doc.getElementsByTagName('segment')
+    for node in segment:
+        token = node.getElementsByTagName('token')
+        for tnode in token:
+            ref_word_id = tnode.getAttribute('ref_id')
+            word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue
+            ref_line_id = word_line_dict[ref_word_id]
+            if ref_line_id not in text_line_word_dict:
+                text_line_word_dict[ref_line_id] = list()
+            text_line_word_dict[ref_line_id].append(word)
+    return text_line_word_dict
+
+
+def get_line_image_location():
+    image_loc_dict = dict() # Stores image base name and location
+    image_loc_vect = input_image_fh.read().strip().split("\n")
+    for line in image_loc_vect:
+        base_name = os.path.basename(line)
+        location_vect = line.split('/')
+        location = "/".join(location_vect[:-1])
+        image_loc_dict[base_name]=location
+    return image_loc_dict
+
+
+### main ###
+print("Processing '{}' data...".format(args.out_dir))
+
+text_file = os.path.join(args.out_dir, 'text')
+text_fh = open(text_file, 'w', encoding='utf-8')
+utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
+utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8')
+image_file = os.path.join(args.out_dir, 'images.scp')
+image_fh = open(image_file, 'w', encoding='utf-8')
+
+input_image_file = args.images_scp_path
+input_image_fh = open(input_image_file, 'r', encoding='utf-8')
+
+wc_dict1 = parse_writing_conditions(args.writing_condition1)
+wc_dict2 = parse_writing_conditions(args.writing_condition2)
+wc_dict3 = parse_writing_conditions(args.writing_condition3)
+image_loc_dict = get_line_image_location()
+
+image_num = 0
+with open(args.data_splits) as f:
+    prev_base_name = ''
+    for line in f:
+        base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0]
+        if prev_base_name != base_name:
+            prev_base_name = base_name
+            madcat_xml_path, image_file_path, wc_dict = check_file_location()
+            if wc_dict is None or not check_writing_condition(wc_dict):
+                continue
+            madcat_doc = minidom.parse(madcat_xml_path)
+            writer = madcat_doc.getElementsByTagName('writer')
+            writer_id = writer[0].getAttribute('id')
+            text_line_word_dict = read_text(madcat_xml_path)
+            base_name = os.path.basename(image_file_path).split('.tif')[0]
+            for line_id in sorted(text_line_word_dict):
+                if args.augment:
+                    key = (line_id + '.')[:-1]
+                    for i in range(0, 3):
+                        location_id = '_' + line_id + '_scale' + str(i)
+                        line_image_file_name = base_name + location_id + '.png'
+                        location = image_loc_dict[line_image_file_name]
+                        image_file_path = os.path.join(location, line_image_file_name)
+                        line = text_line_word_dict[key]
+                        text = ' '.join(line)
+                        base_line_image_file_name = line_image_file_name.split('.png')[0]
+                        utt_id = writer_id + '_' + 
str(image_num).zfill(6) + '_' + base_line_image_file_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 + else: + updated_base_name = base_name + '_' + str(line_id).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[line_id] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(line_id).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/tl/imp/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/imp/process_waldo_data.py new file mode 100755 index 00000000000..df8b6c5149f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/imp/process_waldo_data.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('image_transcription_file', type=str, + help='Path to the file containing line image path and transcription information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +args = parser.parse_args() + + +def read_image_text(image_text_path): + """ Given the file path containing, mapping information of line image + and transcription, it returns a dict. The dict contains this mapping + info. It can be accessed via line_id and will provide transcription. 
+ Returns: + -------- + dict: line_id and transcription mapping + """ + image_transcription_dict = dict() + with open(image_text_path, encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + image_path = line_vect[0] + line_id = os.path.basename(image_path).split('.png')[0] + transcription = line_vect[1:] + #transcription = " ".join(transcription) + #image_transcription_dict[line_id] = transcription + joined_transcription = list() + for word in transcription: + joined_transcription.append(word) + joined_transcription = " ".join(joined_transcription) + image_transcription_dict[line_id] = joined_transcription + return image_transcription_dict + + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +image_transcription_dict = read_image_text(args.image_transcription_file) +for line_id in image_transcription_dict: + writer_id = line_id.strip().split('_')[-3] + updated_line_id = line_id + '.png' + image_file_path = os.path.join('lines', updated_line_id) + text = image_transcription_dict[line_id] + utt_id = line_id + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/prepare_data.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/prepare_data.sh new file mode 100755 index 00000000000..5fe41e7cf4c --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/prepare_data.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script prepares the training and test data for MADCAT Arabic dataset +# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. + +# Eg. local/prepare_data.sh +# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ +# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 +# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 +# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif + +stage=0 +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits +images_scp_dir=data/local + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +mkdir -p data/{train,test,dev} + +if [ $stage -le 1 ]; then + echo "$0: Processing dev, train and test data...$(date)" + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 + + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 + + local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + + for dataset in dev test train; do + echo "$0: Fixing data directory for dataset: $dataset." + image/fix_data_dir.sh data/$dataset + done +fi diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..f44b12667e9 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1b +# WER 10.78 +# CER 2.99 +# Final train prob -0.0587 +# Final valid prob -0.0609 +# Final train prob (xent) -0.4471 +# Final valid prob (xent) -0.4653 +# Parameters 3.37M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +#exp/chain/cnn_e2eali_1b: num-iters=179 nj=8..16 num-params=3.4M dim=40->416 combine=-0.058->-0.058 (over 3) xent:train/valid[118,178,final]=(-0.463,-0.445,-0.447/-0.477,-0.462,-0.465) logprob:train/valid[118,178,final]=(-0.062,-0.059,-0.059/-0.063,-0.061,-0.061) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh new file mode 100755 index 00000000000..126d1d56c8f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora +set -e +stage=0 +nj=30 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/LDC2012T15, +# https://catalog.ldc.upenn.edu/LDC2013T09/, +# https://catalog.ldc.upenn.edu/LDC2013T15/. 
+download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} + +if [ $stage -le 0 ]; then + echo "$0: Downloading data splits..." + echo "Date: $(date)." + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 +fi + +if [ $stage -le 1 ]; then + for dataset in dev train; do + data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --images_scp_dir data/local \ + --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 +fi + +if [ $stage -le 3 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + echo "$0: Obtaining image groups. calling get_allowed_lengths" + echo "Date: $(date)." + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train +fi + +if [ $stage -le 4 ]; then + # for dataset in train dev; do + # echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " + # echo "Date: $(date)." + # local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset + # steps/compute_cmvn_stats.sh data/$dataset || exit 1; + # done + # echo "$0: Fixing data directory for train dataset $(date)." + # utils/fix_data_dir.sh data/train + + local/make_features.py data/test/images.scp --feat-dim 40 \ + --allowed_len_file_path data/test/allowed_lengths.txt --no-augment | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/test/data/images.ark,data/test/feats.scp +fi + +if [ $stage -le 5 ]; then + echo "$0: Preparing dictionary and lang..." 
+ cut -d' ' -f2- data/train/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + for set in test train dev ; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c data/train/bpe.out | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 7 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj +fi + +if [ $stage -le 8 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 9 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj --stage 2 +fi diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..4893dcfea08 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.71 +# CER 2.85 +# Final train prob -0.0859 +# Final valid prob -0.1266 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 2.94M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) + +set -e + + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=2 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
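
An illustrative aside on the xconfig above (not part of the recipe): each conv-relu-batchnorm-layer's height-in must equal the previous layer's height-out, and the height halves whenever height-subsample-out=2, taking the 56-pixel input down to 28 and then 14. A minimal Python sketch of that bookkeeping:

    # Purely illustrative: track the height dimension through cnn1..cnn7 as
    # declared in the xconfig above; the entries of 2 are the two layers that
    # set height-subsample-out=2.
    height = 56
    for subsample in (1, 2, 1, 1, 2, 1, 1):
        height //= subsample
        print(height)   # 56, 28, 28, 28, 14, 14, 14
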
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/score.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh new file mode 100755 index 00000000000..5d27476d3e1 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora +set -e +stage=0 +nj=70 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/LDC2012T15, +# https://catalog.ldc.upenn.edu/LDC2013T09/, +# https://catalog.ldc.upenn.edu/LDC2013T15/. +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. 
./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} + +if [ $stage -le 0 ]; then + echo "$0: Downloading data splits..." + echo "Date: $(date)." + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 +fi + +if [ $stage -le 1 ]; then + for dataset in test train dev; do + data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --images_scp_dir data/local \ + --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 +fi + +if [ $stage -le 3 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + echo "$0: Obtaining image groups. calling get_allowed_lengths" + echo "Date: $(date)." + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train +fi + +if [ $stage -le 4 ]; then + for dataset in test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " + echo "Date: $(date)." + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset || exit 1; + done + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 5 ]; then + echo "$0: Preparing dictionary and lang..." + cut -d' ' -f2- data/train/text | local/reverse.py | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | local/reverse.py | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$0: Estimating a language model for decoding..." 
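
The BPE preparation in stage 5 above reverses each line for the right-to-left script (local/reverse.py, details omitted here), prepends '|' to every word so word boundaries can be recovered after decoding, learns and applies byte-pair encoding, and strips the '@@' continuation markers. A toy sketch of that text flow, with placeholder Latin words standing in for an Arabic transcript and a hand-picked example segmentation standing in for apply_bpe.py:

    # Toy illustration of the stage-5 pipe on one transcript line; this stands
    # in for prepend_words.py / apply_bpe.py rather than reproducing them.
    line = "abc defgh"
    marked = " ".join("|" + w for w in line.split())   # prepend_words.py: '|abc |defgh'
    bpe = "|abc |de@@ fgh"          # apply_bpe.py might split a rare word into subwords
    no_marker = bpe.replace("@@", "")                  # sed 's/@@//g' -> '|abc |de fgh'
    print(no_marker)   # the '|' symbols later let decoded subword units be re-joined into words
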
+ local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 7 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj +fi + +if [ $stage -le 8 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 9 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj +fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 5d27476d3e1..95af220fd3e 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -81,12 +81,13 @@ fi if [ $stage -le 5 ]; then echo "$0: Preparing dictionary and lang..." cut -d' ' -f2- data/train/text | local/reverse.py | \ - local/prepend_words.py | \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | + utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text diff --git a/egs/wsj/s5/utils/lang/make_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_lexicon_fst.py index 67ed0ac2789..89c50b2f069 100755 --- a/egs/wsj/s5/utils/lang/make_lexicon_fst.py +++ b/egs/wsj/s5/utils/lang/make_lexicon_fst.py @@ -72,7 +72,7 @@ def read_lexiconp(filename): with open(filename, 'r', encoding='latin-1') as f: whitespace = re.compile("[ \t]+") for line in f: - a = whitespace.split(line.strip()) + a = whitespace.split(line.rstrip('\n')) if len(a) < 2: print("{0}: error: found bad line '{1}' in lexicon file {2} ".format( sys.argv[0], line.strip(), filename), file=sys.stderr) From c3443d28ec1b0629dddb2ab02e276d981f451ff6 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 14 Sep 2018 20:56:21 -0400 Subject: [PATCH 02/67] updating run_end2end for text localization --- egs/madcat_ar/v1/local/score.sh | 4 +- .../v1/local/tl/not_much_imp/run_end2end.sh | 96 +++++++++---------- .../v1/local/tl/not_much_imp/score.sh | 6 -- egs/madcat_ar/v1/run_end2end.sh | 71 ++++++-------- 4 files changed, 77 insertions(+), 100 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/score.sh diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh index 2c11aba3e13..31564d25326 100755 --- a/egs/madcat_ar/v1/local/score.sh +++ b/egs/madcat_ar/v1/local/score.sh @@ -1,5 +1,5 @@ #!/bin/bash -steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh 
b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh index 126d1d56c8f..e21bf7b73dc 100755 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh @@ -7,9 +7,7 @@ nj=30 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -23,18 +21,22 @@ data_splits_dir=data/download/data_splits . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. +./local/check_tools.sh + mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)." local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in dev train; do + for dataset in train dev; do data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ @@ -42,9 +44,7 @@ if [ $stage -le 1 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ --data data/local/$dataset done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --images_scp_dir data/local \ @@ -52,74 +52,70 @@ if [ $stage -le 2 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train -fi -if [ $stage -le 4 ]; then - # for dataset in train dev; do - # echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - # echo "Date: $(date)." - # local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - # steps/compute_cmvn_stats.sh data/$dataset || exit 1; - # done - # echo "$0: Fixing data directory for train dataset $(date)." 
- # utils/fix_data_dir.sh data/train + for dataset in dev train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + utils/fix_data_dir.sh data/train local/make_features.py data/test/images.scp --feat-dim 40 \ - --allowed_len_file_path data/test/allowed_lengths.txt --no-augment | \ + --allowed_len_file_path data/test/allowed_lengths.txt | \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:data/test/data/images.ark,data/test/feats.scp fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." - cut -d' ' -f2- data/train/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out - for set in test train dev ; do +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | local/reverse.py | \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c data/train/bpe.out | sed 's/@@//g' > data/$set/bpe_text + cut -d' ' -f2- data/$set/text | local/reverse.py | \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | + utils/lang/bpe/apply_bpe.py -c data/local/bpe.out \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." +if [ $stage -le 4 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." local/chain/run_flatstart_cnn1a.sh --nj $nj fi -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." +if [ $stage -le 5 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." 
- local/chain/run_cnn_e2eali_1b.sh --nj $nj --stage 2 +if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" + local/chain/run_cnn_e2eali_1b.sh --nj $nj fi diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/score.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/score.sh deleted file mode 100755 index 1d84815fc69..00000000000 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/score.sh +++ /dev/null @@ -1,6 +0,0 @@ - -#!/bin/bash - - -steps/scoring/score_kaldi_wer.sh "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 95af220fd3e..74753aaa45d 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -7,9 +7,7 @@ nj=70 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -27,15 +25,17 @@ data_splits_dir=data/download/data_splits mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then for dataset in test train dev; do data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ @@ -44,9 +44,7 @@ if [ $stage -le 1 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ --data data/local/$dataset done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --images_scp_dir data/local \ @@ -54,76 +52,65 @@ if [ $stage -le 2 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." 
+ image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train -fi -if [ $stage -le 4 ]; then for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - echo "Date: $(date)." + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset steps/compute_cmvn_stats.sh data/$dataset || exit 1; done - echo "$0: Fixing data directory for train dataset" - echo "Date: $(date)." + echo "$0: Fixing data directory for train dataset $(date)." utils/fix_data_dir.sh data/train fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." cut -d' ' -f2- data/train/text | local/reverse.py | \ utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ utils/lang/bpe/prepend_words.py --encoding 'utf-8' | - utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.out \ | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." +if [ $stage -le 4 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." local/chain/run_flatstart_cnn1a.sh --nj $nj fi -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." +if [ $stage -le 5 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." 
+if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" local/chain/run_cnn_e2eali_1b.sh --nj $nj fi From 9c6a923e89ad8da94c428bfd6af01e4e12ecfb8a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 14 Sep 2018 22:30:34 -0400 Subject: [PATCH 03/67] adding higher language model --- egs/madcat_ar/v1/local/train_lm.sh | 42 +++++++++++++++------- egs/madcat_ar/v1/run_end2end.sh | 4 +-- egs/wsj/s5/utils/lang/bpe/prepend_words.py | 11 ++++-- 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 3b8a382cb00..85cb06480a3 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,20 +6,19 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the MADCAT training transcriptions. +# This script trains a LM on the training transcriptions and corpus text. # It is based on the example scripts distributed with PocoLM # It will check if pocolm is installed and if not will proceed with installation set -e stage=0 - +dir=data/local/local_lm +order=6 echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh || exit 1; -dir=data/local/local_lm lm_dir=${dir}/data -segments=data/train/segmented_words mkdir -p $dir @@ -43,12 +42,10 @@ bypass_metaparam_optim_opt= # These example numbers of metaparameters is for 4-gram model (with min-counts) # running with train_lm.py. # The dev perplexity should be close to the non-bypassed model. -#bypass_metaparam_optim_opt= # Note: to use these example parameters, you may need to remove the .done files # to make sure the make_lm_dir.py be called and tain only 3-gram model #for order in 3; do #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done - if [ $stage -le 0 ]; then mkdir -p ${dir}/data mkdir -p ${dir}/data/text @@ -65,7 +62,7 @@ if [ $stage -le 0 ]; then # use the training data as an additional data source. # we can later fold the dev data into this. - cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt # for reporting perplexities, we'll use the "real" dev set. # (the validation data is used as ${dir}/data/text/dev.txt to work @@ -75,12 +72,10 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from MADCAT text - cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi -order=3 - if [ $stage -le 1 ]; then # decide on the vocabulary. 
# Note: you'd use --wordlist if you had a previously determined word-list @@ -88,7 +83,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=2 madcat=1' + min_counts='train=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" @@ -96,13 +91,34 @@ if [ $stage -le 1 ]; then lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" fi unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm - train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ --limit-unk-history=true \ ${bypass_metaparam_optim_opt} \ ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - mkdir -p ${dir}/data/arpa format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500k n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 74753aaa45d..717a629ae60 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -76,7 +76,7 @@ if [ $stage -le 2 ]; then cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ utils/lang/bpe/prepend_words.py --encoding 'utf-8' | - utils/lang/bpe/apply_bpe.py -c data/local/bpe.out \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old @@ -94,7 +94,7 @@ fi if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi diff --git a/egs/wsj/s5/utils/lang/bpe/prepend_words.py b/egs/wsj/s5/utils/lang/bpe/prepend_words.py index face771c7ca..d497344e850 100755 --- a/egs/wsj/s5/utils/lang/bpe/prepend_words.py +++ b/egs/wsj/s5/utils/lang/bpe/prepend_words.py @@ -4,11 +4,16 @@ # the beginning of the words for finding the initial-space of every word # after decoding. 
+import argparse import sys, io -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1') +parser = argparse.ArgumentParser(description="Prepends '|' to the beginning of every word") +parser.add_argument('--encoding', type=str, default='latin-1', + help='Type of encoding') +args = parser.parse_args() + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding=args.encoding) +output = io.TextIOWrapper(sys.stdout.buffer, encoding=args.encoding) for line in infile: output.write(' '.join([ "|"+word for word in line.split()]) + '\n') - From 2c87fe5eecc60afd00a056e9f11acf55bc4bf54b Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 14 Sep 2018 22:37:13 -0400 Subject: [PATCH 04/67] fixing bug --- egs/madcat_ar/v1/run_end2end.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 717a629ae60..3696284ed1e 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -75,7 +75,7 @@ if [ $stage -le 2 ]; then for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text From 053fbdb09491c48b9bf72084b1a5bb8d41c9de26 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 14 Sep 2018 22:50:58 -0400 Subject: [PATCH 05/67] minor fix --- egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh | 10 +++++----- egs/madcat_ar/v1/run_end2end.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh index e21bf7b73dc..01072c565bd 100755 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh @@ -32,7 +32,7 @@ if [ $stage -le 0 ]; then echo "Exiting with status 1 to avoid data corruption" exit 1; fi - echo "$0: Downloading data splits...$(date)." + echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 @@ -80,13 +80,13 @@ if [ $stage -le 2 ]; then for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | - utils/lang/bpe/apply_bpe.py -c data/local/bpe.out \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text - rm -f data/$set/bpe_text data/$set/ids + #rm -f data/$set/bpe_text data/$set/ids done echo "$0:Preparing dictionary and lang..." @@ -99,7 +99,7 @@ fi if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 3696284ed1e..856ddb97f11 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -81,7 +81,7 @@ if [ $stage -le 2 ]; then mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text - rm -f data/$set/bpe_text data/$set/ids + #rm -f data/$set/bpe_text data/$set/ids done echo "$0:Preparing dictionary and lang..." From 837fd4dfcd717d022dc9cab508571c47d385b796 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 14 Sep 2018 22:56:24 -0400 Subject: [PATCH 06/67] adding augmentation --- .../v1/local/tl/not_much_imp/run_end2end.sh | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh index 01072c565bd..1ff5b549180 100755 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh +++ b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh @@ -56,22 +56,25 @@ if [ $stage -le 1 ]; then echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - - for dataset in dev train; do + for set in dev test train; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; done echo "$0: Fixing data directory for train dataset $(date)." - utils/fix_data_dir.sh data/train + image/fix_data_dir.sh data/train - local/make_features.py data/test/images.scp --feat-dim 40 \ - --allowed_len_file_path data/test/allowed_lengths.txt | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/test/data/images.ark,data/test/feats.scp fi if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Preparing BPE..." cut -d' ' -f2- data/train/text | local/reverse.py | \ utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ From 9c1d5533a501f80532032984c70c4bca280453b6 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 15:50:37 -0400 Subject: [PATCH 07/67] updating parameters --- .../v1/local/chain/run_flatstart_cnn1b.sh | 174 ++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh new file mode 100644 index 00000000000..71130edf244 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.71 +# CER 2.85 +# Final train prob -0.0859 +# Final valid prob -0.1266 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 2.94M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) + +set -e + +# configs for 'chain' +stage=0 +nj=70 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=550 +minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 +common_egs_dir= +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +lang_test=lang_test +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 
height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir From 47b6508c80f6cb0dfa3b74a4562843d877012a6a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 15:59:00 -0400 Subject: [PATCH 08/67] updating parameters --- .../v1/local/chain/run_flatstart_cnn1b.sh | 27 +++++++------------ egs/madcat_ar/v1/path.sh | 1 + 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh index 71130edf244..2374c2181c1 100644 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh @@ -23,7 +23,7 @@ stage=0 nj=70 train_stage=-10 get_egs_stage=-10 -affix=1a +affix=1b # training options tdnn_dim=550 @@ -94,20 +94,10 @@ if [ $stage -le 2 ]; then conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 - relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 - relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 - relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 - - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim dropout-proportion=0.0 + ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 @@ -123,15 +113,16 @@ if [ $stage -le 3 ]; then steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ --feat.cmvn-opts "$cmvn_opts" \ - --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 
--no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter 2000000 \ diff --git a/egs/madcat_ar/v1/path.sh b/egs/madcat_ar/v1/path.sh index 2d17b17a84a..252d4ab04fe 100755 --- a/egs/madcat_ar/v1/path.sh +++ b/egs/madcat_ar/v1/path.sh @@ -3,4 +3,5 @@ export KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh +export CUDA_CACHE_DISABLE=1 export LC_ALL=C From 18f585e46a597651fa4323f7c561011ce5cf7384 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 16:58:16 -0400 Subject: [PATCH 09/67] updating parameters --- egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh | 1 + egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh index 4eea10a8441..67c76fdfd37 100755 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh @@ -125,6 +125,7 @@ if [ $stage -le 3 ]; then --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter $frames_per_iter \ diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh old mode 100644 new mode 100755 index 2374c2181c1..901903a9bba --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh @@ -117,7 +117,6 @@ if [ $stage -le 3 ]; then --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ - --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ From cf22d16ecc0cf933101e8c61aaaeb198796d3af1 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 18:11:14 -0400 Subject: [PATCH 10/67] minor cleaning and higher order language model --- .../v1/local/chain/run_cnn_e2eali_1b.sh | 2 +- .../v1/local/chain/run_flatstart_cnn1a.sh | 3 +- .../create_line_image_from_page_image.py | 15 ++-- egs/madcat_ar/v1/local/process_data.py | 72 +++++++----------- egs/madcat_ar/v1/local/score.sh | 4 +- egs/madcat_ar/v1/local/train_lm.sh | 42 ++++++---- egs/madcat_ar/v1/run_end2end.sh | 76 ++++++++----------- 7 files changed, 103 insertions(+), 111 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh index 75c246f5ffe..55df0cad4b7 100755 --- a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh @@ -193,7 +193,7 @@ if [ $stage -le 5 ]; then 
--trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh index 2c85e982ce6..67c76fdfd37 100755 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh @@ -33,7 +33,7 @@ num_jobs_final=16 minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 common_egs_dir= l2_regularize=0.00005 -frames_per_iter=1000000 +frames_per_iter=2000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train lang_test=lang_test @@ -125,6 +125,7 @@ if [ $stage -le 3 ]; then --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter $frames_per_iter \ diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index ba35f8b9ace..a91fe55ed3e 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -60,6 +60,8 @@ help='Path to the downloaded (and extracted) writing conditions file 3') parser.add_argument('--padding', type=int, default=400, help='padding across horizontal/verticle direction') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() """ @@ -535,13 +537,12 @@ def check_writing_condition(wc_dict, base_name): Returns (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - - return True - + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True ### main ### diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index b57500cf2fa..920cb6f700b 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -42,6 +42,8 @@ help='Path to the downloaded (and extracted) writing conditions file 2') parser.add_argument('writing_condition3', type=str, help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -97,50 +99,40 @@ def check_writing_condition(wc_dict): Returns: (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True - return True - -def get_word_line_mapping(madcat_file_path): +def read_text(madcat_file_path): """ Maps every word in the page image to a corresponding line. 
Args: - madcat_file_path (string): complete path and name of the madcat xml file + madcat_file_path (string): complete path and name of the madcat xml file corresponding to the page image. Returns: + dict: Mapping every word in the page image to a corresponding line. """ + + word_line_dict = dict() doc = minidom.parse(madcat_file_path) zone = doc.getElementsByTagName('zone') for node in zone: line_id = node.getAttribute('id') - line_word_dict[line_id] = list() word_image = node.getElementsByTagName('token-image') for tnode in word_image: word_id = tnode.getAttribute('id') - line_word_dict[line_id].append(word_id) word_line_dict[word_id] = line_id - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. - """ text_line_word_dict = dict() - doc = minidom.parse(madcat_file_path) segment = doc.getElementsByTagName('segment') for node in segment: token = node.getElementsByTagName('token') for tnode in token: ref_word_id = tnode.getAttribute('ref_id') word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - word = unicodedata.normalize('NFKC',word) ref_line_id = word_line_dict[ref_word_id] if ref_line_id not in text_line_word_dict: text_line_word_dict[ref_line_id] = list() @@ -160,7 +152,6 @@ def get_line_image_location(): ### main ### - print("Processing '{}' data...".format(args.out_dir)) text_file = os.path.join(args.out_dir, 'text') @@ -188,24 +179,19 @@ def get_line_image_location(): madcat_xml_path, image_file_path, wc_dict = check_file_location() if wc_dict is None or not check_writing_condition(wc_dict): continue - if madcat_xml_path is not None: - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - line_word_dict = dict() - word_line_dict = dict() - get_word_line_mapping(madcat_xml_path) - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path) - base_name, b = base_name.split('.tif') - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for lineID in sorted(text_line_word_dict): + updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + 
image_num += 1 diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh index 2c11aba3e13..31564d25326 100755 --- a/egs/madcat_ar/v1/local/score.sh +++ b/egs/madcat_ar/v1/local/score.sh @@ -1,5 +1,5 @@ #!/bin/bash -steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 3b8a382cb00..85cb06480a3 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,20 +6,19 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the MADCAT training transcriptions. +# This script trains a LM on the training transcriptions and corpus text. # It is based on the example scripts distributed with PocoLM # It will check if pocolm is installed and if not will proceed with installation set -e stage=0 - +dir=data/local/local_lm +order=6 echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh || exit 1; -dir=data/local/local_lm lm_dir=${dir}/data -segments=data/train/segmented_words mkdir -p $dir @@ -43,12 +42,10 @@ bypass_metaparam_optim_opt= # These example numbers of metaparameters is for 4-gram model (with min-counts) # running with train_lm.py. # The dev perplexity should be close to the non-bypassed model. -#bypass_metaparam_optim_opt= # Note: to use these example parameters, you may need to remove the .done files # to make sure the make_lm_dir.py be called and tain only 3-gram model #for order in 3; do #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done - if [ $stage -le 0 ]; then mkdir -p ${dir}/data mkdir -p ${dir}/data/text @@ -65,7 +62,7 @@ if [ $stage -le 0 ]; then # use the training data as an additional data source. # we can later fold the dev data into this. - cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt # for reporting perplexities, we'll use the "real" dev set. # (the validation data is used as ${dir}/data/text/dev.txt to work @@ -75,12 +72,10 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from MADCAT text - cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi -order=3 - if [ $stage -le 1 ]; then # decide on the vocabulary. 
# Note: you'd use --wordlist if you had a previously determined word-list @@ -88,7 +83,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=2 madcat=1' + min_counts='train=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" @@ -96,13 +91,34 @@ if [ $stage -le 1 ]; then lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" fi unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm - train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ --limit-unk-history=true \ ${bypass_metaparam_optim_opt} \ ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - mkdir -p ${dir}/data/arpa format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500k n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 5d27476d3e1..3696284ed1e 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -7,9 +7,7 @@ nj=70 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -27,15 +25,17 @@ data_splits_dir=data/download/data_splits mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." + + if [ -f data/train/text ] && ! 
$overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then for dataset in test train dev; do data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ @@ -44,9 +44,7 @@ if [ $stage -le 1 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ --data data/local/$dataset done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --images_scp_dir data/local \ @@ -54,75 +52,65 @@ if [ $stage -le 2 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train -fi -if [ $stage -le 4 ]; then for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - echo "Date: $(date)." + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset steps/compute_cmvn_stats.sh data/$dataset || exit 1; done - echo "$0: Fixing data directory for train dataset" - echo "Date: $(date)." + echo "$0: Fixing data directory for train dataset $(date)." utils/fix_data_dir.sh data/train fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." cut -d' ' -f2- data/train/text | local/reverse.py | \ - local/prepend_words.py | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. 
- # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." +if [ $stage -le 4 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." local/chain/run_flatstart_cnn1a.sh --nj $nj fi -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." +if [ $stage -le 5 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." +if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" local/chain/run_cnn_e2eali_1b.sh --nj $nj fi From 95aed1005b58f8adcff45f988947bfffabd09bbb Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 18:16:33 -0400 Subject: [PATCH 11/67] updating results --- egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh | 10 +++++----- egs/madcat_ar/v1/local/train_lm.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh index 67c76fdfd37..033cb88df10 100755 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh @@ -5,16 +5,16 @@ # local/chain/compare_wer.sh exp/chain/e2e_cnn_1a # System e2e_cnn_1a -# WER 10.71 -# CER 2.85 -# Final train prob -0.0859 -# Final valid prob -0.1266 +# WER 7.81 +# CER 2.05 +# Final train prob -0.0812 +# Final valid prob -0.0708 # Final train prob (xent) # Final valid prob (xent) # Parameters 2.94M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) +# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.073->-0.073 (over 2) logprob:train/valid[64,97,final]=(-0.084,-0.080,-0.081/-0.073,-0.070,-0.071) set -e diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 85cb06480a3..b7fc0b09a46 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,7 +6,7 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the training transcriptions and corpus text. +# This script trains a LM on the training transcriptions. 
# It is based on the example scripts distributed with PocoLM # It will check if pocolm is installed and if not will proceed with installation From 85e3649535912d68ebadeae7126276edda0c4cb0 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 18:26:11 -0400 Subject: [PATCH 12/67] minor fix and adding tuning directory --- egs/madcat_ar/v1/local/chain/run_cnn.sh | 1 + egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh | 1 + egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh | 1 + egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh | 1 + egs/madcat_ar/v1/local/chain/{ => tuning}/run_cnn_1a.sh | 0 .../v1/local/chain/{ => tuning}/run_cnn_chainali_1a.sh | 0 .../v1/local/chain/{ => tuning}/run_cnn_e2eali_1a.sh | 0 .../v1/local/chain/{ => tuning}/run_cnn_e2eali_1b.sh | 0 .../{run_flatstart_cnn1a.sh => tuning/run_e2e_cnn_1a.sh} | 0 egs/madcat_ar/v1/run.sh | 4 ++-- egs/madcat_ar/v1/run_end2end.sh | 4 ++-- 11 files changed, 8 insertions(+), 4 deletions(-) create mode 120000 egs/madcat_ar/v1/local/chain/run_cnn.sh create mode 120000 egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh create mode 120000 egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh create mode 120000 egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh rename egs/madcat_ar/v1/local/chain/{ => tuning}/run_cnn_1a.sh (100%) rename egs/madcat_ar/v1/local/chain/{ => tuning}/run_cnn_chainali_1a.sh (100%) rename egs/madcat_ar/v1/local/chain/{ => tuning}/run_cnn_e2eali_1a.sh (100%) rename egs/madcat_ar/v1/local/chain/{ => tuning}/run_cnn_e2eali_1b.sh (100%) rename egs/madcat_ar/v1/local/chain/{run_flatstart_cnn1a.sh => tuning/run_e2e_cnn_1a.sh} (100%) diff --git a/egs/madcat_ar/v1/local/chain/run_cnn.sh b/egs/madcat_ar/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..a864819f542 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1a.sh \ No newline at end of file diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..fcf59f917c1 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1b.sh \ No newline at end of file diff --git a/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh similarity index 100% rename from egs/madcat_ar/v1/local/chain/run_cnn_1a.sh rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh similarity index 100% rename from egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh similarity index 100% rename from egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh rename to 
egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh similarity index 100% rename from egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh similarity index 100% rename from egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index 14c8bf7a6ce..06a16bf157a 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -132,9 +132,9 @@ if [ $stage -le 12 ]; then fi if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh + local/chain/run_cnn.sh fi if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1a.sh --stage 2 + local/chain/run_cnn_chainali.sh --stage 2 fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 3696284ed1e..0a57676fdbf 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -100,7 +100,7 @@ fi if [ $stage -le 4 ]; then echo "$0: Calling the flat-start chain recipe... $(date)." - local/chain/run_flatstart_cnn1a.sh --nj $nj + local/chain/run_e2e_cnn.sh --nj $nj fi if [ $stage -le 5 ]; then @@ -112,5 +112,5 @@ fi if [ $stage -le 6 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali_1b.sh --nj $nj + local/chain/run_cnn_e2eali.sh --nj $nj fi From bff652cc88c46741fb7e14be429b79a77834d812 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 18:50:24 -0400 Subject: [PATCH 13/67] adding overwrite variable --- egs/madcat_ar/v1/local/extract_features.sh | 4 ++++ egs/madcat_ar/v1/run_end2end.sh | 1 + 2 files changed, 5 insertions(+) diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 70c5498626c..56a8443e328 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -1,7 +1,11 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 0a57676fdbf..e5ca540d3c1 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -15,6 +15,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. 
From 303246ee867a2414b755f76a52e7406a98f1f7b3 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 18:53:56 -0400 Subject: [PATCH 14/67] adding documentation, fixing run.sh, minor fix --- egs/madcat_ar/v1/run.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index 06a16bf157a..f6a63320497 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -11,9 +11,7 @@ decode_gmm=false # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -21,7 +19,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits - +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -34,8 +32,14 @@ mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 fi @@ -79,7 +83,7 @@ fi if [ $stage -le 5 ]; then echo "$0: Estimating a language model for decoding..." 
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi From 6b857dec87a33652921835bbf6e64fba125799b7 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 22:04:38 -0400 Subject: [PATCH 15/67] adding text localization changes --- egs/madcat_ar/v1/local/tl/augment_data.sh | 34 ++ ...eate_line_image_from_page_image.py.augment | 530 ++++++++++++++++++ egs/madcat_ar/v1/local/tl/make_features.py | 170 ++++++ egs/madcat_ar/v1/local/tl/prepare_data.sh | 49 ++ egs/madcat_ar/v1/local/tl/process_data.py | 215 +++++++ .../v1/local/tl/process_waldo_data.py | 62 ++ egs/madcat_ar/v1/local/tl/run_end2end.sh | 124 ++++ .../v1/local/tl/run_textlocalization.sh | 128 +++++ 8 files changed, 1312 insertions(+) create mode 100755 egs/madcat_ar/v1/local/tl/augment_data.sh create mode 100755 egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment create mode 100755 egs/madcat_ar/v1/local/tl/make_features.py create mode 100755 egs/madcat_ar/v1/local/tl/prepare_data.sh create mode 100755 egs/madcat_ar/v1/local/tl/process_data.py create mode 100755 egs/madcat_ar/v1/local/tl/process_waldo_data.py create mode 100755 egs/madcat_ar/v1/local/tl/run_end2end.sh create mode 100755 egs/madcat_ar/v1/local/tl/run_textlocalization.sh diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment new file mode 100755 index 00000000000..faf0d3503c7 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment @@ -0,0 +1,530 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. 
To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. +""" + +import sys +import argparse +import os +import xml.dom.minidom as minidom +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple +import random +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage +parser = argparse.ArgumentParser(description="Creates line images from page image", + epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded madcat data directory 1') +parser.add_argument('database_path2', type=str, + help='Path to the downloaded madcat data directory 2') +parser.add_argument('database_path3', type=str, + help='Path to the downloaded madcat data directory 3') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files') +parser.add_argument('writing_condition1', type=str, + help='Path to the downloaded (and extracted) writing conditions file 1') +parser.add_argument('writing_condition2', type=str, + help='Path to the downloaded (and extracted) writing conditions file 2') +parser.add_argument('writing_condition3', type=str, + help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument('--padding', type=int, default=400, + help='padding across horizontal/verticle direction') +parser.add_argument('--pixel-scaling', type=int, default=30, + help='padding across horizontal/verticle direction') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. 
+ Returns + ------- + (float, float): unit vector + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0]) / dis_0_to_1, \ + (pt1[1] - pt0[1]) / dis_0_to_1 + + +def orthogonal_vector(vector): + """ Given a vector, returns a orthogonal/perpendicular vector of equal length. + Returns + ------ + (float, float): A vector that points in the direction orthogonal to vector. + """ + return -1 * vector[1], vector[0] + + +def bounding_area(index, hull): + """ Given index location in an array and convex hull, it gets two points + hull[index] and hull[index+1]. From these two points, it returns a named + tuple that mainly contains area of the box that bounds the hull. This + bounding box orintation is same as the orientation of the lines formed + by the point hull[index] and hull[index+1]. + Returns + ------- + a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function) + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'unit_vector': unit_vector_p, + } + + +def to_xy_coordinates(unit_vector_angle, point): + """ Given angle from horizontal axis and a point from origin, + returns converted unit vector coordinates in x, y coordinates. + angle of unit vector should be in radians. + Returns + ------ + (float, float): converted x,y coordinate of the unit vector. + """ + angle_orthogonal = unit_vector_angle + pi / 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + input + ----- + center_of_rotation (float, float): angle of unit vector to be in radians. + angle (float): angle of rotation to be in radians. + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Returns + ------ + [(float, float)]: Rotated points around center of rotation by angle + """ + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination, returns the corner + locations of the rectangle. + Returns + ------ + [(float, float)]: 4 corner points of rectangle. 
+ """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. RADIANS + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle)) + ) + + +def get_center(im): + """ Given image, returns the location of center pixel + Returns + ------- + (int, int): center of the image + """ + center_x = im.size[0] / 2 + center_y = im.size[1] / 2 + return int(center_x), int(center_y) + + +def get_horizontal_angle(unit_vector_angle): + """ Given an angle in radians, returns angle of the unit vector in + first or fourth quadrant. + Returns + ------ + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + """ + if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + + +def get_smaller_angle(bounding_box): + """ Given a rectangle, returns its smallest absolute angle from horizontal axis. + Returns + ------ + (float): smallest angle of the rectangle to be in radians. 
+ """ + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + + +def rotated_points(bounding_box, center): + """ Given the rectangle, returns corner points of rotated rectangle. + It rotates the rectangle around the center by its smallest angle. + Returns + ------- + [(int, int)]: 4 corner points of rectangle. + """ + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + + +def pad_image(image): + """ Given an image, returns a padded image around the border. + This routine save the code from crashing if bounding boxes that are + slightly outside the page boundary. + Returns + ------- + image: page image + """ + offset = int(args.padding // 2) + padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") + padded_image.paste(im = image, box = (offset, offset)) + return padded_image + + +def update_minimum_bounding_box_input(bounding_box_input): + """ Given list of 2D points, returns list of 2D points shifted by an offset. + Returns + ------ + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + updated_minimum_bounding_box_input = [] + offset = int(args.padding // 2) + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + + +def dilate_polygon(points, amount_increase): + """ Increases size of polygon given as a list of tuples. 
+ Assumes points in polygon are given in CCW + """ + expanded_points = [] + for index, point in enumerate(points): + prev_point = points[(index - 1) % len(points)] + next_point = points[(index + 1) % len(points)] + prev_edge = np.subtract(point, prev_point) + next_edge = np.subtract(next_point, point) + + prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) + prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) + next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) + next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) + + bisect = np.add(prev_normal, next_normal) + bisect = np.divide(bisect, np.linalg.norm(bisect)) + + cos_theta = np.dot(next_normal, bisect) + hyp = amount_increase / cos_theta + + new_point = np.around(point + hyp * bisect) + new_point = new_point.astype(int) + new_point = new_point.tolist() + new_point = tuple(new_point) + expanded_points.append(new_point) + return expanded_points + + +def set_line_image_data(image, line_id, image_file_name, image_fh): + """ Given an image, saves a flipped line image. Line image file name + is formed by appending the line id at the end page image name. + """ + + base_name = os.path.splitext(os.path.basename(image_file_name))[0] + line_id = '_' + line_id.zfill(4) + line_image_file_name = base_name + line_id + '.png' + image_path = os.path.join(args.out_dir, line_image_file_name) + imgray = image.convert('L') + imgray_rev_arr = np.fliplr(imgray) + imgray_rev = toimage(imgray_rev_arr) + imgray_rev.save(image_path) + image_fh.write(image_path + '\n') + + +def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh): + """ Given a page image, extracts the line images from it. + Input + ----- + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. 
+ """ + im_wo_pad = Image.open(image_file_name) + im = pad_image(im_wo_pad) + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + id = node.getAttribute('id') + token_image = node.getElementsByTagName('token-image') + minimum_bounding_box_input = [] + for token_node in token_image: + word_point = token_node.getElementsByTagName('point') + for word_node in word_point: + word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) + minimum_bounding_box_input.append(word_coordinate) + updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) + points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + + +def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): + """ Returns the complete path of the page image and corresponding + xml file. + Returns + ------- + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. 
+ """ + madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') + madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') + madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') + image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') + image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + if os.path.exists(madcat_file_path2): + return madcat_file_path2, image_file_path2, wc_dict2 + + if os.path.exists(madcat_file_path3): + return madcat_file_path3, image_file_path3, wc_dict3 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Given writing condition file path, returns a dictionary which have writing condition + of each page image. + Returns + ------ + (dict): dictionary with key as page image name and value as writing condition. + """ + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict, base_name): + """ Given writing condition dictionary, checks if a page image is writing + in a specifed writing condition. + It is used to create subset of dataset based on writing condition. + Returns + (bool): True if writing condition matches. + """ + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True + +### main ### + +def main(): + + wc_dict1 = parse_writing_conditions(args.writing_condition1) + wc_dict2 = parse_writing_conditions(args.writing_condition2) + wc_dict3 = parse_writing_conditions(args.writing_condition3) + output_directory = args.out_dir + image_file = os.path.join(output_directory, 'images.scp') + image_fh = open(image_file, 'w', encoding='utf-8') + + splits_handle = open(args.data_splits, 'r') + splits_data = splits_handle.read().strip().split('\n') + prev_base_name = '' + for line in splits_data: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) + if wc_dict is None or not check_writing_condition(wc_dict, base_name): + continue + if madcat_file_path is not None: + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + + +if __name__ == '__main__': + main() + diff --git a/egs/madcat_ar/v1/local/tl/make_features.py b/egs/madcat_ar/v1/local/tl/make_features.py new file mode 100755 index 00000000000..e9d10ecc87e --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/make_features.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). 
It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + + eg. local/make_features.py data/train --feat-dim 40 +""" +import random +import argparse +import os +import sys +import numpy as np +from scipy import misc +import math + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +parser.add_argument('--vertical-shift', type=int, default=16, + help='total number of padding pixel per column') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] # width + sy = im.shape[0] # height + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + return im + + +def horizontal_pad(im, allowed_lengths = None): + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] # width + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = int(padding // 2) + right_padding = padding - left_padding + dim_y = im.shape[0] # height + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +def vertical_shift(im, mode='mid'): + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'mid': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + 
im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad + +### main ### +random.seed(1) +data_list_path = args.images_scp_path +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'w') + +allowed_lengths = None +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(allowed_len_handle) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +aug_setting = ['mid', 'notmid'] +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im) + im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) + if im_horizontal_padded is None: + num_fail += 1 + continue + if args.augment: + im_shift = vertical_shift(im_horizontal_padded, shift_setting[1]) + else: + im_shift = vertical_shift(im_horizontal_padded, shift_setting[0]) + data = np.transpose(im_shift, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (image too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/local/tl/prepare_data.sh b/egs/madcat_ar/v1/local/tl/prepare_data.sh new file mode 100755 index 00000000000..5fe41e7cf4c --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/prepare_data.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script prepares the training and test data for MADCAT Arabic dataset +# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. + +# Eg. local/prepare_data.sh +# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ +# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 +# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 +# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif + +stage=0 +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits +images_scp_dir=data/local + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +mkdir -p data/{train,test,dev} + +if [ $stage -le 1 ]; then + echo "$0: Processing dev, train and test data...$(date)" + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 + + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 + + local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + + for dataset in dev test train; do + echo "$0: Fixing data directory for dataset: $dataset." + image/fix_data_dir.sh data/$dataset + done +fi diff --git a/egs/madcat_ar/v1/local/tl/process_data.py b/egs/madcat_ar/v1/local/tl/process_data.py new file mode 100755 index 00000000000..c21beb1be70 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/process_data.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora + +""" This script reads MADCAT files and creates the following files (for the + data subset selected via --dataset) :text, utt2spk, images.scp. + Eg. local/process_data.py data/local /export/corpora/LDC/LDC2012T15 /export/corpora/LDC/LDC2013T09 + /export/corpora/LDC/LDC2013T15 data/download/data_splits/madcat.train.raw.lineid + data/dev data/local/lines/images.scp + Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 وجه وعقل غارق حتّى النخاع + utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 + images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 + data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom +import unicodedata + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. 
" + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('database_path2', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('database_path3', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +parser.add_argument('images_scp_path', type=str, + help='Path of input images.scp file(maps line image and location)') +parser.add_argument('writing_condition1', type=str, + help='Path to the downloaded (and extracted) writing conditions file 1') +parser.add_argument('writing_condition2', type=str, + help='Path to the downloaded (and extracted) writing conditions file 2') +parser.add_argument('writing_condition3', type=str, + help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") +args = parser.parse_args() + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') + madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') + madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') + image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') + image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + if os.path.exists(madcat_file_path2): + return madcat_file_path2, image_file_path2, wc_dict2 + + if os.path.exists(madcat_file_path3): + return madcat_file_path3, image_file_path3, wc_dict3 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + Returns: + (dict): dictionary with key as page image name and value as writing condition. + """ + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. + Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. 
+ Returns: + (bool): True if writing condition matches. + """ + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True + + +def read_text(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + Returns: + dict: Mapping every word in the page image to a corresponding line. + """ + + word_line_dict = dict() + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + line_id = node.getAttribute('id') + word_image = node.getElementsByTagName('token-image') + for tnode in word_image: + word_id = tnode.getAttribute('id') + word_line_dict[word_id] = line_id + + text_line_word_dict = dict() + segment = doc.getElementsByTagName('segment') + for node in segment: + token = node.getElementsByTagName('token') + for tnode in token: + ref_word_id = tnode.getAttribute('ref_id') + word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue + ref_line_id = word_line_dict[ref_word_id] + if ref_line_id not in text_line_word_dict: + text_line_word_dict[ref_line_id] = list() + text_line_word_dict[ref_line_id].append(word) + return text_line_word_dict + + +def get_line_image_location(): + image_loc_dict = dict() # Stores image base name and location + image_loc_vect = input_image_fh.read().strip().split("\n") + for line in image_loc_vect: + base_name = os.path.basename(line) + location_vect = line.split('/') + location = "/".join(location_vect[:-1]) + image_loc_dict[base_name]=location + return image_loc_dict + + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +input_image_file = args.images_scp_path +input_image_fh = open(input_image_file, 'r', encoding='utf-8') + +wc_dict1 = parse_writing_conditions(args.writing_condition1) +wc_dict2 = parse_writing_conditions(args.writing_condition2) +wc_dict3 = parse_writing_conditions(args.writing_condition3) +image_loc_dict = get_line_image_location() + +image_num = 0 +with open(args.data_splits) as f: + prev_base_name = '' + for line in f: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_xml_path, image_file_path, wc_dict = check_file_location() + if wc_dict is None or not check_writing_condition(wc_dict): + continue + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for line_id in sorted(text_line_word_dict): + if args.augment: + key = (line_id + '.')[:-1] + for i in range(0, 3): + location_id = '_' + line_id + '_scale' + str(i) + line_image_file_name = base_name + location_id + '.png' + location = image_loc_dict[line_image_file_name] + image_file_path = os.path.join(location, line_image_file_name) + line = text_line_word_dict[key] + text = ' '.join(line) + base_line_image_file_name = line_image_file_name.split('.png')[0] + utt_id = writer_id + '_' + 
str(image_num).zfill(6) + '_' + base_line_image_file_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 + else: + updated_base_name = base_name + '_' + str(line_id).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[line_id] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(line_id).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/tl/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py new file mode 100755 index 00000000000..df8b6c5149f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/process_waldo_data.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('image_transcription_file', type=str, + help='Path to the file containing line image path and transcription information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +args = parser.parse_args() + + +def read_image_text(image_text_path): + """ Given the file path containing, mapping information of line image + and transcription, it returns a dict. The dict contains this mapping + info. It can be accessed via line_id and will provide transcription. 
+ Returns: + -------- + dict: line_id and transcription mapping + """ + image_transcription_dict = dict() + with open(image_text_path, encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + image_path = line_vect[0] + line_id = os.path.basename(image_path).split('.png')[0] + transcription = line_vect[1:] + #transcription = " ".join(transcription) + #image_transcription_dict[line_id] = transcription + joined_transcription = list() + for word in transcription: + joined_transcription.append(word) + joined_transcription = " ".join(joined_transcription) + image_transcription_dict[line_id] = joined_transcription + return image_transcription_dict + + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +image_transcription_dict = read_image_text(args.image_transcription_file) +for line_id in image_transcription_dict: + writer_id = line_id.strip().split('_')[-3] + updated_line_id = line_id + '.png' + image_file_path = os.path.join('lines', updated_line_id) + text = image_transcription_dict[line_id] + utt_id = line_id + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + diff --git a/egs/madcat_ar/v1/local/tl/run_end2end.sh b/egs/madcat_ar/v1/local/tl/run_end2end.sh new file mode 100755 index 00000000000..1ff5b549180 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/run_end2end.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora +set -e +stage=0 +nj=30 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! 
$overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 + + for dataset in train dev; do + data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$dataset + done + + echo "$0: Preparing data..." + local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --images_scp_dir data/local \ + --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 +fi + +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in dev test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + image/fix_data_dir.sh data/train + +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | local/reverse.py | \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | local/reverse.py | \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + #rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 4 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." 
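+  # The flat-start (e2e) model trained by run_flatstart_cnn1a.sh in the previous
+  # stage is reused here only to produce alignments of the training data; the next
+  # stage then builds a tree from these alignments and trains the regular chain
+  # model on top of them.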
+ steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" + local/chain/run_cnn_e2eali_1b.sh --nj $nj +fi diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh new file mode 100755 index 00000000000..5d27476d3e1 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora +set -e +stage=0 +nj=70 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/LDC2012T15, +# https://catalog.ldc.upenn.edu/LDC2013T09/, +# https://catalog.ldc.upenn.edu/LDC2013T15/. +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} + +if [ $stage -le 0 ]; then + echo "$0: Downloading data splits..." + echo "Date: $(date)." + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 +fi + +if [ $stage -le 1 ]; then + for dataset in test train dev; do + data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --images_scp_dir data/local \ + --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 +fi + +if [ $stage -le 3 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + echo "$0: Obtaining image groups. calling get_allowed_lengths" + echo "Date: $(date)." 
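+  # As used here, 4 is the chain model's frame-subsampling factor and 10 is assumed
+  # to be the spacing (in percent) between consecutive allowed lengths, i.e. the
+  # generated allowed_lengths.txt contains image widths that grow by roughly 10%
+  # per step and stay compatible with subsampling by 4.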
+ image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train +fi + +if [ $stage -le 4 ]; then + for dataset in test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " + echo "Date: $(date)." + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset || exit 1; + done + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 5 ]; then + echo "$0: Preparing dictionary and lang..." + cut -d' ' -f2- data/train/text | local/reverse.py | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | local/reverse.py | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 7 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj +fi + +if [ $stage -le 8 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 9 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj +fi From 1bd1448d1c543de817510fb984d67a571ac4dc59 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 22:22:50 -0400 Subject: [PATCH 16/67] adding gpu = false for alignments in runend2end --- egs/madcat_ar/v1/run_end2end.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index e5ca540d3c1..3986ede9d7f 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -107,6 +107,7 @@ fi if [ $stage -le 5 ]; then echo "$0: Aligning the training data using the e2e chain model...$(date)." 
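+  # Presumably --use-gpu false (added below) is there so that the many parallel
+  # alignment jobs run on CPU nodes instead of competing for GPUs; nnet3 alignment
+  # does not require a GPU.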
steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi From 895342a9a0b5a9ddbfcb02bf55511d1a2f5addc8 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 22:30:39 -0400 Subject: [PATCH 17/67] updating text localization routine --- egs/madcat_ar/v1/local/tl/run_end2end.sh | 124 ------------------ .../v1/local/tl/run_textlocalization.sh | 95 +++++++------- 2 files changed, 46 insertions(+), 173 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/tl/run_end2end.sh diff --git a/egs/madcat_ar/v1/local/tl/run_end2end.sh b/egs/madcat_ar/v1/local/tl/run_end2end.sh deleted file mode 100755 index 1ff5b549180..00000000000 --- a/egs/madcat_ar/v1/local/tl/run_end2end.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian -# 2018 Ashish Arora -set -e -stage=0 -nj=30 -# download_dir{1,2,3} points to the database path on the JHU grid. If you have not -# already downloaded the database you can set it to a local directory -# This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits - -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. -. ./path.sh -. ./utils/parse_options.sh # e.g. this parses the above options - # if supplied. -./local/check_tools.sh - -mkdir -p data/{train,test,dev}/data -mkdir -p data/local/{train,test,dev} -if [ $stage -le 0 ]; then - - if [ -f data/train/text ] && ! $overwrite; then - echo "$0: Not processing, probably script have run from wrong stage" - echo "Exiting with status 1 to avoid data corruption" - exit 1; - fi - echo "$0: Downloading data splits...$(date)" - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 - - for dataset in train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid - local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ - --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset - done - - echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 -fi - -if [ $stage -le 1 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." - image/get_image2num_frames.py data/train - image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in dev test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. 
$(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set - steps/compute_cmvn_stats.sh data/$set || exit 1; - done - echo "$0: Fixing data directory for train dataset $(date)." - image/fix_data_dir.sh data/train - -fi - -if [ $stage -le 2 ]; then - for set in train; do - echo "$(date) stage 2: Performing augmentation, it will double training data" - local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data - steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; - done -fi - -if [ $stage -le 3 ]; then - echo "$0: Preparing BPE..." - cut -d' ' -f2- data/train/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - - for set in test train dev; do - cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ - utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ - | sed 's/@@//g' > data/$set/bpe_text - - mv data/$set/text data/$set/text.old - paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text - #rm -f data/$set/bpe_text data/$set/ids - done - - echo "$0:Preparing dictionary and lang..." - local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ - data/local/dict "" data/lang/temp data/lang - utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang -fi - -if [ $stage -le 3 ]; then - echo "$0: Estimating a language model for decoding..." - local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test -fi - -if [ $stage -le 4 ]; then - echo "$0: Calling the flat-start chain recipe... $(date)." - local/chain/run_flatstart_cnn1a.sh --nj $nj -fi - -if [ $stage -le 5 ]; then - echo "$0: Aligning the training data using the e2e chain model...$(date)." - steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train -fi - -if [ $stage -le 6 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali_1b.sh --nj $nj -fi diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 5d27476d3e1..3211e93e120 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -3,13 +3,11 @@ # 2018 Ashish Arora set -e stage=0 -nj=70 +nj=30 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. 
+# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -17,6 +15,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -27,16 +26,18 @@ data_splits_dir=data/download/data_splits mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in test train dev; do + for dataset in train dev; do data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ @@ -44,9 +45,7 @@ if [ $stage -le 1 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ --data data/local/$dataset done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --images_scp_dir data/local \ @@ -54,75 +53,73 @@ if [ $stage -le 2 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in dev test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + image/fix_data_dir.sh data/train + fi -if [ $stage -le 4 ]; then - for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - echo "Date: $(date)." 
- local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done - echo "$0: Fixing data directory for train dataset" - echo "Date: $(date)." - utils/fix_data_dir.sh data/train fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." cut -d' ' -f2- data/train/text | local/reverse.py | \ - local/prepend_words.py | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + #rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." +if [ $stage -le 4 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." local/chain/run_flatstart_cnn1a.sh --nj $nj fi -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." +if [ $stage -le 5 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." 
+if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" local/chain/run_cnn_e2eali_1b.sh --nj $nj fi From a72d9224066396e12149d8d43ca701a79b34c4ea Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 23:04:19 -0400 Subject: [PATCH 18/67] removing unused function --- .../create_line_image_from_page_image.py | 45 ------------------- 1 file changed, 45 deletions(-) diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index a91fe55ed3e..b6af4cbe717 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -211,50 +211,6 @@ def get_orientation(origin, p1, p2): return difference -def compute_hull(points): - """ - Given input list of points, return a list of points that - made up the convex hull. - Returns - ------- - [(float, float)]: convexhull points - """ - hull_points = [] - start = points[0] - min_x = start[0] - for p in points[1:]: - if p[0] < min_x: - min_x = p[0] - start = p - - point = start - hull_points.append(start) - - far_point = None - while far_point is not start: - p1 = None - for p in points: - if p is point: - continue - else: - p1 = p - break - - far_point = p1 - - for p2 in points: - if p2 is point or p2 is p1: - continue - else: - direction = get_orientation(point, far_point, p2) - if direction > 0: - far_point = p2 - - hull_points.append(far_point) - point = far_point - return hull_points - - def minimum_bounding_box(points): """ Given a list of 2D points, it returns the minimum area rectangle bounding all the points in the point cloud. @@ -274,7 +230,6 @@ def minimum_bounding_box(points): hull_ordered = [points[index] for index in ConvexHull(points).vertices] hull_ordered.append(hull_ordered[0]) - #hull_ordered = compute_hull(points) hull_ordered = tuple(hull_ordered) min_rectangle = bounding_area(0, hull_ordered) From b2ef92343fbe23fadef604d99de0546f8dd09154 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 23:09:29 -0400 Subject: [PATCH 19/67] minor change --- egs/madcat_ar/v1/local/create_line_image_from_page_image.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index b6af4cbe717..34e339f1877 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -500,7 +500,6 @@ def check_writing_condition(wc_dict, base_name): return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -520,8 +519,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': From b8974aae6011b30b6fbe746693c8dc05f28f6b47 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 17 Sep 2018 23:56:02 -0400 Subject: [PATCH 20/67] adding option for augmentation --- ...t => create_line_image_from_page_image.py} | 55 ++++++++++++++++--- 1 file changed, 46 insertions(+), 9 deletions(-) rename 
egs/madcat_ar/v1/local/tl/{create_line_image_from_page_image.py.augment => create_line_image_from_page_image.py} (89%) diff --git a/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py similarity index 89% rename from egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment rename to egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py index faf0d3503c7..bb126c39538 100755 --- a/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py.augment +++ b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py @@ -52,6 +52,8 @@ help='padding across horizontal/verticle direction') parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, help="only processes subset of data based on writing condition") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() """ @@ -401,10 +403,48 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) minimum_bounding_box_input.append(word_coordinate) updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] - for i in range(0, 3): - additional_pixel = random.randint(1, args.pixel_scaling) - mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) - bounding_box = minimum_bounding_box(mar) + if args.augment: + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + else: + bounding_box = minimum_bounding_box(points_ordered) (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) @@ -438,8 +478,7 
@@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) box = (min_x, min_y, max_x, max_y) region_final = img2.crop(box) - line_id = id + '_scale' + str(i) - set_line_image_data(region_final, line_id, image_file_name, image_fh) + set_line_image_data(region_final, id, image_file_name, image_fh) def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): @@ -501,7 +540,6 @@ def check_writing_condition(wc_dict, base_name): return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -521,8 +559,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': From 04b938c01d38cbbcae5801d6e4391eb4d831ecf3 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 18 Sep 2018 00:03:44 -0400 Subject: [PATCH 21/67] updating text localization routines --- ...t => create_line_image_from_page_image.py} | 73 ++++-- .../v1/local/tl/{imp => }/make_features.py | 0 .../tl/not_much_imp/run_cnn_e2eali_1b.sh | 246 ------------------ .../v1/local/tl/not_much_imp/run_end2end.sh | 124 --------- .../tl/not_much_imp/run_flatstart_cnn1a.sh | 168 ------------ .../tl/{not_much_imp => }/prepare_data.sh | 0 .../v1/local/tl/{imp => }/process_data.py | 0 .../local/tl/{imp => }/process_waldo_data.py | 0 .../v1/local/tl/run_textlocalization.sh | 95 ++++--- 9 files changed, 102 insertions(+), 604 deletions(-) rename egs/madcat_ar/v1/local/tl/{imp/create_line_image_from_page_image.py.augment => create_line_image_from_page_image.py} (87%) rename egs/madcat_ar/v1/local/tl/{imp => }/make_features.py (100%) delete mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh delete mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh delete mode 100755 egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh rename egs/madcat_ar/v1/local/tl/{not_much_imp => }/prepare_data.sh (100%) rename egs/madcat_ar/v1/local/tl/{imp => }/process_data.py (100%) rename egs/madcat_ar/v1/local/tl/{imp => }/process_waldo_data.py (100%) diff --git a/egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py similarity index 87% rename from egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment rename to egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py index da2b0f0a62f..bb126c39538 100755 --- a/egs/madcat_ar/v1/local/tl/imp/create_line_image_from_page_image.py.augment +++ b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py @@ -50,6 +50,10 @@ help='padding across horizontal/verticle direction') parser.add_argument('--pixel-scaling', type=int, default=30, help='padding across horizontal/verticle direction') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() """ @@ -331,7 +335,8 @@ def update_minimum_bounding_box_input(bounding_box_input): def dilate_polygon(points, 
amount_increase): - """ Increases size of polygon given as a list of tuples. Assumes points in polygon are given in CCW + """ Increases size of polygon given as a list of tuples. + Assumes points in polygon are given in CCW """ expanded_points = [] for index, point in enumerate(points): @@ -398,10 +403,48 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) minimum_bounding_box_input.append(word_coordinate) updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] - for i in range(0, 3): - additional_pixel = random.randint(1, args.pixel_scaling) - mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) - bounding_box = minimum_bounding_box(mar) + if args.augment: + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + else: + bounding_box = minimum_bounding_box(points_ordered) (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) @@ -435,8 +478,7 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) box = (min_x, min_y, max_x, max_y) region_final = img2.crop(box) - line_id = id + '_scale' + str(i) - set_line_image_data(region_final, line_id, image_file_name, image_fh) + set_line_image_data(region_final, id, image_file_name, image_fh) def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): @@ -490,16 +532,14 @@ def check_writing_condition(wc_dict, base_name): Returns (bool): True if writing condition matches. 
""" - #return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - - return True - + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -519,8 +559,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': diff --git a/egs/madcat_ar/v1/local/tl/imp/make_features.py b/egs/madcat_ar/v1/local/tl/make_features.py similarity index 100% rename from egs/madcat_ar/v1/local/tl/imp/make_features.py rename to egs/madcat_ar/v1/local/tl/make_features.py diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh deleted file mode 100755 index f44b12667e9..00000000000 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/run_cnn_e2eali_1b.sh +++ /dev/null @@ -1,246 +0,0 @@ -#!/bin/bash - -# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the -# lattice alignments and to build a tree - -# local/chain/compare_wer.sh exp/chain/exp/chain/cnn_e2eali_1b -# System cnn_e2eali_1b -# WER 10.78 -# CER 2.99 -# Final train prob -0.0587 -# Final valid prob -0.0609 -# Final train prob (xent) -0.4471 -# Final valid prob (xent) -0.4653 -# Parameters 3.37M - -# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b -#exp/chain/cnn_e2eali_1b: num-iters=179 nj=8..16 num-params=3.4M dim=40->416 combine=-0.058->-0.058 (over 3) xent:train/valid[118,178,final]=(-0.463,-0.445,-0.447/-0.477,-0.462,-0.465) logprob:train/valid[118,178,final]=(-0.062,-0.059,-0.059/-0.063,-0.061,-0.061) - -set -e -o pipefail - -stage=0 - -nj=30 -train_set=train -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# chain options -train_stage=-10 -xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options -chunk_width=340,300,200,100 -num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 -tdnn_dim=450 -# training options -srand=0 -remove_egs=true -lang_test=lang_test -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 2 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ - --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/lang $e2echain_model_dir $lat_dir - echo "" >$lat_dir/splice_opts - -fi - -if [ $stage -le 3 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. 
- if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ - --alignment-subsampling-factor 1 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 4 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=56 name=input - - conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' mod?els... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 5 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$cmd" \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ - --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=1 \ - --chain.left-tolerance 3 \ - --chain.right-tolerance 3 \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=2 \ - --trainer.frames-per-iter=1000000 \ - --trainer.optimization.num-jobs-initial=3 \ - --trainer.optimization.num-jobs-final=16 \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ - --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 6 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 7 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh deleted file mode 100755 index 1ff5b549180..00000000000 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/run_end2end.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian -# 2018 Ashish Arora -set -e -stage=0 -nj=30 -# download_dir{1,2,3} points to the database path on the JHU grid. 
If you have not -# already downloaded the database you can set it to a local directory -# This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits - -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. -. ./path.sh -. ./utils/parse_options.sh # e.g. this parses the above options - # if supplied. -./local/check_tools.sh - -mkdir -p data/{train,test,dev}/data -mkdir -p data/local/{train,test,dev} -if [ $stage -le 0 ]; then - - if [ -f data/train/text ] && ! $overwrite; then - echo "$0: Not processing, probably script have run from wrong stage" - echo "Exiting with status 1 to avoid data corruption" - exit 1; - fi - echo "$0: Downloading data splits...$(date)" - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 - - for dataset in train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid - local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ - --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset - done - - echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 -fi - -if [ $stage -le 1 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." - image/get_image2num_frames.py data/train - image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in dev test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set - steps/compute_cmvn_stats.sh data/$set || exit 1; - done - echo "$0: Fixing data directory for train dataset $(date)." - image/fix_data_dir.sh data/train - -fi - -if [ $stage -le 2 ]; then - for set in train; do - echo "$(date) stage 2: Performing augmentation, it will double training data" - local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data - steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; - done -fi - -if [ $stage -le 3 ]; then - echo "$0: Preparing BPE..." 
- cut -d' ' -f2- data/train/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - - for set in test train dev; do - cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ - utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ - | sed 's/@@//g' > data/$set/bpe_text - - mv data/$set/text data/$set/text.old - paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text - #rm -f data/$set/bpe_text data/$set/ids - done - - echo "$0:Preparing dictionary and lang..." - local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ - data/local/dict "" data/lang/temp data/lang - utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang -fi - -if [ $stage -le 3 ]; then - echo "$0: Estimating a language model for decoding..." - local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test -fi - -if [ $stage -le 4 ]; then - echo "$0: Calling the flat-start chain recipe... $(date)." - local/chain/run_flatstart_cnn1a.sh --nj $nj -fi - -if [ $stage -le 5 ]; then - echo "$0: Aligning the training data using the e2e chain model...$(date)." - steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train -fi - -if [ $stage -le 6 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali_1b.sh --nj $nj -fi diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh deleted file mode 100755 index 4893dcfea08..00000000000 --- a/egs/madcat_ar/v1/local/tl/not_much_imp/run_flatstart_cnn1a.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian - -# This script does end2end chain training (i.e. from scratch) - -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a -# System e2e_cnn_1a -# WER 10.71 -# CER 2.85 -# Final train prob -0.0859 -# Final valid prob -0.1266 -# Final train prob (xent) -# Final valid prob (xent) -# Parameters 2.94M - -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) - -set -e - - -# configs for 'chain' -stage=0 -nj=30 -train_stage=-10 -get_egs_stage=-10 -affix=1a - -# training options -tdnn_dim=450 -num_epochs=2 -num_jobs_initial=3 -num_jobs_final=16 -minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 -common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=1000000 -cmvn_opts="--norm-means=false --norm-vars=false" -train_set=train -lang_test=lang_test - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! 
cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ - --shared-phones true \ - --type mono \ - data/$train_set $lang $treedir - $cmd $treedir/log/make_phone_lm.log \ - cat data/$train_set/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ - utils/sym2int.pl -f 2- data/lang/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=500 \ - ark:- $treedir/phone_lm.fst -fi - -if [ $stage -le 2 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=56 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 -EOF - - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs -fi - -if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. 
- - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ - --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ - --chain.apply-deriv-weights false \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ - --chain.frame-subsampling-factor 4 \ - --chain.alignment-subsampling-factor 4 \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.shrink-value 1.0 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ - --tree-dir $treedir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/tl/not_much_imp/prepare_data.sh b/egs/madcat_ar/v1/local/tl/prepare_data.sh similarity index 100% rename from egs/madcat_ar/v1/local/tl/not_much_imp/prepare_data.sh rename to egs/madcat_ar/v1/local/tl/prepare_data.sh diff --git a/egs/madcat_ar/v1/local/tl/imp/process_data.py b/egs/madcat_ar/v1/local/tl/process_data.py similarity index 100% rename from egs/madcat_ar/v1/local/tl/imp/process_data.py rename to egs/madcat_ar/v1/local/tl/process_data.py diff --git a/egs/madcat_ar/v1/local/tl/imp/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py similarity index 100% rename from egs/madcat_ar/v1/local/tl/imp/process_waldo_data.py rename to egs/madcat_ar/v1/local/tl/process_waldo_data.py diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 5d27476d3e1..3211e93e120 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -3,13 +3,11 @@ # 2018 Ashish Arora set -e stage=0 -nj=70 +nj=30 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. 
+# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -17,6 +15,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -27,16 +26,18 @@ data_splits_dir=data/download/data_splits mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in test train dev; do + for dataset in train dev; do data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ @@ -44,9 +45,7 @@ if [ $stage -le 1 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ --data data/local/$dataset done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --images_scp_dir data/local \ @@ -54,75 +53,73 @@ if [ $stage -le 2 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in dev test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + image/fix_data_dir.sh data/train + fi -if [ $stage -le 4 ]; then - for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - echo "Date: $(date)." 
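The removed comments above describe what image/get_allowed_lengths.py produces: an allowed_lengths.txt listing target line-image widths spaced roughly 10% apart (the trailing "10" in the call), which make_features.py then uses when padding images. A rough sketch of that idea, not the actual script (the real one derives the minimum and maximum widths from the image2num_frames statistics, and the rounding to a multiple of the frame-subsampling factor of 4 is an assumption here):

    def allowed_lengths(min_len, max_len, spacing=0.10, subsampling=4):
        """Candidate widths from about min_len to max_len, spaced ~10% apart
        and rounded up to a multiple of the frame-subsampling factor."""
        lengths, current = [], float(min_len)
        while current <= max_len:
            rounded = int(-(-current // subsampling)) * subsampling  # ceil to a multiple of 4
            if not lengths or rounded != lengths[-1]:
                lengths.append(rounded)
            current *= 1.0 + spacing
        return lengths

    # allowed_lengths(100, 200) -> [100, 112, 124, 136, 148, 164, 180, 196]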
- local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done - echo "$0: Fixing data directory for train dataset" - echo "Date: $(date)." - utils/fix_data_dir.sh data/train fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." cut -d' ' -f2- data/train/text | local/reverse.py | \ - local/prepend_words.py | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + #rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/local/dict/lexicon.txt data/lang_test fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." +if [ $stage -le 4 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." local/chain/run_flatstart_cnn1a.sh --nj $nj fi -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." +if [ $stage -le 5 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." 
+if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" local/chain/run_cnn_e2eali_1b.sh --nj $nj fi From 92a470da6c151e23c872e0760a99b8f730439132 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 14:12:27 -0400 Subject: [PATCH 22/67] removing unnecessary files --- .../v1/local/chain/run_flatstart_cnn1b.sh | 164 ------------------ egs/madcat_ar/v1/path.sh | 1 - egs/madcat_ar/v1/run_end2end.sh | 1 + egs/wsj/s5/utils/lang/bpe/prepend_words.py | 11 +- egs/wsj/s5/utils/lang/make_lexicon_fst.py | 2 +- 5 files changed, 5 insertions(+), 174 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh deleted file mode 100755 index 901903a9bba..00000000000 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1b.sh +++ /dev/null @@ -1,164 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian - -# This script does end2end chain training (i.e. from scratch) - -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a -# System e2e_cnn_1a -# WER 10.71 -# CER 2.85 -# Final train prob -0.0859 -# Final valid prob -0.1266 -# Final train prob (xent) -# Final valid prob (xent) -# Parameters 2.94M - -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) - -set -e - -# configs for 'chain' -stage=0 -nj=70 -train_stage=-10 -get_egs_stage=-10 -affix=1b - -# training options -tdnn_dim=550 -minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 -common_egs_dir= -cmvn_opts="--norm-means=true --norm-vars=true" -train_set=train -lang_test=lang_test -dropout_schedule='0,0@0.20,0.2@0.50,0' -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! 
cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ - --shared-phones true \ - --type mono \ - data/$train_set $lang $treedir - $cmd $treedir/log/make_phone_lm.log \ - cat data/$train_set/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ - utils/sym2int.pl -f 2- data/lang/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=500 \ - ark:- $treedir/phone_lm.fst -fi - -if [ $stage -le 2 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 - conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 - relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim dropout-proportion=0.0 - relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim dropout-proportion=0.0 - relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim dropout-proportion=0.0 - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 -EOF - - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs -fi - -if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. 
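The deleted 1b setup adds conv-relu-batchnorm-dropout layers and a dropout_schedule='0,0@0.20,0.2@0.50,0' option that is passed to train_e2e.py just below via --trainer.dropout-schedule. Kaldi interprets such a schedule as a piecewise-linear function of training progress; the following small sketch only makes the numbers concrete and is an approximation, not the real parsing code in the training scripts:

    def dropout_at(progress, schedule="0,0@0.20,0.2@0.50,0"):
        """Dropout proportion at a training progress in [0, 1]: 0 until 20% of
        training, rising linearly to 0.2 at 50%, then back to 0 at the end."""
        points = []
        for i, entry in enumerate(schedule.split(',')):
            if '@' in entry:
                value, frac = entry.split('@')
            else:
                value, frac = entry, 0.0 if i == 0 else 1.0
            points.append((float(frac), float(value)))
        points.sort()
        for (x0, y0), (x1, y1) in zip(points, points[1:]):
            if x0 <= progress <= x1:
                return y0 + (y1 - y0) * (progress - x0) / max(x1 - x0, 1e-9)
        return points[-1][1]

    # dropout_at(0.35) -> 0.1, i.e. halfway up the ramp between 20% and 50% of training.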
- - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ - --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ - --chain.apply-deriv-weights false \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ - --chain.frame-subsampling-factor 4 \ - --chain.alignment-subsampling-factor 4 \ - --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter 2000000 \ - --trainer.num-epochs 2 \ - --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial 6 \ - --trainer.optimization.num-jobs-final 16 \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.shrink-value 1.0 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ - --tree-dir $treedir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/path.sh b/egs/madcat_ar/v1/path.sh index 252d4ab04fe..2d17b17a84a 100755 --- a/egs/madcat_ar/v1/path.sh +++ b/egs/madcat_ar/v1/path.sh @@ -3,5 +3,4 @@ export KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh -export CUDA_CACHE_DISABLE=1 export LC_ALL=C diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index e5ca540d3c1..3986ede9d7f 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -107,6 +107,7 @@ fi if [ $stage -le 5 ]; then echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/wsj/s5/utils/lang/bpe/prepend_words.py b/egs/wsj/s5/utils/lang/bpe/prepend_words.py index d497344e850..face771c7ca 100755 --- a/egs/wsj/s5/utils/lang/bpe/prepend_words.py +++ b/egs/wsj/s5/utils/lang/bpe/prepend_words.py @@ -4,16 +4,11 @@ # the beginning of the words for finding the initial-space of every word # after decoding. 
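The header comment above states the whole contract of prepend_words.py: as the loop in the hunk below shows, every word gets a leading '|' so that, after the text has been cut into BPE pieces (and the '@@' continuation markers stripped with sed 's/@@//g' elsewhere in the recipe), the decoder output can be re-segmented into words at the '|' marks. A tiny illustration with a made-up Latin-script line (the real data is Arabic and also goes through reverse.py first):

    line = "he said hello"                           # made-up transcript line
    marked = ' '.join("|" + w for w in line.split())
    print(marked)                                    # -> "|he |said |hello"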
-import argparse import sys, io -parser = argparse.ArgumentParser(description="Prepends '|' to the beginning of every word") -parser.add_argument('--encoding', type=str, default='latin-1', - help='Type of encoding') -args = parser.parse_args() - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding=args.encoding) -output = io.TextIOWrapper(sys.stdout.buffer, encoding=args.encoding) +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='latin-1') for line in infile: output.write(' '.join([ "|"+word for word in line.split()]) + '\n') + diff --git a/egs/wsj/s5/utils/lang/make_lexicon_fst.py b/egs/wsj/s5/utils/lang/make_lexicon_fst.py index 89c50b2f069..67ed0ac2789 100755 --- a/egs/wsj/s5/utils/lang/make_lexicon_fst.py +++ b/egs/wsj/s5/utils/lang/make_lexicon_fst.py @@ -72,7 +72,7 @@ def read_lexiconp(filename): with open(filename, 'r', encoding='latin-1') as f: whitespace = re.compile("[ \t]+") for line in f: - a = whitespace.split(line.rstrip('\n')) + a = whitespace.split(line.strip()) if len(a) < 2: print("{0}: error: found bad line '{1}' in lexicon file {2} ".format( sys.argv[0], line.strip(), filename), file=sys.stderr) From e7b7597301ce59eb1e1ed9833cf0853c12d82c17 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:23:45 -0400 Subject: [PATCH 23/67] adding lm rescoring, cleaning in chain scripts --- .../v1/local/chain/tuning/run_cnn_1a.sh | 11 +- .../local/chain/tuning/run_cnn_chainali_1a.sh | 11 +- .../local/chain/tuning/run_cnn_e2eali_1b.sh | 9 +- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 8 +- egs/madcat_ar/v1/local/extract_features.sh | 6 +- egs/madcat_ar/v1/local/make_features.py | 138 ------------------ egs/madcat_ar/v1/run_end2end.sh | 6 +- 7 files changed, 40 insertions(+), 149 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/make_features.py diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index a3a98ce5ad5..02d095b3a82 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -32,7 +32,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -207,7 +208,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -221,4 +222,10 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh index b652eab034a..5faf6a73691 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -29,7 +29,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. 
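The one-line make_lexicon_fst.py change at the end of the previous patch (line.rstrip('\n') becoming line.strip()) looks cosmetic, but it matters for lexicon files with trailing whitespace or Windows-style line endings, where rstrip('\n') leaves a carriage return glued to the last phone; strip() also drops leading whitespace, which rstrip did not. A made-up lexiconp entry shows the difference:

    import re
    whitespace = re.compile("[ \t]+")
    line = "word\t1.0\tw o r d\r\n"                       # hypothetical entry saved with a CRLF ending
    print(repr(whitespace.split(line.rstrip('\n'))[-1]))  # 'd\r'  (stray carriage return kept)
    print(repr(whitespace.split(line.strip())[-1]))       # 'd'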
echo "$0 $@" # Print the command line for logging @@ -209,7 +210,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -223,4 +224,10 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 55df0cad4b7..b0b77be2a18 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -42,7 +42,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -240,4 +241,10 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 033cb88df10..bf215a0cae2 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -36,7 +36,8 @@ l2_regularize=0.00005 frames_per_iter=2000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -152,7 +153,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -161,6 +162,9 @@ if [ $stage -le 5 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi echo "Done. Date: $(date). Results:" diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 56a8443e328..4ed6ba04348 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -9,6 +9,8 @@ nj=4 cmd=run.pl feat_dim=40 +augment=false +fliplr=false echo "$0 $@" . 
./cmd.sh @@ -34,9 +36,9 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim \| \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py deleted file mode 100755 index a21276d32c2..00000000000 --- a/egs/madcat_ar/v1/local/make_features.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2018 Hossein Hadian - -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - If an 'image2num_frames' file is found in the data dir, it will be used - to enforce the images to have the specified length in that file by padding - white pixels (the --padding option will be ignored in this case). This relates - to end2end chain training. - - eg. local/make_features.py data/train --feat-dim 40 -""" - -import argparse -import os -import sys -import numpy as np -from scipy import misc - -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") -parser.add_argument('images_scp_path', type=str, - help='Path of images.scp file') -parser.add_argument('--allowed_len_file_path', type=str, default=None, - help='If supplied, each images will be padded to reach the ' - 'target length (this overrides --padding).') -parser.add_argument('--out-ark', type=str, default='-', - help='Where to write the output feature file') -parser.add_argument('--feat-dim', type=int, default=40, - help='Size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='Number of white pixels to pad on the left' - 'and right side of the image.') - - -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - -def horizontal_pad(im, allowed_lengths = None): - if allowed_lengths is None: - left_padding = right_padding = args.padding - else: # Find an allowed length for the image - imlen = im.shape[1] # width - allowed_len = 0 - for l in allowed_lengths: - if l > imlen: - allowed_len = l - break - if 
allowed_len == 0: - # No allowed length was found for the image (the image is too long) - return None - padding = allowed_len - imlen - left_padding = int(padding // 2) - right_padding = padding - left_padding - dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) - return im_pad1 - - -### main ### - -data_list_path = args.images_scp_path - -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark,'wb') - -allowed_lengths = None -allowed_len_handle = args.allowed_len_file_path -if os.path.isfile(allowed_len_handle): - print("Found 'allowed_lengths.txt' file...", file=sys.stderr) - allowed_lengths = [] - with open(allowed_len_handle) as f: - for line in f: - allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the " - "features.".format(len(allowed_lengths)), file=sys.stderr) - -num_fail = 0 -num_ok = 0 -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: - num_fail += 1 - continue - data = np.transpose(im_horizontal_padded, (1, 0)) - data = np.divide(data, 255.0) - num_ok += 1 - write_kaldi_matrix(out_fh, data, image_id) - -print('Generated features for {} images. Failed for {} (image too ' - 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 3986ede9d7f..a24b851331b 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -95,8 +95,10 @@ fi if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi if [ $stage -le 4 ]; then From e647607d495eee3115fb1f799be5f4c897cd39d4 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:27:59 -0400 Subject: [PATCH 24/67] minor fix --- .../v1/local/chain/tuning/run_cnn_e2eali_1a.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 38387ce2fcc..38de5fe3b7c 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -29,7 +29,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -213,7 +214,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
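The local/make_features.py removed above is replaced in extract_features.sh by the shared image/ocr/make_features.py; assuming the shared script keeps the same allowed-lengths behaviour, the padding decision it encodes is: pad each line image with white pixels up to the first allowed length longer than it, split evenly between the left and right edges, and drop images longer than every allowed length. A minimal sketch of just that decision:

    def split_padding(width, allowed_lengths):
        """First allowed length longer than the image, with the white padding
        split between the left and right edges; None if the image is too long."""
        target = next((l for l in allowed_lengths if l > width), None)
        if target is None:
            return None
        pad = target - width
        return pad // 2, pad - pad // 2

    # split_padding(130, [100, 112, 124, 136, 148]) -> (3, 3)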
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -227,4 +228,10 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir From a0d2b6879d1ea061ac801bb8291acdf3104018af Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:31:43 -0400 Subject: [PATCH 25/67] removing prepend words --- egs/madcat_ar/v1/local/prepend_words.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/prepend_words.py diff --git a/egs/madcat_ar/v1/local/prepend_words.py b/egs/madcat_ar/v1/local/prepend_words.py deleted file mode 100755 index d53eb8974bf..00000000000 --- a/egs/madcat_ar/v1/local/prepend_words.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# This script, prepend '|' to every words in the transcript to mark -# the beginning of the words for finding the initial-space of every word -# after decoding. - -import sys, io - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -for line in infile: - output.write(' '.join(["|" + word for word in line.split()]) + '\n') From 53edde450b1803a60d0cb7f28b4448bd52f0abff Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:33:14 -0400 Subject: [PATCH 26/67] minor bug fix --- egs/madcat_ar/v1/run_end2end.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index a24b851331b..859cced6c17 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -70,13 +70,13 @@ fi if [ $stage -le 2 ]; then echo "$0: Preparing BPE..." cut -d' ' -f2- data/train/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text From e9ae85369f7723e2379cc6cea4c568ffa6a41c6a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:51:43 -0400 Subject: [PATCH 27/67] fixing run.sh --- egs/madcat_ar/v1/run.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index f6a63320497..2e2d0d7af90 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -83,8 +83,10 @@ fi if [ $stage -le 5 ]; then echo "$0: Estimating a language model for decoding..." 
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi if [ $stage -le 6 ]; then From 8d0c7930a20eba30bc654d16a00e09ec966a7f04 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:57:38 -0400 Subject: [PATCH 28/67] removing prepare data --- egs/madcat_ar/v1/local/prepare_data.sh | 53 -------------------------- egs/madcat_ar/v1/run_end2end.sh | 12 ++++-- 2 files changed, 8 insertions(+), 57 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/prepare_data.sh diff --git a/egs/madcat_ar/v1/local/prepare_data.sh b/egs/madcat_ar/v1/local/prepare_data.sh deleted file mode 100755 index d808d736845..00000000000 --- a/egs/madcat_ar/v1/local/prepare_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2017 Hossein Hadian -# Apache 2.0 - -# This script prepares the training and test data for MADCAT Arabic dataset -# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. - -# Eg. local/prepare_data.sh -# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ -# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 -# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 -# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif - -stage=0 -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits -images_scp_dir=data/local - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh || exit 1; - -mkdir -p data/{train,test,dev} - -if [ $stage -le 1 ]; then - echo "$0: Processing dev, train and test data..." - echo "Date: $(date)." - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.test.raw.lineid data/test $images_scp_dir/test/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - for dataset in dev test train; do - echo "$0: Fixing data directory for dataset: $dataset" - echo "Date: $(date)." - image/fix_data_dir.sh data/$dataset - done -fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 859cced6c17..48832e3159b 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -47,10 +47,14 @@ if [ $stage -le 0 ]; then done echo "$0: Preparing data..." 
- local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done + fi if [ $stage -le 1 ]; then From ee582d50d6c46f4bd9293f1bf4243c65f7c54542 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 15:59:35 -0400 Subject: [PATCH 29/67] fixing run.sh --- egs/madcat_ar/v1/run.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index 2e2d0d7af90..7922bf30ed6 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -57,10 +57,13 @@ fi if [ $stage -le 2 ]; then echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done fi mkdir -p data/{train,test,dev}/data From a16a11d8145d323254a00fd088187994eca96d0f Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 16:02:30 -0400 Subject: [PATCH 30/67] removing reverse.py --- egs/madcat_ar/v1/run_end2end.sh | 4 ++-- egs/{madcat_ar/v1/local => wsj/s5/utils/lang/bpe}/reverse.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename egs/{madcat_ar/v1/local => wsj/s5/utils/lang/bpe}/reverse.py (100%) diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 48832e3159b..6ab6e8ff32d 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -73,13 +73,13 @@ fi if [ $stage -le 2 ]; then echo "$0: Preparing BPE..." 
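One thing to double-check in the per-set loops added to run_end2end.sh and run.sh above (and to run_textlocalization.sh further down): the bare line "data/local/splits/${set}.txt data/${set}" sits on its own after the "|| exit 1" that terminates the process_data.py command, so the shell will try to execute that path as a command, which will almost certainly fail and, with "set -e", abort the run. It appears to be a leftover fragment of an earlier form of the call and can simply be dropped, leaving the process_data.py invocation followed directly by image/fix_data_dir.sh.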
- cut -d' ' -f2- data/train/text | local/reverse.py | \ + cut -d' ' -f2- data/train/text | utilis/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text diff --git a/egs/madcat_ar/v1/local/reverse.py b/egs/wsj/s5/utils/lang/bpe/reverse.py similarity index 100% rename from egs/madcat_ar/v1/local/reverse.py rename to egs/wsj/s5/utils/lang/bpe/reverse.py From fb0b8a25363b443d3687d8446bc8a7c1565e4db4 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 19:09:45 -0400 Subject: [PATCH 31/67] removing prepare data --- egs/madcat_ar/v1/local/tl/prepare_data.sh | 49 ------------------- .../v1/local/tl/run_textlocalization.sh | 31 +++++++----- 2 files changed, 20 insertions(+), 60 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/tl/prepare_data.sh diff --git a/egs/madcat_ar/v1/local/tl/prepare_data.sh b/egs/madcat_ar/v1/local/tl/prepare_data.sh deleted file mode 100755 index 5fe41e7cf4c..00000000000 --- a/egs/madcat_ar/v1/local/tl/prepare_data.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2017 Hossein Hadian -# Apache 2.0 - -# This script prepares the training and test data for MADCAT Arabic dataset -# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. - -# Eg. local/prepare_data.sh -# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ -# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 -# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 -# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif - -stage=0 -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits -images_scp_dir=data/local - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh || exit 1; - -mkdir -p data/{train,test,dev} - -if [ $stage -le 1 ]; then - echo "$0: Processing dev, train and test data...$(date)" - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 - - local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test - - for dataset in dev test train; do - echo "$0: Fixing data directory for dataset: $dataset." 
- image/fix_data_dir.sh data/$dataset - done -fi diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 3211e93e120..cd5c96e368e 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -47,10 +47,16 @@ if [ $stage -le 0 ]; then done echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + for set in dev train; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done + + local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + image/fix_data_dir.sh data/test fi if [ $stage -le 1 ]; then @@ -77,20 +83,20 @@ fi if [ $stage -le 3 ]; then echo "$0: Preparing BPE..." - cut -d' ' -f2- data/train/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + cut -d' ' -f2- data/train/text | utilis/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text - #rm -f data/$set/bpe_text data/$set/ids + rm -f data/$set/bpe_text data/$set/ids done echo "$0:Preparing dictionary and lang..." @@ -103,8 +109,10 @@ fi if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi if [ $stage -le 4 ]; then @@ -115,6 +123,7 @@ fi if [ $stage -le 5 ]; then echo "$0: Aligning the training data using the e2e chain model...$(date)." 
steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi From 7835ed4926f7fa56345487f5b25b56fb22ba7a45 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 19:16:30 -0400 Subject: [PATCH 32/67] adding augmentation during line image creation, removing unnecessary files --- .../create_line_image_from_page_image.py | 180 +++--- egs/madcat_ar/v1/local/process_data.py | 40 +- .../tl/create_line_image_from_page_image.py | 567 ------------------ egs/madcat_ar/v1/local/tl/process_data.py | 215 ------- 4 files changed, 139 insertions(+), 863 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py delete mode 100755 egs/madcat_ar/v1/local/tl/process_data.py diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index 34e339f1877..bb126c39538 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -21,22 +21,10 @@ import numpy as np from math import atan2, cos, sin, pi, degrees, sqrt from collections import namedtuple - +import random from scipy.spatial import ConvexHull from PIL import Image from scipy.misc import toimage -import logging - -sys.path.insert(0, 'steps') -logger = logging.getLogger('libs') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - parser = argparse.ArgumentParser(description="Creates line images from page image", epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " @@ -60,8 +48,12 @@ help='Path to the downloaded (and extracted) writing conditions file 3') parser.add_argument('--padding', type=int, default=400, help='padding across horizontal/verticle direction') +parser.add_argument('--pixel-scaling', type=int, default=30, + help='padding across horizontal/verticle direction') parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, help="only processes subset of data based on writing condition") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() """ @@ -196,21 +188,6 @@ def rectangle_corners(rectangle): return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) -def get_orientation(origin, p1, p2): - """ - Given origin and two points, return the orientation of the Point p1 with - regards to Point p2 using origin. - Returns - ------- - integer: Negative if p1 is clockwise of p2. - """ - difference = ( - ((p2[0] - origin[0]) * (p1[1] - origin[1])) - - ((p1[0] - origin[0]) * (p2[1] - origin[1])) - ) - return difference - - def minimum_bounding_box(points): """ Given a list of 2D points, it returns the minimum area rectangle bounding all the points in the point cloud. @@ -357,6 +334,36 @@ def update_minimum_bounding_box_input(bounding_box_input): return updated_minimum_bounding_box_input +def dilate_polygon(points, amount_increase): + """ Increases size of polygon given as a list of tuples. 
+ Assumes points in polygon are given in CCW + """ + expanded_points = [] + for index, point in enumerate(points): + prev_point = points[(index - 1) % len(points)] + next_point = points[(index + 1) % len(points)] + prev_edge = np.subtract(point, prev_point) + next_edge = np.subtract(next_point, point) + + prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) + prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) + next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) + next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) + + bisect = np.add(prev_normal, next_normal) + bisect = np.divide(bisect, np.linalg.norm(bisect)) + + cos_theta = np.dot(next_normal, bisect) + hyp = amount_increase / cos_theta + + new_point = np.around(point + hyp * bisect) + new_point = new_point.astype(int) + new_point = new_point.tolist() + new_point = tuple(new_point) + expanded_points.append(new_point) + return expanded_points + + def set_line_image_data(image, line_id, image_file_name, image_fh): """ Given an image, saves a flipped line image. Line image file name is formed by appending the line id at the end page image name. @@ -395,50 +402,83 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) minimum_bounding_box_input.append(word_coordinate) updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) - bounding_box = minimum_bounding_box(updated_mbb_input) - - p1, p2, p3, p4 = bounding_box.corner_points - x1, y1 = p1 - x2, y2 = p2 - x3, y3 = p3 - x4, y4 = p4 - min_x = int(min(x1, x2, x3, x4)) - min_y = int(min(y1, y2, y3, y4)) - max_x = int(max(x1, x2, x3, x4)) - max_y = int(max(y1, y2, y3, y4)) - box = (min_x, min_y, max_x, max_y) - region_initial = im.crop(box) - rot_points = [] - p1_new = (x1 - min_x, y1 - min_y) - p2_new = (x2 - min_x, y2 - min_y) - p3_new = (x3 - min_x, y3 - min_y) - p4_new = (x4 - min_x, y4 - min_y) - rot_points.append(p1_new) - rot_points.append(p2_new) - rot_points.append(p3_new) - rot_points.append(p4_new) - - cropped_bounding_box = bounding_box_tuple(bounding_box.area, - bounding_box.length_parallel, - bounding_box.length_orthogonal, - bounding_box.length_orthogonal, - bounding_box.unit_vector, - bounding_box.unit_vector_angle, - set(rot_points) - ) - - rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) - img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) - x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] + if args.augment: + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + 
bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + else: + bounding_box = minimum_bounding_box(points_ordered) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( cropped_bounding_box, get_center(region_initial)) - min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - box = (min_x, min_y, max_x, max_y) - region_final = img2.crop(box) - set_line_image_data(region_final, id, image_file_name, image_fh) + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name, image_fh) def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index 920cb6f700b..c21beb1be70 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -42,6 +42,8 @@ help='Path to the downloaded (and extracted) writing conditions file 2') parser.add_argument('writing_condition3', type=str, help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, help="only processes subset of data based on writing 
condition") args = parser.parse_args() @@ -184,14 +186,30 @@ def get_line_image_location(): writer_id = writer[0].getAttribute('id') text_line_word_dict = read_text(madcat_xml_path) base_name = os.path.basename(image_file_path).split('.tif')[0] - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 + for line_id in sorted(text_line_word_dict): + if args.augment: + key = (line_id + '.')[:-1] + for i in range(0, 3): + location_id = '_' + line_id + '_scale' + str(i) + line_image_file_name = base_name + location_id + '.png' + location = image_loc_dict[line_image_file_name] + image_file_path = os.path.join(location, line_image_file_name) + line = text_line_word_dict[key] + text = ' '.join(line) + base_line_image_file_name = line_image_file_name.split('.png')[0] + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_line_image_file_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 + else: + updated_base_name = base_name + '_' + str(line_id).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[line_id] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(line_id).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py deleted file mode 100755 index bb126c39538..00000000000 --- a/egs/madcat_ar/v1/local/tl/create_line_image_from_page_image.py +++ /dev/null @@ -1,567 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2018 Ashish Arora -# Apache 2.0 -# minimum bounding box part in this script is originally from -#https://github.com/BebeSparkelSparkel/MinimumBoundingBox -#https://startupnextdoor.com/computing-convex-hull-in-python/ -""" This module will be used for extracting line images from page image. - Given the word segmentation (bounding box around a word) for every word, it will - extract line segmentation. To extract line segmentation, it will take word bounding - boxes of a line as input, will create a minimum area bounding box that will contain - all corner points of word bounding boxes. The obtained bounding box (will not necessarily - be vertically or horizontally aligned). Hence to extract line image from line bounding box, - page image is rotated and line image is cropped and saved. 
-""" - -import sys -import argparse -import os -import xml.dom.minidom as minidom -import numpy as np -from math import atan2, cos, sin, pi, degrees, sqrt -from collections import namedtuple -import random -from scipy.spatial import ConvexHull -from PIL import Image -from scipy.misc import toimage -parser = argparse.ArgumentParser(description="Creates line images from page image", - epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" - " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " - " data/local/lines ", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('database_path1', type=str, - help='Path to the downloaded madcat data directory 1') -parser.add_argument('database_path2', type=str, - help='Path to the downloaded madcat data directory 2') -parser.add_argument('database_path3', type=str, - help='Path to the downloaded madcat data directory 3') -parser.add_argument('data_splits', type=str, - help='Path to file that contains the train/test/dev split information') -parser.add_argument('out_dir', type=str, - help='directory location to write output files') -parser.add_argument('writing_condition1', type=str, - help='Path to the downloaded (and extracted) writing conditions file 1') -parser.add_argument('writing_condition2', type=str, - help='Path to the downloaded (and extracted) writing conditions file 2') -parser.add_argument('writing_condition3', type=str, - help='Path to the downloaded (and extracted) writing conditions file 3') -parser.add_argument('--padding', type=int, default=400, - help='padding across horizontal/verticle direction') -parser.add_argument('--pixel-scaling', type=int, default=30, - help='padding across horizontal/verticle direction') -parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, - help="only processes subset of data based on writing condition") -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") -args = parser.parse_args() - -""" -bounding_box is a named tuple which contains: - area (float): area of the rectangle - length_parallel (float): length of the side that is parallel to unit_vector - length_orthogonal (float): length of the side that is orthogonal to unit_vector - rectangle_center(int, int): coordinates of the rectangle center - (use rectangle_corners to get the corner points of the rectangle) - unit_vector (float, float): direction of the length_parallel side. - (it's orthogonal vector can be found with the orthogonal_vector function - unit_vector_angle (float): angle of the unit vector to be in radians. - corner_points [(float, float)]: set that contains the corners of the rectangle -""" - -bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' - 'length_parallel ' - 'length_orthogonal ' - 'rectangle_center ' - 'unit_vector ' - 'unit_vector_angle ' - 'corner_points' - ) - - -def unit_vector(pt0, pt1): - """ Given two points pt0 and pt1, return a unit vector that - points in the direction of pt0 to pt1. - Returns - ------- - (float, float): unit vector - """ - dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) - return (pt1[0] - pt0[0]) / dis_0_to_1, \ - (pt1[1] - pt0[1]) / dis_0_to_1 - - -def orthogonal_vector(vector): - """ Given a vector, returns a orthogonal/perpendicular vector of equal length. - Returns - ------ - (float, float): A vector that points in the direction orthogonal to vector. 
- """ - return -1 * vector[1], vector[0] - - -def bounding_area(index, hull): - """ Given index location in an array and convex hull, it gets two points - hull[index] and hull[index+1]. From these two points, it returns a named - tuple that mainly contains area of the box that bounds the hull. This - bounding box orintation is same as the orientation of the lines formed - by the point hull[index] and hull[index+1]. - Returns - ------- - a named tuple that contains: - area: area of the rectangle - length_parallel: length of the side that is parallel to unit_vector - length_orthogonal: length of the side that is orthogonal to unit_vector - rectangle_center: coordinates of the rectangle center - unit_vector: direction of the length_parallel side. - (it's orthogonal vector can be found with the orthogonal_vector function) - """ - unit_vector_p = unit_vector(hull[index], hull[index+1]) - unit_vector_o = orthogonal_vector(unit_vector_p) - - dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) - dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) - - min_p = min(dis_p) - min_o = min(dis_o) - len_p = max(dis_p) - min_p - len_o = max(dis_o) - min_o - - return {'area': len_p * len_o, - 'length_parallel': len_p, - 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), - 'unit_vector': unit_vector_p, - } - - -def to_xy_coordinates(unit_vector_angle, point): - """ Given angle from horizontal axis and a point from origin, - returns converted unit vector coordinates in x, y coordinates. - angle of unit vector should be in radians. - Returns - ------ - (float, float): converted x,y coordinate of the unit vector. - """ - angle_orthogonal = unit_vector_angle + pi / 2 - return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ - point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) - - -def rotate_points(center_of_rotation, angle, points): - """ Rotates a point cloud around the center_of_rotation point by angle - input - ----- - center_of_rotation (float, float): angle of unit vector to be in radians. - angle (float): angle of rotation to be in radians. - points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. - Returns - ------ - [(float, float)]: Rotated points around center of rotation by angle - """ - rot_points = [] - ang = [] - for pt in points: - diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) - diff_angle = atan2(diff[1], diff[0]) + angle - ang.append(diff_angle) - diff_length = sqrt(sum([d**2 for d in diff])) - rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), - center_of_rotation[1] + diff_length * sin(diff_angle))) - - return rot_points - - -def rectangle_corners(rectangle): - """ Given rectangle center and its inclination, returns the corner - locations of the rectangle. - Returns - ------ - [(float, float)]: 4 corner points of rectangle. - """ - corner_points = [] - for i1 in (.5, -.5): - for i2 in (i1, -1 * i1): - corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], - rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) - - return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) - - -def minimum_bounding_box(points): - """ Given a list of 2D points, it returns the minimum area rectangle bounding all - the points in the point cloud. 
- Returns - ------ - returns a namedtuple that contains: - area: area of the rectangle - length_parallel: length of the side that is parallel to unit_vector - length_orthogonal: length of the side that is orthogonal to unit_vector - rectangle_center: coordinates of the rectangle center - unit_vector: direction of the length_parallel side. RADIANS - unit_vector_angle: angle of the unit vector - corner_points: set that contains the corners of the rectangle - """ - - if len(points) <= 2: raise ValueError('More than two points required.') - - hull_ordered = [points[index] for index in ConvexHull(points).vertices] - hull_ordered.append(hull_ordered[0]) - hull_ordered = tuple(hull_ordered) - - min_rectangle = bounding_area(0, hull_ordered) - for i in range(1, len(hull_ordered)-1): - rectangle = bounding_area(i, hull_ordered) - if rectangle['area'] < min_rectangle['area']: - min_rectangle = rectangle - - min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) - min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) - - return bounding_box_tuple( - area = min_rectangle['area'], - length_parallel = min_rectangle['length_parallel'], - length_orthogonal = min_rectangle['length_orthogonal'], - rectangle_center = min_rectangle['rectangle_center'], - unit_vector = min_rectangle['unit_vector'], - unit_vector_angle = min_rectangle['unit_vector_angle'], - corner_points = set(rectangle_corners(min_rectangle)) - ) - - -def get_center(im): - """ Given image, returns the location of center pixel - Returns - ------- - (int, int): center of the image - """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 - return int(center_x), int(center_y) - - -def get_horizontal_angle(unit_vector_angle): - """ Given an angle in radians, returns angle of the unit vector in - first or fourth quadrant. - Returns - ------ - (float): updated angle of the unit vector to be in radians. - It is only in first or fourth quadrant. - """ - if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: - unit_vector_angle = unit_vector_angle - pi - elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: - unit_vector_angle = unit_vector_angle + pi - - return unit_vector_angle - - -def get_smaller_angle(bounding_box): - """ Given a rectangle, returns its smallest absolute angle from horizontal axis. - Returns - ------ - (float): smallest angle of the rectangle to be in radians. - """ - unit_vector = bounding_box.unit_vector - unit_vector_angle = bounding_box.unit_vector_angle - ortho_vector = orthogonal_vector(unit_vector) - ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) - - unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) - ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) - - if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): - return unit_vector_angle_updated - else: - return ortho_vector_angle_updated - - -def rotated_points(bounding_box, center): - """ Given the rectangle, returns corner points of rotated rectangle. - It rotates the rectangle around the center by its smallest angle. - Returns - ------- - [(int, int)]: 4 corner points of rectangle. 
- """ - p1, p2, p3, p4 = bounding_box.corner_points - x1, y1 = p1 - x2, y2 = p2 - x3, y3 = p3 - x4, y4 = p4 - center_x, center_y = center - rotation_angle_in_rad = -get_smaller_angle(bounding_box) - x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x - x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x - x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x - x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x - - y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y - y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y - y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y - y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y - return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 - - -def pad_image(image): - """ Given an image, returns a padded image around the border. - This routine save the code from crashing if bounding boxes that are - slightly outside the page boundary. - Returns - ------- - image: page image - """ - offset = int(args.padding // 2) - padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") - padded_image.paste(im = image, box = (offset, offset)) - return padded_image - - -def update_minimum_bounding_box_input(bounding_box_input): - """ Given list of 2D points, returns list of 2D points shifted by an offset. - Returns - ------ - points [(float, float)]: points, a list or tuple of 2D coordinates - """ - updated_minimum_bounding_box_input = [] - offset = int(args.padding // 2) - for point in bounding_box_input: - x, y = point - new_x = x + offset - new_y = y + offset - word_coordinate = (new_x, new_y) - updated_minimum_bounding_box_input.append(word_coordinate) - - return updated_minimum_bounding_box_input - - -def dilate_polygon(points, amount_increase): - """ Increases size of polygon given as a list of tuples. - Assumes points in polygon are given in CCW - """ - expanded_points = [] - for index, point in enumerate(points): - prev_point = points[(index - 1) % len(points)] - next_point = points[(index + 1) % len(points)] - prev_edge = np.subtract(point, prev_point) - next_edge = np.subtract(next_point, point) - - prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) - prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) - next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) - next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) - - bisect = np.add(prev_normal, next_normal) - bisect = np.divide(bisect, np.linalg.norm(bisect)) - - cos_theta = np.dot(next_normal, bisect) - hyp = amount_increase / cos_theta - - new_point = np.around(point + hyp * bisect) - new_point = new_point.astype(int) - new_point = new_point.tolist() - new_point = tuple(new_point) - expanded_points.append(new_point) - return expanded_points - - -def set_line_image_data(image, line_id, image_file_name, image_fh): - """ Given an image, saves a flipped line image. Line image file name - is formed by appending the line id at the end page image name. 
- """ - - base_name = os.path.splitext(os.path.basename(image_file_name))[0] - line_id = '_' + line_id.zfill(4) - line_image_file_name = base_name + line_id + '.png' - image_path = os.path.join(args.out_dir, line_image_file_name) - imgray = image.convert('L') - imgray_rev_arr = np.fliplr(imgray) - imgray_rev = toimage(imgray_rev_arr) - imgray_rev.save(image_path) - image_fh.write(image_path + '\n') - - -def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh): - """ Given a page image, extracts the line images from it. - Input - ----- - image_file_name (string): complete path and name of the page image. - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - """ - im_wo_pad = Image.open(image_file_name) - im = pad_image(im_wo_pad) - doc = minidom.parse(madcat_file_path) - zone = doc.getElementsByTagName('zone') - for node in zone: - id = node.getAttribute('id') - token_image = node.getElementsByTagName('token-image') - minimum_bounding_box_input = [] - for token_node in token_image: - word_point = token_node.getElementsByTagName('point') - for word_node in word_point: - word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) - minimum_bounding_box_input.append(word_coordinate) - updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) - points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] - if args.augment: - for i in range(0, 3): - additional_pixel = random.randint(1, args.pixel_scaling) - mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) - bounding_box = minimum_bounding_box(mar) - (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points - min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) - max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) - box = (min_x, min_y, max_x, max_y) - region_initial = im.crop(box) - rot_points = [] - p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) - p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) - rot_points.append(p1) - rot_points.append(p2) - rot_points.append(p3) - rot_points.append(p4) - - cropped_bounding_box = bounding_box_tuple(bounding_box.area, - bounding_box.length_parallel, - bounding_box.length_orthogonal, - bounding_box.length_orthogonal, - bounding_box.unit_vector, - bounding_box.unit_vector_angle, - set(rot_points) - ) - - rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) - img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) - x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( - cropped_bounding_box, get_center(region_initial)) - - min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - box = (min_x, min_y, max_x, max_y) - region_final = img2.crop(box) - line_id = id + '_scale' + str(i) - set_line_image_data(region_final, line_id, image_file_name, image_fh) - else: - bounding_box = minimum_bounding_box(points_ordered) - (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points - min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) - max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) - box = (min_x, min_y, max_x, max_y) - region_initial = im.crop(box) - 
rot_points = [] - p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) - p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) - rot_points.append(p1) - rot_points.append(p2) - rot_points.append(p3) - rot_points.append(p4) - - cropped_bounding_box = bounding_box_tuple(bounding_box.area, - bounding_box.length_parallel, - bounding_box.length_orthogonal, - bounding_box.length_orthogonal, - bounding_box.unit_vector, - bounding_box.unit_vector_angle, - set(rot_points) - ) - - rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) - img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) - x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( - cropped_bounding_box, get_center(region_initial)) - - min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - box = (min_x, min_y, max_x, max_y) - region_final = img2.crop(box) - set_line_image_data(region_final, id, image_file_name, image_fh) - - -def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): - """ Returns the complete path of the page image and corresponding - xml file. - Returns - ------- - image_file_name (string): complete path and name of the page image. - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - """ - madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') - madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') - madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') - - image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') - image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') - image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') - - if os.path.exists(madcat_file_path1): - return madcat_file_path1, image_file_path1, wc_dict1 - - if os.path.exists(madcat_file_path2): - return madcat_file_path2, image_file_path2, wc_dict2 - - if os.path.exists(madcat_file_path3): - return madcat_file_path3, image_file_path3, wc_dict3 - - return None, None, None - - -def parse_writing_conditions(writing_conditions): - """ Given writing condition file path, returns a dictionary which have writing condition - of each page image. - Returns - ------ - (dict): dictionary with key as page image name and value as writing condition. - """ - with open(writing_conditions) as f: - file_writing_cond = dict() - for line in f: - line_list = line.strip().split("\t") - file_writing_cond[line_list[0]] = line_list[3] - return file_writing_cond - - -def check_writing_condition(wc_dict, base_name): - """ Given writing condition dictionary, checks if a page image is writing - in a specifed writing condition. - It is used to create subset of dataset based on writing condition. - Returns - (bool): True if writing condition matches. 
- """ - if args.subset: - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - else: - return True - -### main ### -def main(): - - wc_dict1 = parse_writing_conditions(args.writing_condition1) - wc_dict2 = parse_writing_conditions(args.writing_condition2) - wc_dict3 = parse_writing_conditions(args.writing_condition3) - output_directory = args.out_dir - image_file = os.path.join(output_directory, 'images.scp') - image_fh = open(image_file, 'w', encoding='utf-8') - - splits_handle = open(args.data_splits, 'r') - splits_data = splits_handle.read().strip().split('\n') - prev_base_name = '' - for line in splits_data: - base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] - if prev_base_name != base_name: - prev_base_name = base_name - madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) - if wc_dict is None or not check_writing_condition(wc_dict, base_name): - continue - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) - - -if __name__ == '__main__': - main() - diff --git a/egs/madcat_ar/v1/local/tl/process_data.py b/egs/madcat_ar/v1/local/tl/process_data.py deleted file mode 100755 index c21beb1be70..00000000000 --- a/egs/madcat_ar/v1/local/tl/process_data.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2018 Ashish Arora - -""" This script reads MADCAT files and creates the following files (for the - data subset selected via --dataset) :text, utt2spk, images.scp. - Eg. local/process_data.py data/local /export/corpora/LDC/LDC2012T15 /export/corpora/LDC/LDC2013T09 - /export/corpora/LDC/LDC2013T15 data/download/data_splits/madcat.train.raw.lineid - data/dev data/local/lines/images.scp - Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 وجه وعقل غارق حتّى النخاع - utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 - images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 - data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif -""" - -import argparse -import os -import sys -import xml.dom.minidom as minidom -import unicodedata - -parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", - epilog="E.g. 
" + sys.argv[0] + " data/LDC2012T15" - " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " - " data/train data/local/lines ", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('database_path1', type=str, - help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path2', type=str, - help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path3', type=str, - help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('data_splits', type=str, - help='Path to file that contains the train/test/dev split information') -parser.add_argument('out_dir', type=str, - help='directory location to write output files.') -parser.add_argument('images_scp_path', type=str, - help='Path of input images.scp file(maps line image and location)') -parser.add_argument('writing_condition1', type=str, - help='Path to the downloaded (and extracted) writing conditions file 1') -parser.add_argument('writing_condition2', type=str, - help='Path to the downloaded (and extracted) writing conditions file 2') -parser.add_argument('writing_condition3', type=str, - help='Path to the downloaded (and extracted) writing conditions file 3') -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") -parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, - help="only processes subset of data based on writing condition") -args = parser.parse_args() - - -def check_file_location(): - """ Returns the complete path of the page image and corresponding - xml file. - Args: - Returns: - image_file_name (string): complete path and name of the page image. - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - """ - madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') - madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') - madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') - - image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') - image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') - image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') - - if os.path.exists(madcat_file_path1): - return madcat_file_path1, image_file_path1, wc_dict1 - - if os.path.exists(madcat_file_path2): - return madcat_file_path2, image_file_path2, wc_dict2 - - if os.path.exists(madcat_file_path3): - return madcat_file_path3, image_file_path3, wc_dict3 - - return None, None, None - - -def parse_writing_conditions(writing_conditions): - """ Returns a dictionary which have writing condition of each page image. - Args: - writing_conditions(string): complete path of writing condition file. - Returns: - (dict): dictionary with key as page image name and value as writing condition. - """ - with open(writing_conditions) as f: - file_writing_cond = dict() - for line in f: - line_list = line.strip().split("\t") - file_writing_cond[line_list[0]] = line_list[3] - return file_writing_cond - - -def check_writing_condition(wc_dict): - """ Checks if a given page image is writing in a given writing condition. - It is used to create subset of dataset based on writing condition. - Args: - wc_dict (dict): dictionary with key as page image name and value as writing condition. 
- Returns: - (bool): True if writing condition matches. - """ - if args.subset: - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - else: - return True - - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. - """ - - word_line_dict = dict() - doc = minidom.parse(madcat_file_path) - zone = doc.getElementsByTagName('zone') - for node in zone: - line_id = node.getAttribute('id') - word_image = node.getElementsByTagName('token-image') - for tnode in word_image: - word_id = tnode.getAttribute('id') - word_line_dict[word_id] = line_id - - text_line_word_dict = dict() - segment = doc.getElementsByTagName('segment') - for node in segment: - token = node.getElementsByTagName('token') - for tnode in token: - ref_word_id = tnode.getAttribute('ref_id') - word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - ref_line_id = word_line_dict[ref_word_id] - if ref_line_id not in text_line_word_dict: - text_line_word_dict[ref_line_id] = list() - text_line_word_dict[ref_line_id].append(word) - return text_line_word_dict - - -def get_line_image_location(): - image_loc_dict = dict() # Stores image base name and location - image_loc_vect = input_image_fh.read().strip().split("\n") - for line in image_loc_vect: - base_name = os.path.basename(line) - location_vect = line.split('/') - location = "/".join(location_vect[:-1]) - image_loc_dict[base_name]=location - return image_loc_dict - - -### main ### -print("Processing '{}' data...".format(args.out_dir)) - -text_file = os.path.join(args.out_dir, 'text') -text_fh = open(text_file, 'w', encoding='utf-8') -utt2spk_file = os.path.join(args.out_dir, 'utt2spk') -utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') -image_file = os.path.join(args.out_dir, 'images.scp') -image_fh = open(image_file, 'w', encoding='utf-8') - -input_image_file = args.images_scp_path -input_image_fh = open(input_image_file, 'r', encoding='utf-8') - -wc_dict1 = parse_writing_conditions(args.writing_condition1) -wc_dict2 = parse_writing_conditions(args.writing_condition2) -wc_dict3 = parse_writing_conditions(args.writing_condition3) -image_loc_dict = get_line_image_location() - -image_num = 0 -with open(args.data_splits) as f: - prev_base_name = '' - for line in f: - base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] - if prev_base_name != base_name: - prev_base_name = base_name - madcat_xml_path, image_file_path, wc_dict = check_file_location() - if wc_dict is None or not check_writing_condition(wc_dict): - continue - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path).split('.tif')[0] - for line_id in sorted(text_line_word_dict): - if args.augment: - key = (line_id + '.')[:-1] - for i in range(0, 3): - location_id = '_' + line_id + '_scale' + str(i) - line_image_file_name = base_name + location_id + '.png' - location = image_loc_dict[line_image_file_name] - image_file_path = os.path.join(location, line_image_file_name) - line = text_line_word_dict[key] - text = ' '.join(line) - base_line_image_file_name = line_image_file_name.split('.png')[0] - utt_id = writer_id + '_' + 
str(image_num).zfill(6) + '_' + base_line_image_file_name - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 - else: - updated_base_name = base_name + '_' + str(line_id).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[line_id] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(line_id).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 From 0234a1aabbb4242dc8dfc9364c8fe0e8dea2aa68 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 19:34:54 -0400 Subject: [PATCH 33/67] adding chain recepi --- .../v1/local/tl/chain/run_cnn_e2eali.sh | 244 ++++++++++++++++++ .../v1/local/tl/chain/run_e2e_cnn.sh | 172 ++++++++++++ .../v1/local/tl/run_textlocalization.sh | 4 +- 3 files changed, 418 insertions(+), 2 deletions(-) create mode 100755 egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh create mode 100755 egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh new file mode 100755 index 00000000000..3f2a0dd6e37 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1b +# WER 10.78 +# CER 2.99 +# Final train prob -0.0587 +# Final valid prob -0.0609 +# Final train prob (xent) -0.4471 +# Final valid prob (xent) -0.4653 +# Parameters 3.37M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +#exp/chain/cnn_e2eali_1b: num-iters=179 nj=8..16 num-params=3.4M dim=40->416 combine=-0.058->-0.058 (over 3) xent:train/valid[118,178,final]=(-0.463,-0.445,-0.447/-0.477,-0.462,-0.465) logprob:train/valid[118,178,final]=(-0.062,-0.059,-0.059/-0.063,-0.061,-0.061) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..d43c1f1a0f3 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.71 +# CER 2.85 +# Final train prob -0.0859 +# Final valid prob -0.1266 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 2.94M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) + +set -e + + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=2 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
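+  # train_e2e.py performs flat-start LF-MMI training: the phone LM estimated
+  # in stage 1 ($treedir/phone_lm.fst) supplies the denominator graph, so no
+  # GMM/HMM alignments are needed, and with frame-subsampling-factor 4 the
+  # network emits outputs at a quarter of the input feature frame rate.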
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index cd5c96e368e..7a49e2ebb76 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -117,7 +117,7 @@ fi if [ $stage -le 4 ]; then echo "$0: Calling the flat-start chain recipe... $(date)." - local/chain/run_flatstart_cnn1a.sh --nj $nj + local/chain/run_e2e_cnn.sh --nj $nj fi if [ $stage -le 5 ]; then @@ -130,5 +130,5 @@ fi if [ $stage -le 6 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali_1b.sh --nj $nj + local/chain/run_cnn_e2eali.sh --nj $nj fi From a17fbb3e260e6957dcc7b0854aee1939932ef833 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 19:43:17 -0400 Subject: [PATCH 34/67] minor fix --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 7a49e2ebb76..d4db24ce0cb 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -117,7 +117,7 @@ fi if [ $stage -le 4 ]; then echo "$0: Calling the flat-start chain recipe... $(date)." 
- local/chain/run_e2e_cnn.sh --nj $nj + local/tl/chain/run_e2e_cnn.sh --nj $nj fi if [ $stage -le 5 ]; then @@ -130,5 +130,5 @@ fi if [ $stage -le 6 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali.sh --nj $nj + local/tl/chain/run_cnn_e2eali.sh --nj $nj fi From 59c84f2428569ad884a2407c2427401e0b11ca5a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 19:48:42 -0400 Subject: [PATCH 35/67] bug fix --- egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh index 3f2a0dd6e37..3e03473faef 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -41,7 +41,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -226,7 +227,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -240,5 +241,10 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir From a23b478a269e0f9ac3be4e6a7040aa8848e21493 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 21:57:52 -0400 Subject: [PATCH 36/67] fixing bugs --- egs/madcat_ar/v1/local/extract_lines.sh | 4 +++- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 14 +++++++------- egs/madcat_ar/v1/run_end2end.sh | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/egs/madcat_ar/v1/local/extract_lines.sh b/egs/madcat_ar/v1/local/extract_lines.sh index 50129ad38c9..ab87836ae3a 100755 --- a/egs/madcat_ar/v1/local/extract_lines.sh +++ b/egs/madcat_ar/v1/local/extract_lines.sh @@ -11,6 +11,8 @@ writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_split_file=data/download/data_splits/madcat.dev.raw.lineid data=data/local/dev +subset=false +augment=false echo "$0 $@" . ./cmd.sh @@ -35,7 +37,7 @@ done $cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 \ $log_dir/lines.JOB.scp $data/JOB $writing_condition1 $writing_condition2 $writing_condition3 \ - || exit 1; + --subset $subset --augment $augment || exit 1; ## concatenate the .scp files together. for n in $(seq $nj); do diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index d4db24ce0cb..845986224d1 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -16,7 +16,8 @@ writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits overwrite=false - +subset=true +augment=true . 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -37,21 +38,21 @@ if [ $stage -le 0 ]; then local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 - for dataset in train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + for set in train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 done echo "$0: Preparing data..." for set in dev train; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 --augment true || exit 1 - data/local/splits/${set}.txt data/${set} + $writing_condition1 $writing_condition2 $writing_condition3 \ + data/local/splits/${set}.txt data/${set} --augment $augment --subset $subset || exit 1 image/fix_data_dir.sh data/${set} done @@ -70,7 +71,6 @@ if [ $stage -le 1 ]; then done echo "$0: Fixing data directory for train dataset $(date)." image/fix_data_dir.sh data/train - fi if [ $stage -le 2 ]; then diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 6ab6e8ff32d..0e9be93be61 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -43,15 +43,15 @@ if [ $stage -le 0 ]; then --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$dataset || exit 1 done echo "$0: Preparing data..." 
for set in dev train test; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - data/local/splits/${set}.txt data/${set} + $writing_condition1 $writing_condition2 $writing_condition3 \ + data/local/splits/${set}.txt data/${set} || exit 1 image/fix_data_dir.sh data/${set} done From a3aac1abdca709f44973b3ba85cc08f88cab8b40 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 22:01:26 -0400 Subject: [PATCH 37/67] fixing bugs --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 3 +-- egs/madcat_ar/v1/run_end2end.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 845986224d1..340dcd71fb2 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -51,8 +51,7 @@ if [ $stage -le 0 ]; then for set in dev train; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 \ - data/local/splits/${set}.txt data/${set} --augment $augment --subset $subset || exit 1 + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset image/fix_data_dir.sh data/${set} done diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 0e9be93be61..342b0d69597 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -50,8 +50,7 @@ if [ $stage -le 0 ]; then for set in dev train test; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 \ - data/local/splits/${set}.txt data/${set} || exit 1 + $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 image/fix_data_dir.sh data/${set} done From 8fc860d9fe4f2c2c15ef126e5bce4456dd8fa613 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 19 Sep 2018 22:11:10 -0400 Subject: [PATCH 38/67] bug fix --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 4 +++- egs/madcat_ar/v1/run_end2end.sh | 12 +++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 340dcd71fb2..c725871c964 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -3,7 +3,7 @@ # 2018 Ashish Arora set -e stage=0 -nj=30 +nj=70 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: @@ -15,6 +15,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +images_scp_dir=data/local overwrite=false subset=true augment=true @@ -114,6 +115,7 @@ if [ $stage -le 3 ]; then data/lang data/lang_rescore_6g fi +nj=30 if [ $stage -le 4 ]; then echo "$0: Calling the flat-start chain recipe... 
$(date)." local/tl/chain/run_e2e_cnn.sh --nj $nj diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 342b0d69597..a5496a503be 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -15,8 +15,10 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +images_scp_dir=data/local overwrite=false - +subset=true +augment=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -37,20 +39,20 @@ if [ $stage -le 0 ]; then local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset || exit 1 + --data data/local/$set --subset $subset --augment $augment || exit 1 done echo "$0: Preparing data..." for set in dev train test; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset image/fix_data_dir.sh data/${set} done From cafd89ad9352e20a3c43c915a9f42542b7314a76 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 02:19:43 -0400 Subject: [PATCH 39/67] fixing bug in subset --- egs/madcat_ar/v1/local/create_line_image_from_page_image.py | 2 ++ egs/madcat_ar/v1/local/process_data.py | 2 ++ egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index bb126c39538..778555c427e 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -536,6 +536,8 @@ def check_writing_condition(wc_dict, base_name): writing_condition = wc_dict[base_name].strip() if writing_condition != 'IUC': return False + else: + return True else: return True diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index c21beb1be70..e476b67cb96 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -105,6 +105,8 @@ def check_writing_condition(wc_dict): writing_condition = wc_dict[base_name].strip() if writing_condition != 'IUC': return False + else: + return True else: return True diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index c725871c964..473f463d77f 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -18,7 +18,7 @@ 
data_splits_dir=data/download/data_splits images_scp_dir=data/local overwrite=false subset=true -augment=true +augment=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -47,7 +47,7 @@ if [ $stage -le 0 ]; then --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ --data data/local/$set --subset $subset --augment $augment || exit 1 done - + echo "$0: Preparing data..." for set in dev train; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ From 87c9241186ba01dde184e5ac5414fcfd0f0aeade Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 02:24:19 -0400 Subject: [PATCH 40/67] adding augmentation in text localization --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 473f463d77f..8ced7d37af9 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -56,15 +56,15 @@ if [ $stage -le 0 ]; then image/fix_data_dir.sh data/${set} done - local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test - image/fix_data_dir.sh data/test + #local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + #image/fix_data_dir.sh data/test fi if [ $stage -le 1 ]; then echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in dev test train; do + for set in dev train; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; @@ -87,7 +87,7 @@ if [ $stage -le 3 ]; then utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - for set in test train dev; do + for set in test train dev train_aug; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ @@ -118,7 +118,7 @@ fi nj=30 if [ $stage -le 4 ]; then echo "$0: Calling the flat-start chain recipe... $(date)." 
- local/tl/chain/run_e2e_cnn.sh --nj $nj + local/tl/chain/run_e2e_cnn.sh --nj $nj --train_set train_aug fi if [ $stage -le 5 ]; then @@ -126,10 +126,10 @@ if [ $stage -le 5 ]; then steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi if [ $stage -le 6 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/tl/chain/run_cnn_e2eali.sh --nj $nj + local/tl/chain/run_cnn_e2eali.sh --nj $nj --train_set train_aug fi From 0e74e5562edc0403a203a45a35432d7659969238 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 11:30:29 -0400 Subject: [PATCH 41/67] fixing bugs --- egs/madcat_ar/v1/local/extract_features.sh | 2 +- egs/madcat_ar/v1/local/{tl => }/make_features.py | 0 egs/madcat_ar/v1/run_end2end.sh | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename egs/madcat_ar/v1/local/{tl => }/make_features.py (100%) diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 4ed6ba04348..1741ad3f9b2 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -36,7 +36,7 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - image/ocr/make_features.py $logdir/images.JOB.scp \ + local/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ diff --git a/egs/madcat_ar/v1/local/tl/make_features.py b/egs/madcat_ar/v1/local/make_features.py similarity index 100% rename from egs/madcat_ar/v1/local/tl/make_features.py rename to egs/madcat_ar/v1/local/make_features.py diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index a5496a503be..bee203d1483 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -74,7 +74,7 @@ fi if [ $stage -le 2 ]; then echo "$0: Preparing BPE..." 
- cut -d' ' -f2- data/train/text | utilis/lang/bpe/reverse.py | \ + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt From 60915aa65c306673270e5c4b91935162046bb934 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 11:32:11 -0400 Subject: [PATCH 42/67] fixing bugs --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 8ced7d37af9..d59ad006886 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -76,7 +76,7 @@ fi if [ $stage -le 2 ]; then for set in train; do echo "$(date) stage 2: Performing augmentation, it will double training data" - local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done fi From 4099d4ad08d509ba676a437fbb5f9197e5c020ed Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 11:37:25 -0400 Subject: [PATCH 43/67] fixing bugs --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index d59ad006886..00f8f176f4f 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -83,11 +83,11 @@ fi if [ $stage -le 3 ]; then echo "$0: Preparing BPE..." - cut -d' ' -f2- data/train/text | utilis/lang/bpe/reverse.py | \ + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - for set in test train dev train_aug; do + for set in train dev train_aug; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ From 4f98f69f7ca922a0ee0f3d8193a8d943829b4e31 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 13:46:43 -0400 Subject: [PATCH 44/67] fixing bugs --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 00f8f176f4f..d263c34a838 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -56,15 +56,15 @@ if [ $stage -le 0 ]; then image/fix_data_dir.sh data/${set} done - #local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test - #image/fix_data_dir.sh data/test + local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + image/fix_data_dir.sh data/test fi if [ $stage -le 1 ]; then echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in dev train; do + for set in test dev train; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. 
$(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; @@ -87,7 +87,7 @@ if [ $stage -le 3 ]; then utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - for set in train dev train_aug; do + for set in test train dev train_aug; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ utils/lang/bpe/prepend_words.py | \ From 717501f9c83c44b3e328c64a04d76c9991822773 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 22:58:32 -0400 Subject: [PATCH 45/67] fixing bugs --- egs/madcat_ar/v1/local/extract_features.sh | 3 +-- egs/madcat_ar/v1/local/tl/augment_data.sh | 2 +- egs/madcat_ar/v1/local/tl/process_waldo_data.py | 5 +---- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 5 +++-- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 1741ad3f9b2..7df6385d9c9 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -10,7 +10,6 @@ nj=4 cmd=run.pl feat_dim=40 augment=false -fliplr=false echo "$0 $@" . ./cmd.sh @@ -38,7 +37,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ local/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + --feat-dim $feat_dim --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh index 31e4a8217ca..e49112c9987 100755 --- a/egs/madcat_ar/v1/local/tl/augment_data.sh +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -26,7 +26,7 @@ for set in $aug_set; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr false --augment true $datadir/augmentations/$set + --augment true $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/madcat_ar/v1/local/tl/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py index df8b6c5149f..b7a24807c5a 100755 --- a/egs/madcat_ar/v1/local/tl/process_waldo_data.py +++ b/egs/madcat_ar/v1/local/tl/process_waldo_data.py @@ -29,8 +29,6 @@ def read_image_text(image_text_path): image_path = line_vect[0] line_id = os.path.basename(image_path).split('.png')[0] transcription = line_vect[1:] - #transcription = " ".join(transcription) - #image_transcription_dict[line_id] = transcription joined_transcription = list() for word in transcription: joined_transcription.append(word) @@ -41,7 +39,6 @@ def read_image_text(image_text_path): ### main ### print("Processing '{}' data...".format(args.out_dir)) - text_file = os.path.join(args.out_dir, 'text') text_fh = open(text_file, 'w', encoding='utf-8') utt2spk_file = os.path.join(args.out_dir, 'utt2spk') @@ -50,7 +47,7 @@ def read_image_text(image_text_path): image_fh = open(image_file, 'w', encoding='utf-8') image_transcription_dict = read_image_text(args.image_transcription_file) -for line_id in image_transcription_dict: +for line_id in sorted(image_transcription_dict.keys()): writer_id = line_id.strip().split('_')[-3] 
updated_line_id = line_id + '.png' image_file_path = os.path.join('lines', updated_line_id) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index d263c34a838..9c7e5c7ab58 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -56,8 +56,9 @@ if [ $stage -le 0 ]; then image/fix_data_dir.sh data/${set} done - local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test - image/fix_data_dir.sh data/test + local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt + #image/fix_data_dir.sh data/test fi if [ $stage -le 1 ]; then From 56c77c4523723e5d79f43bfffb82015c0048f104 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 23:16:24 -0400 Subject: [PATCH 46/67] fixing bugs --- egs/madcat_ar/v1/local/make_features.py | 4 ++-- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 4 ++-- egs/madcat_ar/v1/run_end2end.sh | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py index e9d10ecc87e..1dbefe48f64 100755 --- a/egs/madcat_ar/v1/local/make_features.py +++ b/egs/madcat_ar/v1/local/make_features.py @@ -158,9 +158,9 @@ def vertical_shift(im, mode='mid'): num_fail += 1 continue if args.augment: - im_shift = vertical_shift(im_horizontal_padded, shift_setting[1]) + im_shift = vertical_shift(im_horizontal_padded, aug_setting[1]) else: - im_shift = vertical_shift(im_horizontal_padded, shift_setting[0]) + im_shift = vertical_shift(im_horizontal_padded, aug_setting[0]) data = np.transpose(im_shift, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 9c7e5c7ab58..e15aba27888 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -65,8 +65,8 @@ if [ $stage -le 1 ]; then echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in test dev train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" + for set in dev train test; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; done diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index bee203d1483..ccb177a6896 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -63,10 +63,10 @@ if [ $stage -le 1 ]; then image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; + for set in test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. 
$(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; done echo "$0: Fixing data directory for train dataset $(date)." utils/fix_data_dir.sh data/train From b9d26513435aa58bcb3791eb75548306145d6d46 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 23:26:18 -0400 Subject: [PATCH 47/67] fixing bugs --- .../v1/local/tl/run_textlocalization.sh | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index e15aba27888..fd18a895232 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -107,22 +107,20 @@ if [ $stage -le 3 ]; then utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." - local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ - data/local/dict/lexicon.txt data/lang - utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/lang data/lang_rescore_6g + local/train_lm.sh --order 3 + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/lang data/lang fi nj=30 -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe... $(date)." local/tl/chain/run_e2e_cnn.sh --nj $nj --train_set train_aug fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model...$(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ --use-gpu false \ @@ -130,7 +128,7 @@ if [ $stage -le 5 ]; then data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" local/tl/chain/run_cnn_e2eali.sh --nj $nj --train_set train_aug fi From 74f7a82b5e986401bc0bc90d3ea36f1f3eac7781 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 20 Sep 2018 23:40:17 -0400 Subject: [PATCH 48/67] fixing bugs --- .../v1/local/tl/run_textlocalization.sh | 2 +- egs/madcat_ar/v1/local/tl/train_lm.sh | 102 ++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100755 egs/madcat_ar/v1/local/tl/train_lm.sh diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index fd18a895232..fc18e52e58f 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -109,7 +109,7 @@ fi if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." - local/train_lm.sh --order 3 + local/tl/train_lm.sh --order 3 utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ data/lang data/lang fi diff --git a/egs/madcat_ar/v1/local/tl/train_lm.sh b/egs/madcat_ar/v1/local/tl/train_lm.sh new file mode 100755 index 00000000000..524bb2e9f40 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/train_lm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions. 
+# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=3 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from the output log of train_lm.py. +# These example numbers of metaparameters are for a 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +# Note: to use these example parameters, you may need to remove the .done files +# to make sure make_lm_dir.py is called and trains only the 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use.
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi From 7597638d7e8af01854de8711204e0229efdf7db4 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 00:00:27 -0400 Subject: [PATCH 49/67] fixing bugs --- egs/madcat_ar/v1/local/chain/compare_wer.sh | 14 ++++++++++++++ egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh | 4 ---- egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh | 4 ---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/compare_wer.sh b/egs/madcat_ar/v1/local/chain/compare_wer.sh index ad90710b13f..7f04061dafb 100755 --- a/egs/madcat_ar/v1/local/chain/compare_wer.sh +++ b/egs/madcat_ar/v1/local/chain/compare_wer.sh @@ -27,6 +27,13 @@ for x in $*; do done echo +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + echo -n "# CER " for x in $*; do cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') @@ -34,6 +41,13 @@ for x in $*; do done echo +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh index 3e03473faef..cef0f927dd6 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -42,7 +42,6 @@ tdnn_dim=450 srand=0 remove_egs=true lang_decode=data/lang -lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -241,9 +240,6 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 fi echo "Done. Date: $(date). Results:" diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh index d43c1f1a0f3..f93ff164b65 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -38,7 +38,6 @@ frames_per_iter=1000000 cmvn_opts="--norm-means=false --norm-vars=false" train_set=train lang_decode=data/lang -lang_rescore=data/lang_rescore_6g # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -163,9 +162,6 @@ if [ $stage -le 5 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 fi echo "Done. Date: $(date). Results:" From 479590a61ef73d0bc0c0c93e0a87f59df75fb439 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 01:12:03 -0400 Subject: [PATCH 50/67] fixing run.sh --- egs/madcat_ar/v1/run.sh | 69 ++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index 7922bf30ed6..d3937582662 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -32,7 +32,6 @@ mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} if [ $stage -le 0 ]; then - if [ -f data/train/text ] && ! $overwrite; then echo "$0: Not processing, probably script have run from wrong stage" echo "Exiting with status 1 to avoid data corruption" @@ -42,33 +41,27 @@ if [ $stage -le 0 ]; then echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." for set in dev train test; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - data/local/splits/${set}.txt data/${set} + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset image/fix_data_dir.sh data/${set} done fi -mkdir -p data/{train,test,dev}/data -if [ $stage -le 3 ]; then +if [ $stage -le 1 ]; then for dataset in test train; do local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset steps/compute_cmvn_stats.sh data/$dataset || exit 1; @@ -76,14 +69,32 @@ if [ $stage -le 3 ]; then utils/fix_data_dir.sh data/train fi -if [ $stage -le 4 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." 
+ cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ - data/local/dict "" data/lang/temp data/lang + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 5 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ @@ -92,19 +103,19 @@ if [ $stage -le 5 ]; then data/lang data/lang_rescore_6g fi -if [ $stage -le 6 ]; then +if [ $stage -le 4 ]; then steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ data/lang exp/mono fi -if [ $stage -le 7 ] && $decode_gmm; then - utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi -if [ $stage -le 8 ]; then +if [ $stage -le 6 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/mono exp/mono_ali @@ -112,14 +123,14 @@ if [ $stage -le 8 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 9 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi -if [ $stage -le 10 ]; then +if [ $stage -le 8 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/tri exp/tri_ali @@ -128,22 +139,22 @@ if [ $stage -le 10 ]; then data/train data/lang exp/tri_ali exp/tri3 fi -if [ $stage -le 11 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri3/graph \ data/test exp/tri3/decode_test fi -if [ $stage -le 12 ]; then +if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train data/lang exp/tri3 exp/tri3_ali fi -if [ $stage -le 13 ]; then +if [ $stage -le 11 ]; then local/chain/run_cnn.sh fi -if [ $stage -le 14 ]; then +if [ $stage -le 12 ]; then local/chain/run_cnn_chainali.sh --stage 2 fi From 87ab218a35be29a2a2d8154d244a7c79dab4895a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 13:50:30 -0400 Subject: [PATCH 51/67] fixing bug in language modelling --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index fc18e52e58f..85f662373e9 100755 --- 
a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -58,7 +58,6 @@ if [ $stage -le 0 ]; then local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt - #image/fix_data_dir.sh data/test fi if [ $stage -le 1 ]; then @@ -110,8 +109,8 @@ fi if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/tl/train_lm.sh --order 3 - utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/lang data/lang + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang fi nj=30 From d9790005b4d07866d51698fbb88ad7688f5a3d84 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 15:13:31 -0400 Subject: [PATCH 52/67] correcting options --- egs/madcat_ar/v1/run_end2end.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index ccb177a6896..a6ebb3cb5fb 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -17,8 +17,8 @@ writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits images_scp_dir=data/local overwrite=false -subset=true -augment=true +subset=false +augment=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh From ed3ab450cd7bec2300415af88cfccafd3e128efb Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 15:16:45 -0400 Subject: [PATCH 53/67] adding comments --- egs/madcat_ar/v1/local/wer_output_filter | 3 +++ 1 file changed, 3 insertions(+) diff --git a/egs/madcat_ar/v1/local/wer_output_filter b/egs/madcat_ar/v1/local/wer_output_filter index c0f03e7178a..d6d46f3f565 100755 --- a/egs/madcat_ar/v1/local/wer_output_filter +++ b/egs/madcat_ar/v1/local/wer_output_filter @@ -2,6 +2,9 @@ # Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) # Apache 2.0 +# This script converts a BPE-encoded text to normal text and performs normalization. +# It is used in scoring. + use utf8; use open qw(:encoding(utf8)); From 22df693730b0fe6313a50925cad4c15bb9614870 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 15:27:44 -0400 Subject: [PATCH 54/67] fixing conflict --- .../local/create_line_image_from_page_image.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index c7525cea89c..778555c427e 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -188,24 +188,6 @@ def rectangle_corners(rectangle): return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) -<<<<<<< HEAD -def get_orientation(origin, p1, p2): - """ - Given origin and two points, return the orientation of the Point p1 with - regards to Point p2 using origin. - Returns - ------- - integer: Negative if p1 is clockwise of p2. 
- """ - difference = ( - ((p2[0] - origin[0]) * (p1[1] - origin[1])) - - ((p1[0] - origin[0]) * (p2[1] - origin[1])) - ) - return difference - - -======= ->>>>>>> ed3ab450cd7bec2300415af88cfccafd3e128efb def minimum_bounding_box(points): """ Given a list of 2D points, it returns the minimum area rectangle bounding all the points in the point cloud. From 95b1c3a73b9a4e8fa887029a1960eb9536beba30 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 16:30:19 -0400 Subject: [PATCH 55/67] updating chain parameters --- .../v1/local/tl/chain/run_cnn_e2eali.sh | 23 ++++--------------- .../v1/local/tl/chain/run_e2e_cnn.sh | 15 +++++------- 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh index cef0f927dd6..7dac49d32f4 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -23,22 +23,17 @@ stage=0 nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options srand=0 remove_egs=true lang_decode=data/lang @@ -120,7 +115,7 @@ if [ $stage -le 3 ]; then fi steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ + --frame-subsampling-factor 4 \ --alignment-subsampling-factor 1 \ --context-opts "--context-width=2 --central-position=1" \ --cmd "$cmd" $num_leaves ${train_data_dir} \ @@ -185,15 +180,15 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=2 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -202,10 +197,6 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --cleanup.remove-egs=$remove_egs \ @@ -233,10 +224,6 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk 
$frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh index f93ff164b65..525207423a3 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -28,12 +28,8 @@ affix=1a # training options tdnn_dim=450 -num_epochs=2 -num_jobs_initial=3 -num_jobs_final=16 minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 common_egs_dir= -l2_regularize=0.00005 frames_per_iter=1000000 cmvn_opts="--norm-means=false --norm-vars=false" train_set=train @@ -120,20 +116,21 @@ if [ $stage -le 3 ]; then --cmd "$cmd" \ --feat.cmvn-opts "$cmvn_opts" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ From 0b71dae5778dd0a74b90511f66acaf9a8ebd95a1 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 16:37:35 -0400 Subject: [PATCH 56/67] updating chain parameters --- .../v1/local/chain/tuning/run_cnn_e2eali_1b.sh | 14 +------------- .../v1/local/chain/tuning/run_e2e_cnn_1a.sh | 17 ++++++----------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index b0b77be2a18..4fe730d2728 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -32,12 +32,8 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. 
-chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 @@ -186,7 +182,7 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ @@ -203,10 +199,6 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --cleanup.remove-egs=$remove_egs \ @@ -234,10 +226,6 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh index bf215a0cae2..2891e50da9e 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -27,14 +27,9 @@ affix=1a # training options tdnn_dim=450 -num_epochs=2 -num_jobs_initial=6 -num_jobs_final=16 minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=2000000 -cmvn_opts="--norm-means=true --norm-vars=true" +cmvn_opts="--norm-means=false --norm-vars=false" train_set=train lang_decode=data/lang lang_rescore=data/lang_rescore_6g @@ -119,7 +114,7 @@ if [ $stage -le 3 ]; then --cmd "$cmd" \ --feat.cmvn-opts "$cmvn_opts" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -129,11 +124,11 @@ if [ $stage -le 3 ]; then --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ From e380a205535ff3316f3f446a054a9cea786ee2d6 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 16:50:31 -0400 Subject: [PATCH 57/67] updating parameters --- .../local/chain/tuning/run_cnn_e2eali_1a.sh | 20 ++++--------------- .../local/chain/tuning/run_cnn_e2eali_1b.sh | 4 ++-- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git 
a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 38de5fe3b7c..ee84ea0d83f 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -19,12 +19,8 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 @@ -172,28 +168,24 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=2 \ - --trainer.frames-per-iter=1000000 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=96,64 \ + --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ @@ -221,10 +213,6 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 4fe730d2728..c6052b76e7f 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -103,7 +103,6 @@ if [ $stage -le 2 ]; then --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts - fi if [ $stage -le 3 ]; then @@ -198,6 +197,7 @@ if [ $stage -le 5 ]; then --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.chunk-width=$chunk_width \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ @@ -219,7 +219,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi From 639289df6d32c816c2b2bc8f641b60a4ebf3757a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 16:53:46 -0400 Subject: [PATCH 58/67] updating parameters --- egs/madcat_ar/v1/local/tl/run_textlocalization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 85f662373e9..1a0aaf738d2 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -18,7 +18,7 @@ data_splits_dir=data/download/data_splits images_scp_dir=data/local overwrite=false subset=true -augment=false +augment=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh From 04e023625bda485de9b4e9fd48e0b0afc40964ec Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 17:04:38 -0400 Subject: [PATCH 59/67] updating parameters --- .../v1/local/chain/tuning/run_cnn_1a.sh | 19 ++++--------------- .../local/chain/tuning/run_cnn_chainali_1a.sh | 18 +++--------------- 2 files changed, 7 insertions(+), 30 deletions(-) diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index 02d095b3a82..eb140e900e1 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -21,13 +21,10 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 # we don't need extra left/right context for TDNN systems. 
-chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 @@ -169,13 +166,13 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=4 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -184,10 +181,6 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ @@ -215,10 +208,6 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh index 5faf6a73691..5b3597a3915 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -18,13 +18,9 @@ lats_affix= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. 
-chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 @@ -171,13 +167,13 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -186,10 +182,6 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ @@ -217,10 +209,6 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; From 9c33a35da9c0554b0bcbced987819ba4c3f2828e Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Sep 2018 20:50:30 -0400 Subject: [PATCH 60/67] fixing bug in make features --- egs/madcat_ar/v1/local/extract_features.sh | 3 +- egs/madcat_ar/v1/local/make_features.py | 47 +---- egs/madcat_ar/v1/local/tl/make_features.py | 170 ++++++++++++++++++ .../v1/local/tl/run_textlocalization.sh | 2 +- 4 files changed, 180 insertions(+), 42 deletions(-) create mode 100755 egs/madcat_ar/v1/local/tl/make_features.py diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 7df6385d9c9..91b38a0407e 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -10,6 +10,7 @@ nj=4 cmd=run.pl feat_dim=40 augment=false +script_path=local echo "$0 $@" . ./cmd.sh @@ -35,7 +36,7 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + $script_path/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ --feat-dim $feat_dim --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py index 1dbefe48f64..21ae44be1da 100755 --- a/egs/madcat_ar/v1/local/make_features.py +++ b/egs/madcat_ar/v1/local/make_features.py @@ -14,16 +14,14 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ -import random + import argparse import os import sys import numpy as np from scipy import misc -import math parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") @@ -39,10 +37,8 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') -parser.add_argument('--vertical-shift', type=int, default=16, - help='total number of padding pixel per column') -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") + + args = parser.parse_args() @@ -97,39 +93,15 @@ def horizontal_pad(im, allowed_lengths = None): dtype=int)), axis=1) return im_pad1 -def vertical_shift(im, mode='mid'): - total = args.vertical_shift - if mode == 'notmid': - val = random.randint(0, 1) - if val == 0: - mode = 'top' - else: - mode = 'bottom' - if mode == 'mid': - top = int(total / 2) - bottom = total - top - elif mode == 'top': # more padding on top - top = random.randint(total / 2, total) - bottom = total - top - elif mode == 'bottom': # more padding on bottom - top = random.randint(0, total / 2) - bottom = total - top - width = im.shape[1] - im_pad = np.concatenate( - (255 * np.ones((top, width), dtype=int) - - np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) - im_pad = np.concatenate( - (im_pad, 255 * np.ones((bottom, width), dtype=int) - - np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) - return im_pad ### main ### -random.seed(1) + data_list_path = args.images_scp_path + if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'w') + out_fh = open(args.out_ark,'wb') allowed_lengths = None allowed_len_handle = args.allowed_len_file_path @@ -144,7 +116,6 @@ def vertical_shift(im, mode='mid'): num_fail = 0 num_ok = 0 -aug_setting = ['mid', 'notmid'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -157,11 +128,7 @@ def vertical_shift(im, mode='mid'): if im_horizontal_padded is None: num_fail += 1 continue - if args.augment: - im_shift = vertical_shift(im_horizontal_padded, aug_setting[1]) - else: - im_shift = vertical_shift(im_horizontal_padded, aug_setting[0]) - data = np.transpose(im_shift, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/madcat_ar/v1/local/tl/make_features.py b/egs/madcat_ar/v1/local/tl/make_features.py new file mode 100755 index 00000000000..1dbefe48f64 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/make_features.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). 
This relates + to end2end chain training. + + eg. local/make_features.py data/train --feat-dim 40 +""" +import random +import argparse +import os +import sys +import numpy as np +from scipy import misc +import math + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +parser.add_argument('--vertical-shift', type=int, default=16, + help='total number of padding pixel per column') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] # width + sy = im.shape[0] # height + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + return im + + +def horizontal_pad(im, allowed_lengths = None): + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] # width + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = int(padding // 2) + right_padding = padding - left_padding + dim_y = im.shape[0] # height + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +def vertical_shift(im, mode='mid'): + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'mid': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad 
+ +### main ### +random.seed(1) +data_list_path = args.images_scp_path +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'w') + +allowed_lengths = None +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(allowed_len_handle) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +aug_setting = ['mid', 'notmid'] +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im) + im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) + if im_horizontal_padded is None: + num_fail += 1 + continue + if args.augment: + im_shift = vertical_shift(im_horizontal_padded, aug_setting[1]) + else: + im_shift = vertical_shift(im_horizontal_padded, aug_setting[0]) + data = np.transpose(im_shift, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (image too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 1a0aaf738d2..7e7aabeac48 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -66,7 +66,7 @@ if [ $stage -le 1 ]; then image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train for set in dev train test; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 --script_path local/tl data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; done echo "$0: Fixing data directory for train dataset $(date)." From d4516eab853227c3cd2a3c7bb18c88f3d7b9f6e6 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sat, 22 Sep 2018 05:57:02 -0400 Subject: [PATCH 61/67] Revert "fixing bug in make features" This reverts commit 9c33a35da9c0554b0bcbced987819ba4c3f2828e. --- egs/madcat_ar/v1/local/extract_features.sh | 3 +- egs/madcat_ar/v1/local/make_features.py | 47 ++++- egs/madcat_ar/v1/local/tl/make_features.py | 170 ------------------ .../v1/local/tl/run_textlocalization.sh | 2 +- 4 files changed, 42 insertions(+), 180 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/tl/make_features.py diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 91b38a0407e..7df6385d9c9 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -10,7 +10,6 @@ nj=4 cmd=run.pl feat_dim=40 augment=false -script_path=local echo "$0 $@" . 
./cmd.sh @@ -36,7 +35,7 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - $script_path/make_features.py $logdir/images.JOB.scp \ + local/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ --feat-dim $feat_dim --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py index 21ae44be1da..1dbefe48f64 100755 --- a/egs/madcat_ar/v1/local/make_features.py +++ b/egs/madcat_ar/v1/local/make_features.py @@ -14,14 +14,16 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. + eg. local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys import numpy as np from scipy import misc +import math parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") @@ -37,8 +39,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--vertical-shift', type=int, default=16, + help='total number of padding pixel per column') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -93,15 +97,39 @@ def horizontal_pad(im, allowed_lengths = None): dtype=int)), axis=1) return im_pad1 +def vertical_shift(im, mode='mid'): + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'mid': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad ### main ### - +random.seed(1) data_list_path = args.images_scp_path - if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None allowed_len_handle = args.allowed_len_file_path @@ -116,6 +144,7 @@ def horizontal_pad(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['mid', 'notmid'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -128,7 +157,11 @@ def horizontal_pad(im, allowed_lengths = None): if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_horizontal_padded, (1, 0)) + if args.augment: + im_shift = vertical_shift(im_horizontal_padded, aug_setting[1]) + else: + im_shift = vertical_shift(im_horizontal_padded, aug_setting[0]) + data = np.transpose(im_shift, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/madcat_ar/v1/local/tl/make_features.py b/egs/madcat_ar/v1/local/tl/make_features.py deleted file mode 100755 index 1dbefe48f64..00000000000 --- 
a/egs/madcat_ar/v1/local/tl/make_features.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2018 Hossein Hadian - -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - If an 'image2num_frames' file is found in the data dir, it will be used - to enforce the images to have the specified length in that file by padding - white pixels (the --padding option will be ignored in this case). This relates - to end2end chain training. - - eg. local/make_features.py data/train --feat-dim 40 -""" -import random -import argparse -import os -import sys -import numpy as np -from scipy import misc -import math - -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") -parser.add_argument('images_scp_path', type=str, - help='Path of images.scp file') -parser.add_argument('--allowed_len_file_path', type=str, default=None, - help='If supplied, each images will be padded to reach the ' - 'target length (this overrides --padding).') -parser.add_argument('--out-ark', type=str, default='-', - help='Where to write the output feature file') -parser.add_argument('--feat-dim', type=int, default=40, - help='Size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='Number of white pixels to pad on the left' - 'and right side of the image.') -parser.add_argument('--vertical-shift', type=int, default=16, - help='total number of padding pixel per column') -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - -def horizontal_pad(im, allowed_lengths = None): - if allowed_lengths is None: - left_padding = right_padding = args.padding - else: # Find an allowed length for the image - imlen = im.shape[1] # width - allowed_len = 0 - for l in allowed_lengths: - if l > imlen: - allowed_len = l - break - if allowed_len == 0: - # No allowed length was found for the image (the image is too long) - return None - padding = allowed_len - imlen - left_padding = int(padding // 2) - right_padding = padding - left_padding - dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * 
np.ones((dim_y, right_padding), - dtype=int)), axis=1) - return im_pad1 - -def vertical_shift(im, mode='mid'): - total = args.vertical_shift - if mode == 'notmid': - val = random.randint(0, 1) - if val == 0: - mode = 'top' - else: - mode = 'bottom' - if mode == 'mid': - top = int(total / 2) - bottom = total - top - elif mode == 'top': # more padding on top - top = random.randint(total / 2, total) - bottom = total - top - elif mode == 'bottom': # more padding on bottom - top = random.randint(0, total / 2) - bottom = total - top - width = im.shape[1] - im_pad = np.concatenate( - (255 * np.ones((top, width), dtype=int) - - np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) - im_pad = np.concatenate( - (im_pad, 255 * np.ones((bottom, width), dtype=int) - - np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) - return im_pad - -### main ### -random.seed(1) -data_list_path = args.images_scp_path -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark,'w') - -allowed_lengths = None -allowed_len_handle = args.allowed_len_file_path -if os.path.isfile(allowed_len_handle): - print("Found 'allowed_lengths.txt' file...", file=sys.stderr) - allowed_lengths = [] - with open(allowed_len_handle) as f: - for line in f: - allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the " - "features.".format(len(allowed_lengths)), file=sys.stderr) - -num_fail = 0 -num_ok = 0 -aug_setting = ['mid', 'notmid'] -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: - num_fail += 1 - continue - if args.augment: - im_shift = vertical_shift(im_horizontal_padded, aug_setting[1]) - else: - im_shift = vertical_shift(im_horizontal_padded, aug_setting[0]) - data = np.transpose(im_shift, (1, 0)) - data = np.divide(data, 255.0) - num_ok += 1 - write_kaldi_matrix(out_fh, data, image_id) - -print('Generated features for {} images. Failed for {} (image too ' - 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh index 7e7aabeac48..1a0aaf738d2 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_textlocalization.sh @@ -66,7 +66,7 @@ if [ $stage -le 1 ]; then image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train for set in dev train test; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 --script_path local/tl data/$set + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; done echo "$0: Fixing data directory for train dataset $(date)." 
From bac599a37d573b56a24e0b3724b1320ed7718425 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 9 Oct 2018 03:54:32 -0400 Subject: [PATCH 62/67] modification from review --- egs/madcat_ar/v1/local/extract_features.sh | 4 ++- egs/madcat_ar/v1/local/make_features.py | 2 ++ .../v1/local/tl/chain/run_cnn_e2eali.sh | 28 ++++++++----------- .../v1/local/tl/chain/run_e2e_cnn.sh | 14 +++++----- ...calization.sh => run_text_localization.sh} | 10 ++++++- egs/madcat_ar/v1/run_end2end.sh | 2 +- 6 files changed, 34 insertions(+), 26 deletions(-) rename egs/madcat_ar/v1/local/tl/{run_textlocalization.sh => run_text_localization.sh} (91%) diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 7df6385d9c9..06207482a18 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -10,6 +10,7 @@ nj=4 cmd=run.pl feat_dim=40 augment=false +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -37,7 +38,8 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ local/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --augment $augment \| \ + --feat-dim $feat_dim --augment $augment \ + --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py index 1dbefe48f64..34d38dd9c82 100755 --- a/egs/madcat_ar/v1/local/make_features.py +++ b/egs/madcat_ar/v1/local/make_features.py @@ -98,6 +98,8 @@ def horizontal_pad(im, allowed_lengths = None): return im_pad1 def vertical_shift(im, mode='mid'): + if args.vertical_shift == 0: + return im total = args.vertical_shift if mode == 'notmid': val = random.randint(0, 1) diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh index 7dac49d32f4..e0cca104f50 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -1,21 +1,17 @@ #!/bin/bash -# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the -# lattice alignments and to build a tree - -# local/chain/compare_wer.sh exp/chain/exp/chain/cnn_e2eali_1b -# System cnn_e2eali_1b -# WER 10.78 -# CER 2.99 -# Final train prob -0.0587 -# Final valid prob -0.0609 -# Final train prob (xent) -0.4471 -# Final valid prob (xent) -0.4653 -# Parameters 3.37M - -# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b -#exp/chain/cnn_e2eali_1b: num-iters=179 nj=8..16 num-params=3.4M dim=40->416 combine=-0.058->-0.058 (over 3) xent:train/valid[118,178,final]=(-0.463,-0.445,-0.447/-0.477,-0.462,-0.465) logprob:train/valid[118,178,final]=(-0.062,-0.059,-0.059/-0.063,-0.061,-0.061) - +# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 16.78 +# CER 5.22 +# Final train prob -0.1189 +# Final valid prob -0.1319 +# Final train prob (xent) -0.6395 +# Final valid prob (xent) -0.6732 +# Parameters 3.73M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/chain/cnn_e2eali_1a/: num-iters=24 nj=3..15 num-params=3.7M dim=56->392 combine=-0.125->-0.125 (over 1) xent:train/valid[15,23,final]=(-0.850,-1.24,-0.640/-0.901,-1.31,-0.673) logprob:train/valid[15,23,final]=(-0.149,-0.209,-0.119/-0.166,-0.229,-0.132) set -e -o pipefail stage=0 diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh 
b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh index 525207423a3..3fca8cf5fdc 100755 --- a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -3,18 +3,18 @@ # This script does end2end chain training (i.e. from scratch) -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ # System e2e_cnn_1a -# WER 10.71 -# CER 2.85 -# Final train prob -0.0859 -# Final valid prob -0.1266 +# WER 19.30 +# CER 5.72 +# Final train prob -0.0734 +# Final valid prob -0.0607 # Final train prob (xent) # Final valid prob (xent) -# Parameters 2.94M +# Parameters 3.30M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) +# exp/chain/e2e_cnn_1a/: num-iters=24 nj=3..15 num-params=3.3M dim=56->292 combine=-0.060->-0.060 (over 1) logprob:train/valid[15,23,final]=(-0.122,-0.143,-0.073/-0.105,-0.132,-0.061) set -e diff --git a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh similarity index 91% rename from egs/madcat_ar/v1/local/tl/run_textlocalization.sh rename to egs/madcat_ar/v1/local/tl/run_text_localization.sh index 1a0aaf738d2..5277dc58a30 100755 --- a/egs/madcat_ar/v1/local/tl/run_textlocalization.sh +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -1,6 +1,12 @@ #!/bin/bash # Copyright 2017 Hossein Hadian # 2018 Ashish Arora +""" This script performs full page text recognition on automatically extracted line images + from madcat arabic data. It is created as a separate scrip, because it performs + data augmentation, uses smaller language model and calls process_waldo_data for + test images (automatically extracted line images). Data augmentation increases image + height hence requires different DNN arachitecture and different chain scripts. +""" set -e stage=0 nj=70 @@ -19,6 +25,7 @@ images_scp_dir=data/local overwrite=false subset=true augment=true +verticle_shift=16 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh @@ -66,7 +73,8 @@ if [ $stage -le 1 ]; then image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train for set in dev train test; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 \ + --verticle_shift $verticle_shift data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; done echo "$0: Fixing data directory for train dataset $(date)." diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index a6ebb3cb5fb..de67e444f39 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -100,7 +100,7 @@ fi if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ data/local/dict/lexicon.txt data/lang utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ data/lang data/lang_rescore_6g From f0a990e09d376cbd8b26599e4cbc772a60762759 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 9 Oct 2018 05:34:11 -0400 Subject: [PATCH 63/67] modification from review, adding new augmentation in make feature --- egs/cifar/v1/image/ocr/make_features.py | 75 ++++++-- egs/madcat_ar/v1/local/extract_features.sh | 6 +- egs/madcat_ar/v1/local/make_features.py | 172 ------------------ egs/madcat_ar/v1/local/tl/augment_data.sh | 2 +- .../v1/local/tl/run_text_localization.sh | 3 +- egs/yomdle_fa/v1/local/augment_data.sh | 2 +- egs/yomdle_fa/v1/local/extract_features.sh | 4 +- egs/yomdle_tamil/v1/local/augment_data.sh | 2 +- egs/yomdle_tamil/v1/local/extract_features.sh | 4 +- egs/yomdle_zh/v1/local/augment_data.sh | 2 +- egs/yomdle_zh/v1/local/extract_features.sh | 4 +- 11 files changed, 76 insertions(+), 200 deletions(-) delete mode 100755 egs/madcat_ar/v1/local/make_features.py diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py index 07f3cb12257..54d2b92ab25 100755 --- a/egs/cifar/v1/image/ocr/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -45,10 +45,13 @@ 'and right side of the image.') parser.add_argument('--num-channels', type=int, default=1, help='Number of color channels') +parser.add_argument('--vertical-shift', type=int, default=0, + help='total number of padding pixel per column') parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, help="Flip the image left-right for right to left languages") -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") +parser.add_argument('--augment_type', type=str, default='no_aug', + choices=['no_aug', 'random_scale','random_shift'], + help='Subset of data to process.') args = parser.parse_args() @@ -112,6 +115,41 @@ def get_scaled_image_aug(im, mode='normal'): return im_scaled_up return im +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] # width + sy = im.shape[0] # height + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + return im + +def vertical_shift(im, mode='mid'): if args.vertical_shift == 0: return im + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'mid': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad ### main ### random.seed(1) @@ -134,7 +172,11 @@ def get_scaled_image_aug(im, mode='normal'): num_fail = 0 num_ok = 0 -aug_setting = ['normal', 'scaled'] +if args.augment_type == 'random_scale': + aug_setting = ['normal', 'scaled'] 
+elif args.augment_type == 'random_shift': + aug_setting = ['mid', 'notmid'] + with open(data_list_path) as f: for line in f: line = line.strip() @@ -144,21 +186,26 @@ def get_scaled_image_aug(im, mode='normal'): im = misc.imread(image_path) if args.fliplr: im = np.fliplr(im) - if args.augment: - im_aug = get_scaled_image_aug(im, aug_setting[1]) - else: - im_aug = get_scaled_image_aug(im, aug_setting[0]) - im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) - if im_horizontal_padded is None: + if args.augment_type == 'no_aug': + im = get_scaled_image_aug(im, aug_setting[0]) + im = vertical_shift(im, aug_setting[0]) + elif args.augment_type == 'random_scale': + im = get_scaled_image_aug(im, aug_setting[1]) + im = vertical_shift(im, aug_setting[0]) + elif args.augment_type == 'random_shift': + im = get_scaled_image_aug(im, aug_setting[0]) + im = vertical_shift(im, aug_setting[1]) + im = horizontal_pad(im, allowed_lengths) + if im is None: num_fail += 1 continue if args.num_channels == 1: - data = np.transpose(im_horizontal_padded, (1, 0)) + data = np.transpose(im, (1, 0)) elif args.num_channels == 3: - H = im_horizontal_padded.shape[0] - W = im_horizontal_padded.shape[1] - C = im_horizontal_padded.shape[2] - data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C)) + H = im.shape[0] + W = im.shape[1] + C = im.shape[2] + data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 06207482a18..9fe588f31b8 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -9,7 +9,7 @@ nj=4 cmd=run.pl feat_dim=40 -augment=false +augment='no_aug' verticle_shift=0 echo "$0 $@" @@ -36,9 +36,9 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --augment $augment \ + --feat-dim $feat_dim --augment_type $augment \ --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py deleted file mode 100755 index 34d38dd9c82..00000000000 --- a/egs/madcat_ar/v1/local/make_features.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2018 Hossein Hadian - -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - If an 'image2num_frames' file is found in the data dir, it will be used - to enforce the images to have the specified length in that file by padding - white pixels (the --padding option will be ignored in this case). This relates - to end2end chain training. - - eg. 
local/make_features.py data/train --feat-dim 40 -""" -import random -import argparse -import os -import sys -import numpy as np -from scipy import misc -import math - -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") -parser.add_argument('images_scp_path', type=str, - help='Path of images.scp file') -parser.add_argument('--allowed_len_file_path', type=str, default=None, - help='If supplied, each images will be padded to reach the ' - 'target length (this overrides --padding).') -parser.add_argument('--out-ark', type=str, default='-', - help='Where to write the output feature file') -parser.add_argument('--feat-dim', type=int, default=40, - help='Size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='Number of white pixels to pad on the left' - 'and right side of the image.') -parser.add_argument('--vertical-shift', type=int, default=16, - help='total number of padding pixel per column') -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - -def horizontal_pad(im, allowed_lengths = None): - if allowed_lengths is None: - left_padding = right_padding = args.padding - else: # Find an allowed length for the image - imlen = im.shape[1] # width - allowed_len = 0 - for l in allowed_lengths: - if l > imlen: - allowed_len = l - break - if allowed_len == 0: - # No allowed length was found for the image (the image is too long) - return None - padding = allowed_len - imlen - left_padding = int(padding // 2) - right_padding = padding - left_padding - dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) - return im_pad1 - -def vertical_shift(im, mode='mid'): - if args.vertical_shift == 0: - return im - total = args.vertical_shift - if mode == 'notmid': - val = random.randint(0, 1) - if val == 0: - mode = 'top' - else: - mode = 'bottom' - if mode == 'mid': - top = int(total / 2) - bottom = total - top - elif mode == 'top': # more padding on top - top = random.randint(total / 2, total) - bottom = total - top - elif mode == 'bottom': # more padding on bottom - top = random.randint(0, total / 2) - bottom = total - top - width = im.shape[1] - im_pad = np.concatenate( - (255 * np.ones((top, width), dtype=int) - - np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) - im_pad = np.concatenate( - (im_pad, 255 * np.ones((bottom, width), dtype=int) - - np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) - return im_pad - -### 
main ### -random.seed(1) -data_list_path = args.images_scp_path -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark,'w') - -allowed_lengths = None -allowed_len_handle = args.allowed_len_file_path -if os.path.isfile(allowed_len_handle): - print("Found 'allowed_lengths.txt' file...", file=sys.stderr) - allowed_lengths = [] - with open(allowed_len_handle) as f: - for line in f: - allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the " - "features.".format(len(allowed_lengths)), file=sys.stderr) - -num_fail = 0 -num_ok = 0 -aug_setting = ['mid', 'notmid'] -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: - num_fail += 1 - continue - if args.augment: - im_shift = vertical_shift(im_horizontal_padded, aug_setting[1]) - else: - im_shift = vertical_shift(im_horizontal_padded, aug_setting[0]) - data = np.transpose(im_shift, (1, 0)) - data = np.divide(data, 255.0) - num_ok += 1 - write_kaldi_matrix(out_fh, data, image_id) - -print('Generated features for {} images. Failed for {} (image too ' - 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh index e49112c9987..8251021acb7 100755 --- a/egs/madcat_ar/v1/local/tl/augment_data.sh +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -26,7 +26,7 @@ for set in $aug_set; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --augment true $datadir/augmentations/$set + --augment 'random_shift' $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh index 5277dc58a30..7263d45b062 100755 --- a/egs/madcat_ar/v1/local/tl/run_text_localization.sh +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -84,7 +84,8 @@ fi if [ $stage -le 2 ]; then for set in train; do echo "$(date) stage 2: Performing augmentation, it will double training data" - local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 \ + --verticle_shift $verticle_shift data/${set} data/${set}_aug data steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done fi diff --git a/egs/yomdle_fa/v1/local/augment_data.sh b/egs/yomdle_fa/v1/local/augment_data.sh index 34e938db069..20fb1f415d4 100755 --- a/egs/yomdle_fa/v1/local/augment_data.sh +++ b/egs/yomdle_fa/v1/local/augment_data.sh @@ -27,7 +27,7 @@ for set in aug1; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr $fliplr --augment true $datadir/augmentations/$set + --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/yomdle_fa/v1/local/extract_features.sh b/egs/yomdle_fa/v1/local/extract_features.sh index 7d6806a2712..f75837ae5b3 100755 --- 
a/egs/yomdle_fa/v1/local/extract_features.sh +++ b/egs/yomdle_fa/v1/local/extract_features.sh @@ -6,7 +6,7 @@ nj=4 cmd=run.pl feat_dim=40 fliplr=false -augment=false +augment='no_aug' num_channels=3 echo "$0 $@" @@ -35,7 +35,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \ + --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment_type $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/yomdle_tamil/v1/local/augment_data.sh b/egs/yomdle_tamil/v1/local/augment_data.sh index 82fa5230a43..da5213fba65 100755 --- a/egs/yomdle_tamil/v1/local/augment_data.sh +++ b/egs/yomdle_tamil/v1/local/augment_data.sh @@ -26,7 +26,7 @@ for set in aug1; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr false --augment true $datadir/augmentations/$set + --fliplr false --augment 'random_scale' $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/yomdle_tamil/v1/local/extract_features.sh b/egs/yomdle_tamil/v1/local/extract_features.sh index 4ed6ba04348..3880ebad3e8 100755 --- a/egs/yomdle_tamil/v1/local/extract_features.sh +++ b/egs/yomdle_tamil/v1/local/extract_features.sh @@ -9,7 +9,7 @@ nj=4 cmd=run.pl feat_dim=40 -augment=false +augment='no_aug' fliplr=false echo "$0 $@" @@ -38,7 +38,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/yomdle_zh/v1/local/augment_data.sh b/egs/yomdle_zh/v1/local/augment_data.sh index 34e938db069..20fb1f415d4 100755 --- a/egs/yomdle_zh/v1/local/augment_data.sh +++ b/egs/yomdle_zh/v1/local/augment_data.sh @@ -27,7 +27,7 @@ for set in aug1; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr $fliplr --augment true $datadir/augmentations/$set + --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/yomdle_zh/v1/local/extract_features.sh b/egs/yomdle_zh/v1/local/extract_features.sh index 7d6806a2712..f75837ae5b3 100755 --- a/egs/yomdle_zh/v1/local/extract_features.sh +++ b/egs/yomdle_zh/v1/local/extract_features.sh @@ -6,7 +6,7 @@ nj=4 cmd=run.pl feat_dim=40 fliplr=false -augment=false +augment='no_aug' num_channels=3 echo "$0 $@" @@ -35,7 +35,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \ + --feat-dim 
$feat_dim --num-channels $num_channels --fliplr $fliplr --augment_type $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp From 09da981a3678d3e9bc1ed82acf5c40a00878d3f7 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 9 Oct 2018 05:50:36 -0400 Subject: [PATCH 64/67] minor fix --- egs/madcat_ar/v1/local/tl/run_text_localization.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh index 7263d45b062..8d12f7d802f 100755 --- a/egs/madcat_ar/v1/local/tl/run_text_localization.sh +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -1,12 +1,13 @@ #!/bin/bash # Copyright 2017 Hossein Hadian # 2018 Ashish Arora -""" This script performs full page text recognition on automatically extracted line images - from madcat arabic data. It is created as a separate scrip, because it performs - data augmentation, uses smaller language model and calls process_waldo_data for - test images (automatically extracted line images). Data augmentation increases image - height hence requires different DNN arachitecture and different chain scripts. -""" + +# This script performs full page text recognition on automatically extracted line images +# from madcat arabic data. It is created as a separate scrip, because it performs +# data augmentation, uses smaller language model and calls process_waldo_data for +# test images (automatically extracted line images). Data augmentation increases image +# height hence requires different DNN arachitecture and different chain scripts. + set -e stage=0 nj=70 From 3d9615e8109fac2b5101e33f25b55586538d7925 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 9 Oct 2018 15:12:18 -0400 Subject: [PATCH 65/67] fixing bugs --- egs/cifar/v1/image/ocr/make_features.py | 32 +++++++++-------------- egs/madcat_ar/v1/local/tl/augment_data.sh | 2 ++ egs/yomdle_fa/v1/local/augment_data.sh | 3 +++ egs/yomdle_tamil/v1/local/augment_data.sh | 2 ++ egs/yomdle_zh/v1/local/augment_data.sh | 2 ++ 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py index 54d2b92ab25..2c4d44a1990 100755 --- a/egs/cifar/v1/image/ocr/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -71,7 +71,6 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") - def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding @@ -115,17 +114,9 @@ def get_scaled_image_aug(im, mode='normal'): return im_scaled_up return im -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - -def vertical_shift(im, mode='mid'): if args.vertical_shift == 0: return im +def vertical_shift(im, mode='normal'): + if args.vertical_shift == 0: + return im total = args.vertical_shift if mode == 'notmid': val = random.randint(0, 1) @@ -133,7 +124,7 @@ def vertical_shift(im, mode='mid'): mode = 'top' else: mode = 'bottom' - if mode == 'mid': + if mode == 'normal': top = int(total / 2) bottom = total - top elif mode == 'top': # more padding on top @@ -175,7 +166,9 @@ def vertical_shift(im, mode='mid'): if args.augment_type == 'random_scale': aug_setting = ['normal', 
     aug_setting = ['normal', 'scaled']
 elif args.augment_type == 'random_shift':
-    aug_setting = ['mid', 'notmid']
+    aug_setting = ['normal', 'notmid']
+else:
+    aug_setting = ['normal']
 
 with open(data_list_path) as f:
     for line in f:
@@ -186,19 +179,18 @@ def vertical_shift(im, mode='mid'):
         im = misc.imread(image_path)
         if args.fliplr:
             im = np.fliplr(im)
-        if args.augment_type == 'no_aug':
+        if args.augment_type == 'no_aug' or 'random_shift':
             im = get_scaled_image_aug(im, aug_setting[0])
-            im = vertical_shift(im, aug_setting[0])
         elif args.augment_type == 'random_scale':
             im = get_scaled_image_aug(im, aug_setting[1])
-            im = vertical_shift(im, aug_setting[0])
-        elif args.augment_type == 'random_shift':
-            im = get_scaled_image_aug(im, aug_setting[0])
-            im = vertical_shift(im, aug_setting[1])
         im = horizontal_pad(im, allowed_lengths)
         if im is None:
             num_fail += 1
             continue
+        if args.augment_type == 'no_aug' or 'random_scale':
+            im = vertical_shift(im, aug_setting[0])
+        elif args.augment_type == 'random_shift':
+            im = vertical_shift(im, aug_setting[1])
         if args.num_channels == 1:
             data = np.transpose(im, (1, 0))
         elif args.num_channels == 3:
diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh
index 8251021acb7..cc44aa58a62 100755
--- a/egs/madcat_ar/v1/local/tl/augment_data.sh
+++ b/egs/madcat_ar/v1/local/tl/augment_data.sh
@@ -8,6 +8,7 @@
 nj=4
 cmd=run.pl
 feat_dim=40
+verticle_shift=0
 
 echo "$0 $@"
 . ./cmd.sh
@@ -26,6 +27,7 @@ for set in $aug_set; do
     $srcdir $datadir/augmentations/$set
   cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
   local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --vertical-shift $verticle_shift \
     --augment 'random_shift' $datadir/augmentations/$set
 done
diff --git a/egs/yomdle_fa/v1/local/augment_data.sh b/egs/yomdle_fa/v1/local/augment_data.sh
index 20fb1f415d4..1c38bcb072d 100755
--- a/egs/yomdle_fa/v1/local/augment_data.sh
+++ b/egs/yomdle_fa/v1/local/augment_data.sh
@@ -9,6 +9,7 @@ nj=4
 cmd=run.pl
 feat_dim=40
 fliplr=false
+verticle_shift=0
 
 echo "$0 $@"
 . ./cmd.sh
@@ -27,7 +28,9 @@ for set in aug1; do
     $srcdir $datadir/augmentations/$set
   cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
   local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --vertical-shift $verticle_shift \
     --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set
+
 done
 
 echo " combine original data and data from different augmentations"
diff --git a/egs/yomdle_tamil/v1/local/augment_data.sh b/egs/yomdle_tamil/v1/local/augment_data.sh
index da5213fba65..136bfd24eb2 100755
--- a/egs/yomdle_tamil/v1/local/augment_data.sh
+++ b/egs/yomdle_tamil/v1/local/augment_data.sh
@@ -8,6 +8,7 @@
 nj=4
 cmd=run.pl
 feat_dim=40
+verticle_shift=0
 
 echo "$0 $@"
 . ./cmd.sh
@@ -26,6 +27,7 @@ for set in aug1; do
     $srcdir $datadir/augmentations/$set
   cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
   local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --vertical-shift $verticle_shift \
     --fliplr false --augment 'random_scale' $datadir/augmentations/$set
 done
diff --git a/egs/yomdle_zh/v1/local/augment_data.sh b/egs/yomdle_zh/v1/local/augment_data.sh
index 20fb1f415d4..1f13ed15ded 100755
--- a/egs/yomdle_zh/v1/local/augment_data.sh
+++ b/egs/yomdle_zh/v1/local/augment_data.sh
@@ -9,6 +9,7 @@ nj=4
 cmd=run.pl
 feat_dim=40
 fliplr=false
+verticle_shift=0
 
 echo "$0 $@"
 . ./cmd.sh
@@ -27,6 +28,7 @@ for set in aug1; do
     $srcdir $datadir/augmentations/$set
   cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
   local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --vertical-shift $verticle_shift \
    --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set
 done

From c33da9fa2d02015a0fcb351aa63555d50b679873 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Tue, 9 Oct 2018 16:19:53 -0400
Subject: [PATCH 66/67] adding documentation

---
 egs/madcat_ar/v1/local/tl/process_waldo_data.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/egs/madcat_ar/v1/local/tl/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py
index b7a24807c5a..0d278e64122 100755
--- a/egs/madcat_ar/v1/local/tl/process_waldo_data.py
+++ b/egs/madcat_ar/v1/local/tl/process_waldo_data.py
@@ -1,5 +1,13 @@
 #!/usr/bin/env python3
+""" This script reads the image and transcription mapping and creates the following files: text, utt2spk, images.scp.
+    Eg. local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test
+    Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ
+        utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
+        images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0
+        data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif
+"""
+
 
 import argparse
 import os
 import sys

From ee42879d06fa83ab87692fe89737bd2b2e3e82f7 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Tue, 9 Oct 2018 16:52:41 -0400
Subject: [PATCH 67/67] modification from review

---
 egs/cifar/v1/image/ocr/make_features.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py
index 2c4d44a1990..a11cbcc7a82 100755
--- a/egs/cifar/v1/image/ocr/make_features.py
+++ b/egs/cifar/v1/image/ocr/make_features.py
@@ -163,13 +163,6 @@ def vertical_shift(im, mode='normal'):
 num_fail = 0
 num_ok = 0
 
-if args.augment_type == 'random_scale':
-    aug_setting = ['normal', 'scaled']
-elif args.augment_type == 'random_shift':
-    aug_setting = ['normal', 'notmid']
-else:
-    aug_setting = ['normal']
-
 with open(data_list_path) as f:
     for line in f:
         line = line.strip()
@@ -180,17 +173,17 @@ def vertical_shift(im, mode='normal'):
         if args.fliplr:
             im = np.fliplr(im)
         if args.augment_type == 'no_aug' or 'random_shift':
-            im = get_scaled_image_aug(im, aug_setting[0])
+            im = get_scaled_image_aug(im, 'normal')
         elif args.augment_type == 'random_scale':
-            im = get_scaled_image_aug(im, aug_setting[1])
+            im = get_scaled_image_aug(im, 'scaled')
         im = horizontal_pad(im, allowed_lengths)
         if im is None:
             num_fail += 1
             continue
         if args.augment_type == 'no_aug' or 'random_scale':
-            im = vertical_shift(im, aug_setting[0])
+            im = vertical_shift(im, 'normal')
         elif args.augment_type == 'random_shift':
-            im = vertical_shift(im, aug_setting[1])
+            im = vertical_shift(im, 'notmid')
         if args.num_channels == 1:
             data = np.transpose(im, (1, 0))
         elif args.num_channels == 3:
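
Note on the augmentation dispatch in make_features.py above: in Python, a condition written as
"args.augment_type == 'no_aug' or 'random_shift'" is always true, because the non-empty string
'random_shift' is truthy on its own, so the following elif branch can never run. The snippet below
is a minimal sketch, not part of the committed patches; it shows the selection the surrounding code
appears to intend, reusing the names (args, im, get_scaled_image_aug, vertical_shift,
horizontal_pad, allowed_lengths) defined in that file.

# Sketch only; assumes the variables and helpers defined in
# egs/cifar/v1/image/ocr/make_features.py above.
if args.augment_type in ('no_aug', 'random_shift'):
    im = get_scaled_image_aug(im, 'normal')    # scale normally, no scale augmentation
elif args.augment_type == 'random_scale':
    im = get_scaled_image_aug(im, 'scaled')    # apply random scale augmentation

im = horizontal_pad(im, allowed_lengths)
if im is not None:
    if args.augment_type in ('no_aug', 'random_scale'):
        im = vertical_shift(im, 'normal')      # split vertical padding evenly (top = total/2)
    elif args.augment_type == 'random_shift':
        im = vertical_shift(im, 'notmid')      # randomly pad more at the top or the bottom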