dataset_builder.py
# Copyright (C) 2020 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
import configparser
import data_preprocessing
from easydict import EasyDict as edict
from functools import partial
import numpy as np
import pathlib
import tensorflow as tf
# AUTOTUNE lets tf.data tune parallelism and buffer sizes dynamically at
# runtime for performance
AUTOTUNE = tf.data.experimental.AUTOTUNE
class DatasetBuilder:
def __init__(self, config_path='configs/sample_config.ini', one_hot=True):
"""Read and set configuration from config file (.ini file) and create
tf.Dataset object or input function according to configuration.
Args:
config_path: Str, path to config (.ini) file.
one_hot: Bool, whether or not to return label as one-hot encoding.
Raises:
ValueError: if values in config file does not have the correct type.
"""
# Set one-hot encoding setting
self.ONE_HOT = one_hot
# Parse config file
config = configparser.ConfigParser()
config.read(config_path)
# Get (and check) configuration
# Get dataset info
self._TRAIN_DATA_DIR = config.get('DATASET', 'TRAIN_DATA_DIR')
self._TEST_DATA_DIR = config.get('DATASET', 'TEST_DATA_DIR')
self._LABEL_FILE = config.get('DATASET', 'LABEL_FILE')
# Get image info
self._HEIGHT = config.getint('IMAGE', 'HEIGHT')
self._WIDTH = config.getint('IMAGE', 'WIDTH')
self._GRAYSCALE_IN = config.getboolean('IMAGE', 'GRAYSCALE_IN')
self._GRAYSCALE_OUT = config.getboolean('IMAGE', 'GRAYSCALE_OUT')
        # Get training and testing spec
self._TRAIN_BATCH_SIZE = config.getint('TRAIN_TEST_SPEC',
'TRAIN_BATCH_SIZE')
self._TEST_BATCH_SIZE = config.getint('TRAIN_TEST_SPEC',
'TEST_BATCH_SIZE')
self._SHUFFLE_BUFFER_SIZE = config.getint('TRAIN_TEST_SPEC',
'SHUFFLE_BUFFER_SIZE')
self._PREFETCH_BUFFER_SIZE = config.getint('TRAIN_TEST_SPEC',
'PREFETCH_BUFFER_SIZE')
# Get data augmentation spec
self._RANDOM_ROTATE = config.getboolean('DATA_AUGMENTATION',
'RANDOM_ROTATE')
self._ROTATE_STDDEV = config.getfloat('DATA_AUGMENTATION',
'ROTATE_STDDEV')
self._RANDOM_ZOOM = config.getboolean('DATA_AUGMENTATION',
'RANDOM_ZOOM')
self._ZOOM_MAX_PERCENT = config.getfloat('DATA_AUGMENTATION',
'ZOOM_MAX_PERCENT')
self._ZOOM_STDDEV = config.getfloat('DATA_AUGMENTATION', 'ZOOM_STDDEV')
self._RESIZE = config.getboolean('DATA_AUGMENTATION', 'RESIZE')
self._RESIZE_HEIGHT = config.getint('DATA_AUGMENTATION',
'RESIZE_HEIGHT')
self._RESIZE_WIDTH = config.getint('DATA_AUGMENTATION', 'RESIZE_WIDTH')
        # Label conversion: read class names (one per line) from label file
        with open(self._LABEL_FILE) as label_file:
            self._CLASS_NAMES = [line.strip() for line in label_file]
        self._NUM_CLASSES = len(self._CLASS_NAMES)  # Number of classes
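
    # For reference, a minimal sketch of the .ini layout this constructor
    # expects. Section and option names are taken from the reads above; the
    # values shown are purely illustrative:
    #
    #     [DATASET]
    #     TRAIN_DATA_DIR = data/train
    #     TEST_DATA_DIR = data/test
    #     LABEL_FILE = data/labels.txt
    #
    #     [IMAGE]
    #     HEIGHT = 32
    #     WIDTH = 32
    #     GRAYSCALE_IN = true
    #     GRAYSCALE_OUT = false
    #
    #     [TRAIN_TEST_SPEC]
    #     TRAIN_BATCH_SIZE = 32
    #     TEST_BATCH_SIZE = 32
    #     SHUFFLE_BUFFER_SIZE = 1000
    #     PREFETCH_BUFFER_SIZE = 1
    #
    #     [DATA_AUGMENTATION]
    #     RANDOM_ROTATE = true
    #     ROTATE_STDDEV = 0.05
    #     RANDOM_ZOOM = true
    #     ZOOM_MAX_PERCENT = 0.1
    #     ZOOM_STDDEV = 0.05
    #     RESIZE = true
    #     RESIZE_HEIGHT = 28
    #     RESIZE_WIDTH = 28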
def _get_data_preprocessing_fns(self):
"""Get multiple data preprocessing functions with partial positional
arguments assigned with corresponding configuration.
Returns: EasyDict, allowing accessing dict values as attributes.
"""
# Create new functions with partial positional arguments assigned
process_path_fn = \
partial(data_preprocessing.process_path,
one_hot=self.ONE_HOT,
num_classes=self._NUM_CLASSES,
class_names=self._CLASS_NAMES)
process_img_path_fn = data_preprocessing.process_img_path
convert_format_fn = \
partial(data_preprocessing.convert_format,
grayscale_in=self._GRAYSCALE_IN,
grayscale_out=self._GRAYSCALE_OUT)
random_rotate_fn = \
partial(data_preprocessing.random_rotate,
stddev=self._ROTATE_STDDEV)
random_zoom_fn = \
partial(data_preprocessing.random_zoom,
max_percent=self._ZOOM_MAX_PERCENT,
stddev=self._ZOOM_STDDEV,
img_height=self._HEIGHT,
img_width=self._WIDTH)
resize_fn = \
partial(data_preprocessing.resize,
height=self._HEIGHT,
width=self._WIDTH)
funcs = edict({'process_path': process_path_fn,
'process_img_path': process_img_path_fn,
'convert_format': convert_format_fn,
'random_rotate': random_rotate_fn,
'random_zoom': random_zoom_fn,
'resize': resize_fn})
return funcs
def get_train_dataset(self, filter_size=None):
"""Get training dataset. For the purpose of triplet selection, restrict
dataset to have certain number of labels if specified. See
https://www.tensorflow.org/addons/tutorials/losses_triplet.
Args:
filter_size: Int or None, if filter_size is None, do nothing.
If filter_size is Int, restrict datset to only have
filter_size number of classes. The classes to include
is randomly selected form all classes.
Returns:
ds: tf.Dataset, TensorFlow dataset object for training. Each entry
is (image, label) pair.
"""
# Get filename dataset (each entry is a filename)
data_dir = pathlib.Path(self._TRAIN_DATA_DIR)
list_ds = tf.data.Dataset.list_files(str(data_dir / '*'))
# Create data pre-processing functions
funcs = self._get_data_preprocessing_fns()
# Get labeled dataset (each entry is (image, label) tuple)
ds = list_ds.map(funcs.process_path, num_parallel_calls=AUTOTUNE)
        # Filter only when filter_size is truthy (not None and not 0)
        if filter_size:
            # Randomly choose filter_size classes to keep
            labels = tf.constant(np.random.choice(self._NUM_CLASSES,
                                                  filter_size,
                                                  replace=False))
            ds = ds.filter(lambda img, label:
                           tf.reduce_any(tf.equal(label, labels)))
# Format conversion
ds = ds.map(funcs.convert_format, num_parallel_calls=AUTOTUNE)
# Map rotate function
if self._RANDOM_ROTATE:
ds = ds.map(funcs.random_rotate, num_parallel_calls=AUTOTUNE)
# Map zoom-in function
if self._RANDOM_ZOOM:
ds = ds.map(funcs.random_zoom, num_parallel_calls=AUTOTUNE)
# Image resizing
ds = ds.map(funcs.resize, num_parallel_calls=AUTOTUNE)
        # Shuffle, batch, prefetch
ds = ds.shuffle(buffer_size=self._SHUFFLE_BUFFER_SIZE)
ds = ds.batch(self._TRAIN_BATCH_SIZE)
ds = ds.prefetch(buffer_size=self._PREFETCH_BUFFER_SIZE)
return ds
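
    # Illustrative usage (a minimal sketch; the config path and filter_size
    # value are examples, not requirements):
    #
    #     builder = DatasetBuilder('configs/sample_config.ini')
    #     train_ds = builder.get_train_dataset(filter_size=16)
    #     for images, labels in train_ds.take(1):
    #         pass  # images: [batch, H, W, C]; labels one-hot by default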
def get_test_dataset(self):
"""Get test dataset.
Returns:
ds: tf.Dataset, TensorFlow dataset object for testing. Each entry
is (image, label) pair.
"""
# Get filenames
data_dir = pathlib.Path(self._TEST_DATA_DIR)
list_ds = tf.data.Dataset.list_files(str(data_dir / '*'))
# Create data pre-processing functions
funcs = self._get_data_preprocessing_fns()
# Get labeled dataset
ds = list_ds.map(funcs.process_path, num_parallel_calls=AUTOTUNE)
# Format conversion
ds = ds.map(funcs.convert_format, num_parallel_calls=AUTOTUNE)
# Resizing
ds = ds.map(funcs.resize, num_parallel_calls=AUTOTUNE)
# Batch, prefetch
ds = ds.batch(self._TEST_BATCH_SIZE)
ds = ds.prefetch(buffer_size=self._PREFETCH_BUFFER_SIZE)
return ds
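
    # Illustrative usage (`model` is an assumed tf.keras model, not part of
    # this module):
    #
    #     builder = DatasetBuilder('configs/sample_config.ini')
    #     test_ds = builder.get_test_dataset()
    #     results = model.evaluate(test_ds)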
def get_filename_dataset(self, data_dir):
"""For prediciton only! No label file needed! Given a directory of
images, return datatset with images and filenames.
Args:
data_dir: Str, path to image directory.
Returns:
ds: tf.Dataset, TensorFlow dataset object for training. Each entry
is (image, filename) pair.
"""
# Get filenames
data_dir = pathlib.Path(data_dir)
list_ds = tf.data.Dataset.list_files(str(data_dir / '*'))
# Create data pre-processing functions
funcs = self._get_data_preprocessing_fns()
        # Build (image, filename) dataset; filenames stand in for labels
ds = list_ds.map(funcs.process_img_path, num_parallel_calls=AUTOTUNE)
# Format conversion
ds = ds.map(funcs.convert_format, num_parallel_calls=AUTOTUNE)
# Resizing
ds = ds.map(funcs.resize, num_parallel_calls=AUTOTUNE)
# Batch, prefetch
ds = ds.batch(1)
return ds
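
    # Illustrative usage for prediction (the directory path is hypothetical;
    # note each element is a batch of one):
    #
    #     builder = DatasetBuilder('configs/sample_config.ini')
    #     pred_ds = builder.get_filename_dataset('data/unlabeled_images')
    #     for image_batch, filename_batch in pred_ds:
    #         pass  # e.g. run a model on image_batch, keep filename_batch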
def get_train_input_fn(self, input_name):
"""For tf.estimator training. Create train_input_fn that returns a
tf.Dataset when called. Each entry in tf.Dataset is a
{input_name: image}, label pair.
Args:
input_name: Str, name of the input tensor. Required by tf.estimator.
Returns:
train_input_fn: Function, returns tf.Dataset when called.
"""
def train_input_fn():
# Get filenames
data_dir = pathlib.Path(self._TRAIN_DATA_DIR)
list_ds = tf.data.Dataset.list_files(str(data_dir / '*'))
# Create data pre-processing functions
funcs = self._get_data_preprocessing_fns()
# Get labeled dataset
ds = list_ds.map(funcs.process_path, num_parallel_calls=AUTOTUNE)
# Format conversion
ds = ds.map(funcs.convert_format, num_parallel_calls=AUTOTUNE)
# Map rotate function
if self._RANDOM_ROTATE:
ds = ds.map(funcs.random_rotate, num_parallel_calls=AUTOTUNE)
# Map zoom-in function
if self._RANDOM_ZOOM:
ds = ds.map(funcs.random_zoom, num_parallel_calls=AUTOTUNE)
# Resizing
ds = ds.map(funcs.resize, num_parallel_calls=AUTOTUNE)
# Prepare for tf.estimator
ds = ds.map(lambda img, label: ({input_name: img}, label))
# Shuffle, batch, repeat, prefetch
ds = ds.shuffle(buffer_size=self._SHUFFLE_BUFFER_SIZE)
ds = ds.batch(self._TRAIN_BATCH_SIZE)
ds = ds.repeat()
ds = ds.prefetch(buffer_size=self._PREFETCH_BUFFER_SIZE)
return ds
return train_input_fn
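
    # Illustrative tf.estimator usage (`estimator` and the input name are
    # assumptions, not defined in this module):
    #
    #     builder = DatasetBuilder('configs/sample_config.ini')
    #     train_input_fn = builder.get_train_input_fn('resnet50_input')
    #     estimator.train(input_fn=train_input_fn, steps=1000)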
def get_eval_input_fn(self, input_name):
"""For tf.estimator evaluation. Create eval_input_fn that returns a
tf.Dataset when called. Each entry in tf.Dataset is a
{input_name: image}, label pair.
Args:
input_name: Str, name of the input tensor. Required by tf.estimator.
Returns:
train_input_fn: Function, returns tf.Dataset when called.
"""
def eval_input_fn():
# Get filenames
data_dir = pathlib.Path(self._TEST_DATA_DIR)
list_ds = tf.data.Dataset.list_files(str(data_dir / '*'))
# Create data pre-processing functions
funcs = self._get_data_preprocessing_fns()
# Get labeled dataset
ds = list_ds.map(funcs.process_path, num_parallel_calls=AUTOTUNE)
# Format conversion
ds = ds.map(funcs.convert_format, num_parallel_calls=AUTOTUNE)
# Resizing
ds = ds.map(funcs.resize, num_parallel_calls=AUTOTUNE)
# Prepare for tf.estimator
ds = ds.map(lambda img, label: ({input_name: img}, label))
# Batch, prefetch
ds = ds.batch(self._TEST_BATCH_SIZE)
ds = ds.prefetch(buffer_size=self._PREFETCH_BUFFER_SIZE)
return ds
return eval_input_fn
if __name__ == "__main__":
    # Smoke test: build the evaluation input function and inspect one batch
    db = DatasetBuilder()
    db.ONE_HOT = False
    eval_input_fn = db.get_eval_input_fn(input_name='resnet50_input')
    eval_ds = eval_input_fn()
    # Alternatively: eval_ds = db.get_test_dataset()
    for features_batch, labels_batch in eval_ds.take(1):
        print(features_batch)
        print(labels_batch)