import os
import time
import numpy as np
import scipy.io
import sklearn.utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from data_utils import rotated_mnist, adni_dataset


def gen_batch(data_x, data_y, batch_size, num_iter):
    """NOTE: Deprecated in favor of `epoch_generator`."""
    # Start past the end so the first iteration triggers a shuffle and reset.
    index = len(data_y)
    for i in range(num_iter):
        index += batch_size
        if index + batch_size > len(data_y):
            index = 0
            data_x, data_y = sklearn.utils.shuffle(data_x, data_y, random_state=i + 1)
        batched_x = data_x[index:index + batch_size]
        batched_y = data_y[index:index + batch_size]
        yield (batched_x, batched_y)


def epochs_generator(data_x, data_y, batch_size, epochs=1, seed=None):
    """Generate batches over multiple epochs, reshuffling every epoch."""
    for ep in range(epochs):
        # Offset the seed per epoch so each epoch gets a different shuffle;
        # keep `None` as-is to preserve unseeded behavior (the original
        # `seed + ep` raised a TypeError when `seed` was None).
        epoch_seed = None if seed is None else seed + ep
        yield from epoch_generator(data_x, data_y, batch_size, seed=epoch_seed)


def epoch_generator(data_x, data_y, batch_size, seed=None):
    """Generate one epoch of shuffled batches, dropping the last partial batch."""
    data_x, data_y = sklearn.utils.shuffle(data_x, data_y, random_state=seed)
    # Drop the last partial batch by default.
    epoch_iters = len(data_x) // batch_size
    for i in range(epoch_iters):
        left, right = i * batch_size, (i + 1) * batch_size
        yield (data_x[left:right], data_y[left:right])
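

# Usage sketch (not part of the original module; the toy arrays below are
# illustrative only). With 10 examples and batch_size=4, each epoch yields
# 2 batches and drops the 2 leftover examples.
def _demo_epoch_generator():
    xs = np.arange(20).reshape(10, 2)
    ys = np.arange(10)
    for batch_x, batch_y in epochs_generator(xs, ys, batch_size=4, epochs=2, seed=0):
        print(batch_x.shape, batch_y.shape)  # -> (4, 2) (4,)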


def client_selection(seed, total, num_selected, weights=None):
    """Sample `num_selected` client indices out of `total` without replacement."""
    rng = np.random.default_rng(seed=seed)
    indices = rng.choice(range(total), num_selected, replace=False, p=weights)
    return indices
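

# Usage sketch (illustrative only): select 3 of 10 clients, optionally biasing
# selection toward clients with more data; `sizes` is a hypothetical example.
def _demo_client_selection():
    sizes = np.array([10, 20, 30, 40, 50, 10, 20, 30, 40, 50])
    weights = sizes / sizes.sum()  # `p=` must sum to 1
    print(client_selection(seed=0, total=10, num_selected=3, weights=weights))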


def print_log(message, fpath=None, stdout=True, print_time=False):
    """Print a message to stdout and/or append it to the file at `fpath`."""
    if print_time:
        timestr = time.strftime('%Y-%m-%d %a %H:%M:%S')
        message = f'{timestr} | {message}'
    if stdout:
        print(message)
    if fpath is not None:
        with open(fpath, 'a') as f:
            print(message, file=f)
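

# Usage sketch (illustrative only): `run.log` is a hypothetical path, created
# on first call since the file is opened in append mode.
def _demo_print_log():
    print_log('training started', fpath='run.log', print_time=True)

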
#########################
#### Dataset loading ####
#########################

def read_vehicle_data(data_dir='data/vehicle', seed=None, bias=False, density=1.0, standardize=True):
    """Read Vehicle dataset.

    Args:
      data_dir: directory that stores the `vehicle.mat` file.
      seed: random seed for generating the train/test split.
      bias: whether to append a column of 1s to the dataset (after standardizing)
        so that a model bias term is implicitly included.
      density: fraction of the training data on each client to keep; this does
        not affect test examples.
      standardize: whether to standardize features using per-silo training
        statistics (computed after density subsampling).
    """
    x_trains, y_trains, x_tests, y_tests = [], [], [], []
    mat = scipy.io.loadmat(os.path.join(data_dir, 'vehicle.mat'))
    raw_x, raw_y = mat['X'], mat['Y']  # y in {-1, 1}
    print('Vehicle dataset:')
    print('\tnumber of clients:', len(raw_x), len(raw_y))
    print('\tnumber of examples:', [len(raw_x[i][0]) for i in range(len(raw_x))])
    print('\tnumber of features:', len(raw_x[0][0][0]))
    print('\tSeed of dataset:', seed)
    print(f'\tUsing {density * 100:.2f}% of training data on each client')
    print(f'\tStandardizing using (density adjusted) training statistics: {standardize}')
    for i in range(len(raw_x)):
        features, label = raw_x[i][0], raw_y[i][0].flatten()
        x_train, x_test, y_train, y_test = train_test_split(
            features, label, test_size=0.25, random_state=seed)
        if density != 1:
            num_train_examples = int(density * len(x_train))
            # Leave unseeded so randomness differs across workers.
            train_mask = np.random.permutation(len(x_train))[:num_train_examples]
            x_train = x_train[train_mask]  # Mask before fitting the standard scaler.
            y_train = y_train[train_mask]
        if standardize:
            # Standardize with mean/std from training examples, within each silo.
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)
        if bias:
            x_train = np.c_[x_train, np.ones(len(x_train))]
            x_test = np.c_[x_test, np.ones(len(x_test))]
        x_trains.append(x_train)
        x_tests.append(x_test)
        # Binary labels can be ints or floats (float32 suffices).
        y_trains.append(y_train.astype(float))
        y_tests.append(y_test.astype(float))
    # Since different clients have different amounts of data, these are ragged arrays.
    return (np.array(x_trains, dtype=object), np.array(y_trains, dtype=object),
            np.array(x_tests, dtype=object), np.array(y_tests, dtype=object))
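

# Usage sketch (illustrative only): assumes `data/vehicle/vehicle.mat` exists.
# `read_gleam_data` below returns data in the same per-silo ragged format.
def _demo_read_vehicle_data():
    x_trains, y_trains, x_tests, y_tests = read_vehicle_data(seed=0, bias=True)
    for i in range(len(x_trains)):  # one entry per client/silo
        print(f'client {i}: train {x_trains[i].shape}, test {x_tests[i].shape}')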


def read_gleam_data(data_dir='data/gleam', seed=None, bias=False, density=1.0, standardize=True):
    """Read GLEAM (Google Glass) dataset.

    Args:
      data_dir: directory that stores the `gleam.mat` file.
      seed: random seed for generating the train/test split.
      bias: whether to append a column of 1s to the dataset (after standardizing)
        so that a model bias term is implicitly included.
      density: fraction of the training data on each client to keep; this does
        not affect test examples.
      standardize: whether to standardize features using per-silo training
        statistics (computed after density subsampling).
    """
    x_trains, y_trains, x_tests, y_tests = [], [], [], []
    mat = scipy.io.loadmat(os.path.join(data_dir, 'gleam.mat'))
    raw_x, raw_y = mat['X'][0], mat['Y'][0]  # y in {-1, 1}
    print('Google Glass (GLEAM) dataset:')
    print('\tnumber of clients:', len(raw_x), len(raw_y))
    print('\tnumber of examples:', [len(raw_x[i]) for i in range(len(raw_x))])
    print('\tnumber of features:', len(raw_x[0][0]))
    print('\tSeed of dataset:', seed)
    print(f'\tKeeping {density * 100:.2f}% of training data on each client')
    print(f'\tStandardizing using (density adjusted) training statistics: {standardize}')
    for i in range(len(raw_x)):
        features, label = raw_x[i], raw_y[i].flatten()
        x_train, x_test, y_train, y_test = train_test_split(
            features, label, test_size=0.25, random_state=seed)
        if density != 1:
            # Leave unseeded so randomness differs across workers.
            num_train_examples = int(density * len(x_train))
            train_mask = np.random.permutation(len(x_train))[:num_train_examples]
            x_train = x_train[train_mask]
            y_train = y_train[train_mask]
        if standardize:
            # Standardize with mean/std from training examples, within each silo.
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)
        if bias:
            # Append a column of ones to implicitly include a bias term.
            x_train = np.c_[x_train, np.ones(len(x_train))]
            x_test = np.c_[x_test, np.ones(len(x_test))]
        x_trains.append(x_train)
        x_tests.append(x_test)
        # Binary labels can be ints or floats (float32 suffices).
        y_trains.append(y_train.astype(float))
        y_tests.append(y_test.astype(float))
    # Since different clients have different amounts of data, these are ragged arrays.
    return (np.array(x_trains, dtype=object), np.array(y_trains, dtype=object),
            np.array(x_tests, dtype=object), np.array(y_tests, dtype=object))


def read_school_data(data_dir='data/school', test_frac=0.3, seed=None, bias=False, standardize=True, **__kwargs):
    """Read School dataset."""
    x_trains, y_trains, x_tests, y_tests = [], [], [], []
    mat = scipy.io.loadmat(os.path.join(data_dir, 'school.mat'))
    # Note that the raw data structure is different from Vehicle's.
    raw_x, raw_y = mat['X'][0], mat['Y'][0]  # y is the exam score
    print('School dataset:')
    print('\tnumber of clients:', len(raw_x), len(raw_y))
    print('\tnumber of examples:', [len(raw_x[i]) for i in range(len(raw_x))])
    print('\tnumber of features:', len(raw_x[0][0]))
    for i in range(len(raw_x)):  # For each client
        features, label = raw_x[i], raw_y[i].flatten()
        x_train, x_test, y_train, y_test = train_test_split(
            features, label, test_size=test_frac, random_state=seed)
        if standardize:
            # Standardize with mean/std from training examples, within each silo.
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)
            # For y (scores), use min/max normalization.
            min_y, max_y = 1, 70  # Hardcoded stats from the dataset.
            y_train = (y_train - min_y) / (max_y - min_y)
            y_test = (y_test - min_y) / (max_y - min_y)
        if bias:
            x_train = np.c_[x_train, np.ones(len(x_train))]
            x_test = np.c_[x_test, np.ones(len(x_test))]
        # Features / exam scores should be floats (if not standardized).
        x_trains.append(x_train.astype(float))
        x_tests.append(x_test.astype(float))
        y_trains.append(y_train.astype(float))
        y_tests.append(y_test.astype(float))
    # Since different clients have different amounts of data, these are ragged arrays.
    return (np.array(x_trains, dtype=object), np.array(y_trains, dtype=object),
            np.array(x_tests, dtype=object), np.array(y_tests, dtype=object))
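

# Usage sketch (illustrative only): assumes `data/school/school.mat` exists.
# With the default `standardize=True`, targets are min/max-normalized exam
# scores, so regression targets live in [0, 1].
def _demo_read_school_data():
    _, y_trains, _, _ = read_school_data(seed=0)
    print(y_trains[0].min(), y_trains[0].max())  # within [0, 1]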


def read_rotated_mnist_data(data_dir='data/rotated_mnist', **__kwargs):
    """Read rotated MNIST data."""
    try:
        x_trains = np.load(os.path.join(data_dir, 'train_images.npy'))
        y_trains = np.load(os.path.join(data_dir, 'train_labels.npy'))
        x_tests = np.load(os.path.join(data_dir, 'test_images.npy'))
        y_tests = np.load(os.path.join(data_dir, 'test_labels.npy'))
    except FileNotFoundError:
        x_trains, y_trains, x_tests, y_tests = rotated_mnist.rotated_mnist(save_dir=data_dir)
    assert len(x_trains) == len(y_trains) == len(x_tests) == len(y_tests)
    num_clients = len(x_trains)
    print('Rotated MNIST dataset:')
    print('\tnumber of clients:', num_clients)
    print('\tnumber of train examples:', [len(x_trains[i]) for i in range(num_clients)])
    print('\tnumber of test examples:', [len(x_tests[i]) for i in range(num_clients)])
    print('\tnumber of features:', x_trains[0][0].shape)
    return x_trains, y_trains, x_tests, y_tests


def read_rotated_patched_mnist_data(data_dir='data/rotated_patched_mnist',
                                    noise_level=0.5, patch_size=7,
                                    **__kwargs):
    """Read rotated and patched MNIST data."""
    suffix = f'noise{noise_level}_patch{patch_size}_area'
    try:
        x_trains = np.load(os.path.join(data_dir, f'train_images_{suffix}.npy'))
        y_trains = np.load(os.path.join(data_dir, f'train_labels_{suffix}.npy'))
        x_tests = np.load(os.path.join(data_dir, f'test_images_{suffix}.npy'))
        y_tests = np.load(os.path.join(data_dir, f'test_labels_{suffix}.npy'))
    except FileNotFoundError:
        x_trains, y_trains, x_tests, y_tests = rotated_mnist.rotated_patched_mnist(
            noise_level=noise_level, save_dir=data_dir)
    assert len(x_trains) == len(y_trains) == len(x_tests) == len(y_tests)
    num_clients = len(x_trains)
    print(f'Rotated + Patched MNIST dataset ({suffix}):')
    print('\tnumber of clients:', num_clients)
    print('\tnumber of train examples:', [len(x_trains[i]) for i in range(num_clients)])
    print('\tnumber of test examples:', [len(x_tests[i]) for i in range(num_clients)])
    print('\tnumber of features:', x_trains[0][0].shape)
    return x_trains, y_trains, x_tests, y_tests
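

# Usage sketch (illustrative only): if the cached .npy files are missing, the
# MNIST readers above fall back to regenerating the data via
# `data_utils.rotated_mnist`, which may be slow on first use.
def _demo_read_rotated_mnist_data():
    x_trains, _, x_tests, _ = read_rotated_mnist_data()
    print('client 0 image shape:', x_trains[0][0].shape)
    print('client 0 test examples:', len(x_tests[0]))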


def read_adni_data(data_dir='data/adni', seed=None, density=1.0, **__kwargs):
    """Read ADNI dataset, preprocessing from raw data if no cached copy exists."""
    print('ADNI dataset:')
    try:
        tag = f'_seed{seed}' if seed is not None else ''
        print(f'Loading dataset with tag "{tag}"...')
        # `allow_pickle` since we saved np object arrays (for ragged arrays).
        x_trains = np.load(os.path.join(data_dir, f'train_images{tag}.npy'), allow_pickle=True)
        y_trains = np.load(os.path.join(data_dir, f'train_labels{tag}.npy'), allow_pickle=True)
        x_tests = np.load(os.path.join(data_dir, f'test_images{tag}.npy'), allow_pickle=True)
        y_tests = np.load(os.path.join(data_dir, f'test_labels{tag}.npy'), allow_pickle=True)
        print('Loaded preprocessed ADNI dataset')
    except FileNotFoundError:
        print('Preprocessing ADNI dataset...')
        x_trains, y_trains, x_tests, y_tests = adni_dataset.read_data(seed=seed, save_dir=data_dir)
    # Check number of clients.
    assert len(x_trains) == len(y_trains) == len(x_tests) == len(y_tests)
    num_clients = len(x_trains)
    # Subsample deterministically since the dataset was shuffled during construction.
    if density < 1:
        print(f'Subsampling training sets to {density}')
        for i in range(num_clients):
            x_trains[i] = x_trains[i][:int(len(x_trains[i]) * density)]
            y_trains[i] = y_trains[i][:int(len(y_trains[i]) * density)]
    print('\tnumber of clients:', num_clients)
    print('\tnumber of train examples:', [len(x_trains[i]) for i in range(num_clients)])
    print('\tnumber of test examples:', [len(x_tests[i]) for i in range(num_clients)])
    print('\tnumber of features:', x_trains[0][0].shape)
    return x_trains, y_trains, x_tests, y_tests
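

if __name__ == '__main__':
    # Minimal smoke test (a sketch, not part of the original module): exercises
    # only the in-memory utilities, which need no dataset files on disk.
    _demo_epoch_generator()
    _demo_client_selection()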