-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFemnistDataset.py
112 lines (81 loc) · 3.16 KB
/
FemnistDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
FEMNIST dataset from LEAF
"""
import json
import os
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
class FemnistDataset(Dataset):
    """FEMNIST dataset read from LEAF-style JSON shards.

    Every ``*.json`` file under ``<root_dir>/train`` is loaded. Each file is
    expected to provide a ``'users'`` list and a ``'user_data'`` mapping of
    user name to ``{'x': ..., 'y': ...}``. All samples of all users are kept
    in ``data`` / ``targets``; each user's samples are additionally split
    into a training part and a validation part, and ``dict_users`` records
    the global sample indices of the part selected by ``train``.

    Note that ``len(dataset)`` counts every loaded sample (training and
    validation parts together); use ``dict_users`` to restrict to the
    selected part.
    """

    def __init__(self, root_dir, train=True, transform=None, random_seed=42):
        """
        Args:
            root_dir (string): Directory containing a ``train`` sub-directory
                with the JSON shards.
            train (bool): If True, ``dict_users`` holds the training indices
                of every user; otherwise it holds the validation indices.
            transform (callable, optional): Optional transform to be applied
                on a sample. It receives the whole ``(image, target)`` tuple.
            random_seed (int): Seed forwarded to the per-user train/validation
                split.
        """
        self.width = 28
        self.data = torch.tensor([])
        self.targets = torch.LongTensor([])
        self.clients = []
        self.groups = []
        self.dict_users = {}
        self.root_dir = root_dir
        self.transform = transform

        # Both the training and the validation part are carved out of the
        # "train" shards; the `train` flag only decides which indices are
        # exposed through dict_users.
        dataset = "train"
        shard_dir = os.path.join(root_dir, dataset)

        # Collect per-user tensors and concatenate once after the loop:
        # calling torch.cat per user re-copies everything loaded so far.
        image_chunks = []
        label_chunks = []
        # Global index of the first sample of the user being processed.
        offset = 0

        for filename in self._list_json_files(shard_dir):
            with open(os.path.join(shard_dir, filename), 'r',
                      encoding='utf-8') as inf:
                cdata = json.load(inf)

            # List of clients
            self.clients.extend(cdata['users'])

            for user, data in cdata['user_data'].items():
                n_samples = len(data['x'])
                if n_samples == 0:
                    # An empty tensor cannot be reshaped with -1; register
                    # the user with no samples instead of crashing.
                    self.dict_users.setdefault(user, [])
                    continue

                X = torch.reshape(torch.tensor(data['x']),
                                  (-1, self.width, self.width))
                y = torch.LongTensor(data['y'])
                train_idx, val_idx = self.train_val_dataset(
                    list(range(n_samples)), random_state=random_seed)

                image_chunks.append(X)
                label_chunks.append(y)

                # Local positions -> global positions. A user may show up in
                # several shards, so extend any existing entry.
                chosen = train_idx if train else val_idx
                self.dict_users.setdefault(user, []).extend(
                    offset + i for i in chosen)

                offset += len(X)

        # Keep the initial empty tensors in the concatenation so the
        # resulting dtypes match what incremental concatenation produced.
        self.data = torch.cat([self.data, *image_chunks])
        self.targets = torch.cat([self.targets, *label_chunks])

    @staticmethod
    def _list_json_files(directory):
        """Return the ``.json`` file names in ``directory``, sorted by name.

        ``os.listdir`` returns entries in arbitrary order; sorting keeps the
        global sample indices reproducible across machines and runs.
        """
        return sorted(
            name for name in os.listdir(directory) if name.endswith('.json'))

    def train_val_dataset(self, X, val_split=0.2, random_state=42):
        """
        Split a sequence into a train part and a validation part.

        Args:
            X (sequence): Items (here: sample positions) to split.
            val_split (float): Fraction of items assigned to validation.
            random_state (int): Seed for the shuffling done by the split.

        Returns:
            tuple: ``(train_items, val_items)``. Inputs with fewer than two
            items cannot be split, so they are returned entirely as the
            train part with an empty validation part.
        """
        if len(X) < 2:
            return list(X), []
        train_idx, val_idx = train_test_split(
            X, test_size=val_split, random_state=random_state)
        return train_idx, val_idx

    def __len__(self):
        """Return the number of loaded samples (train and validation)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image, target)`` sample stored at ``idx``.

        The image is reshaped to ``(1, width, width)``. If a transform was
        given, it is applied to the whole ``(image, target)`` tuple and its
        result is returned.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image = self.data[idx]
        target = self.targets[idx]
        sample = (
            torch.reshape(image, (1, self.width, self.width)),
            target)

        if self.transform is not None:
            sample = self.transform(sample)

        return sample