 from scipy import signal
 import copy
 
+from tqdm import tqdm
+
 def set_seed(seed: int = 42) -> None:
     np.random.seed(seed)
     random.seed(seed)
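
Note: set_seed seeds numpy and random only, while the script also relies on torch (torch.save, and the .numpy() calls below suggest the samples are tensors). If reproducibility of the torch-side noise augmentation matters, seeding torch as well would help; a sketch of that extension (an editorial suggestion, not part of this PR):

```python
import random
import numpy as np
import torch

def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # assumption: the augmentation noise is torch-generated
```
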
@@ -45,33 +47,33 @@ class DatasetCreator:
         sample_len: length in seconds of each generated sample.
         overlap: percentage of overlapping between samples."""
     def __init__(self, preproc_data, label_kind,
-                 physio_f, gaze_f, block_len, sample_len, overlap):
+                 physio_f, gaze_f, block_len, sample_len, overlap, verbose=False):
         self.preproc_data = preproc_data
         self.label_kind = label_kind
         self.physio_f = physio_f
         self.gaze_f = gaze_f
         self.block_len = block_len
         self.sample_len = sample_len
         self.overlap = overlap
+        self.verbose = verbose
 
     def save_to_list(self):
         """Grabs data from hierarchical structure and unpacks all values.
-        Add gathered data to a list as as single sample."""
+        Add gathered data to a list as a single sample."""
         sub_dir = list_files(self.preproc_data, sorted_dir=False)
         data_list = []
         labels = []
-        for dir in sub_dir:  # for each subject/folder
+        for dir in tqdm(sub_dir, desc='Reading data'):  # for each subject/folder
             subj = int(dir[-2:])
-            print("Working on subject", subj)
 
             # Get labels for current subject and label kind
             all_labels = np.genfromtxt(os.path.join(dir, 'labels_felt{}.csv'  # returns ndarray
                                        .format(self.label_kind)), delimiter=',')
 
             # Get each original sample and create dataset samples
-            id_trials = [x.split("/")[-1].partition("_")[0] for x in list_files(dir, sorted_dir=False)]  # get beginning of files
+            id_trials = [x.split("\\")[-1].partition("_")[0] for x in list_files(dir, sorted_dir=False)]  # get beginning of files
             id_trials = sorted(np.unique(id_trials)[:-1], key=lambda x: int(x))  # remove duplicates, "label", and sort
-            for i, id in enumerate(id_trials):
+            for i, id in enumerate(tqdm(id_trials, desc=f'Subject {subj}')):
                 pupil_data = np.genfromtxt(os.path.join(dir, '{}_PUPIL.csv'
                                            .format(id)), delimiter=',')
                 gaze_data = np.genfromtxt(os.path.join(dir, '{}_GAZE_COORD.csv'
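
Note: the separator change in id_trials ("/" to "\\") makes the filename split Windows-specific. A portable alternative, sketched here only as an editorial suggestion (not part of this PR), would let os.path strip the directory part on either platform:

```python
import os

# os.path.basename drops the directory prefix regardless of the platform separator;
# list_files is this repository's own helper, used as in the loop above
id_trials = [os.path.basename(x).partition("_")[0]
             for x in list_files(dir, sorted_dir=False)]
```
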
@@ -110,7 +112,8 @@ def save_to_list(self):
                 gsr = gsr_data[k: k + n_points_sample_physio]
                 eeg = eeg_data[k: k + n_points_sample_physio]
                 ecg = ecg_data[k: k + n_points_sample_physio]
-
+
+
                 if (len(pupil) != n_points_sample_gaze or len(gaze_coord) != n_points_sample_gaze or len(eye_dist) != n_points_sample_gaze or
                     len(gsr) != n_points_sample_physio or len(eeg) != n_points_sample_physio or len(ecg) != n_points_sample_physio):
                     # sanity check on the samples
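
Note: the k-indexed slices above cut fixed-length windows out of each trial. A minimal sketch of the windowing arithmetic, assuming the constructor parameters shown earlier (sample_len in seconds, physio_f in Hz, overlap as a percentage per the class docstring) and the values passed in __main__:

```python
physio_f, block_len, sample_len, overlap = 128, 30, 10, 0  # values used in __main__

# one window = sample_len seconds of physiological signal at physio_f Hz
n_points_sample_physio = sample_len * physio_f             # 1280 points

# hop between window starts; overlap = 0 yields disjoint windows
step = n_points_sample_physio - int(n_points_sample_physio * overlap / 100)

# candidate starting indices k within one block_len-second trial
starts = range(0, block_len * physio_f - n_points_sample_physio + 1, step)
print(list(starts))  # [0, 1280, 2560] -> three 10 s windows per 30 s block
```
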
@@ -121,8 +124,9 @@ def save_to_list(self):
                 clean_gaze_coord = gaze_coord[gaze_coord != -1]
                 clean_eye_dist = eye_dist[eye_dist != -1]
                 if len(clean_pupil)/len(pupil) < 0.6 or len(clean_gaze_coord)/len(gaze_coord) < 0.6 or len(clean_eye_dist)/len(eye_dist) < 0.6:
-                    print("\033[93mGaze segment too noisy for subject: {}, sample: {}, segment:{}!\033[0m"
-                          .format(subj, id, str(j // (n_points_sample_gaze - overlap_step_gaze))))
+                    if self.verbose:
+                        print("\033[93mGaze segment too noisy for subject: {}, sample: {}, segment:{}!\033[0m"
+                              .format(subj, id, str(j // (n_points_sample_gaze - overlap_step_gaze))))
                     continue
 
                 # Create single variable containing all gaze information
@@ -145,7 +149,6 @@ def save_to_list(self):
 
         return data_list, labels
 
-
 def std_for_SNR(signal, noise, snr):
     '''Compute the gain to be applied to the noise to achieve the given SNR in dB'''
     signal_power = np.var(signal.numpy())
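
Note: the hunk shows only the first line of std_for_SNR. For reference, the standard closed form follows from SNR_dB = 10·log10(signal_power / (gain² · noise_power)); a sketch of how the rest of the function typically looks (the actual body falls outside this diff, so everything past signal_power is an assumption):

```python
import numpy as np
import torch

def std_for_SNR(signal: torch.Tensor, noise: torch.Tensor, snr: float) -> float:
    '''Compute the gain to be applied to the noise to achieve the given SNR in dB'''
    signal_power = np.var(signal.numpy())
    noise_power = np.var(noise.numpy())
    # solve 10*log10(signal_power / (g**2 * noise_power)) = snr for g
    return float(np.sqrt(signal_power / (noise_power * 10 ** (snr / 10))))
```
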
@@ -220,32 +223,39 @@ def load_dataset(data, labels, scaling, noise, m, SNR):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--path_to_csv', type=str)
-    parser.add_argument('--save_path', type=str)
-    parser.add_argument('--label_kind', type=str, default='Vlnc', help="Choose valence (Vlnc) or arousal (Arsl) label")
+    parser.add_argument('--preproc_data_path', type=str, default='hci-tagging-database/preproc_data', help='Path to folder where preprocessed data was saved')
+    parser.add_argument('--save_path', type=str, default='hci-tagging-database/torch_datasets', help='Path to save .pt files')
+    parser.add_argument('--label_kind', type=str, default='Arsl', help="Choose valence (Vlnc) or arousal (Arsl) label")
     parser.add_argument('--seed', type=int, default=0)
+    parser.add_argument('--verbose', type=bool, action=argparse.BooleanOptionalAction, default=False)
     args = parser.parse_args()
 
     assert args.label_kind in ["Arsl", "Vlnc"]
+    print("Creating dataset for label: ", args.label_kind)
 
     set_seed(args.seed)
 
-    d = DatasetCreator(args.path_to_csv, args.label_kind, physio_f=128, gaze_f=60, block_len=30, sample_len=10, overlap=0)  # create object
+    d = DatasetCreator(args.preproc_data_path, args.label_kind, physio_f=128, gaze_f=60, block_len=30, sample_len=10, overlap=0, verbose=args.verbose)  # create object
     data, labels = d.save_to_list()  # call method
+    print(len(data))
 
     test_size = 0.2
     X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=args.seed, stratify=labels)
 
+    # Augmentation
     m = 30  # Number of augmented signals for each original sample
     SNR = 5
 
     train_data = load_dataset(X_train, y_train, scaling=True, noise=True, m=m, SNR=SNR)
-    test_data = load_dataset(X_test, y_test, False, False, 1, None)
+    test_data = load_dataset(X_test, y_test, scaling=False, noise=False, m=1, SNR=None)
 
     print("Len train before augmentation: ", len(X_train))
     print("Len train after augmentation: ", len(train_data))
     print("Len test: ", len(test_data))
     print("Tot dataset: ", len(train_data) + len(test_data))
 
+    if not os.path.exists(args.save_path):
+        os.makedirs(args.save_path)
+
     torch.save(train_data, f'{args.save_path}/train_augmented_data_{args.label_kind}.pt')
-    torch.save(test_data, f'{args.save_path}/test_augmented_data_{args.label_kind}.pt')
+    torch.save(test_data, f'{args.save_path}/test_data_{args.label_kind}.pt')
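
Note: once the script has run, the saved datasets can be loaded back with torch.load; for example, with the default paths above and --label_kind Arsl:

```python
import torch

# paths follow the defaults introduced in this PR
train_data = torch.load('hci-tagging-database/torch_datasets/train_augmented_data_Arsl.pt')
test_data = torch.load('hci-tagging-database/torch_datasets/test_data_Arsl.pt')
print(len(train_data), len(test_data))
```
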