From aeaaa349d6feb415ecd0c98dfb59356f5513ce16 Mon Sep 17 00:00:00 2001 From: Boris van Breugel <48678682+bvanbreugel@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:36:38 +0000 Subject: [PATCH 1/6] reuse encoders --- src/synthcity/metrics/eval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/synthcity/metrics/eval.py b/src/synthcity/metrics/eval.py index 9f57b7af..9cdbfc37 100644 --- a/src/synthcity/metrics/eval.py +++ b/src/synthcity/metrics/eval.py @@ -200,15 +200,15 @@ def evaluate( if metrics is None: metrics = Metrics.list() - X_gt, _ = X_gt.encode() - X_syn, _ = X_syn.encode() + X_gt, encoders = X_gt.encode() + X_syn, _ = X_syn.encode(encoders) if X_train: - X_train, _ = X_train.encode() + X_train, _ = X_train.encode(encoders) if X_ref_syn: - X_ref_syn, _ = X_ref_syn.encode() + X_ref_syn, _ = X_ref_syn.encode(encoders) if X_augmented: - X_augmented, _ = X_augmented.encode() + X_augmented, _ = X_augmented.encode(encoders) scores = ScoreEvaluator() From 65b747ef5e9f41b929ab5b99490d30e104fdf385 Mon Sep 17 00:00:00 2001 From: Boris van Breugel Date: Wed, 21 Feb 2024 14:11:12 +0000 Subject: [PATCH 2/6] ensure categorical encoder is trained on real and synthetic --- src/synthcity/metrics/eval.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/synthcity/metrics/eval.py b/src/synthcity/metrics/eval.py index 9cdbfc37..06f2072f 100644 --- a/src/synthcity/metrics/eval.py +++ b/src/synthcity/metrics/eval.py @@ -1,9 +1,12 @@ # stdlib +import copy from pathlib import Path from typing import Dict, List, Optional, Union # third party +import numpy as np import pandas as pd +import torch from pydantic import validate_arguments # synthcity absolute @@ -200,9 +203,26 @@ def evaluate( if metrics is None: metrics = Metrics.list() + """ + We need to encode the categorical data in the real and synthetic data. 
+ To ensure each category in the two datasets are mapped to the same index, we merge X_syn into X_gt for computing the encoder. + """ + len_x_gt = len(X_gt.data) + if isinstance(X_gt.data, pd.DataFrame): + X_gt.data = pd.concat([X_gt.data, X_syn.data],axis=0) + elif isinstance(X_gt.data, torch.Tensor): + X_gt.data = torch.cat([X_gt.data, X_syn.data],axis=0) + elif isinstance(X_gt.data, np.ndarray): + X_gt.data = np.concatenate([X_gt.data, X_syn.data],axis=0) + X_gt, encoders = X_gt.encode() + # Reset the data to the original length, to remove the synthetic data + X_gt.data = X_gt.data.iloc[:len_x_gt] + + # Encode the synthetic data and other datasets X_syn, _ = X_syn.encode(encoders) + # TODO: Check whether the below also need to share the same encoders if X_train: X_train, _ = X_train.encode(encoders) if X_ref_syn: From 1272fb2ce7fff0be9ef9fcbf7ef21eace1a62b36 Mon Sep 17 00:00:00 2001 From: Boris van Breugel Date: Wed, 21 Feb 2024 14:33:51 +0000 Subject: [PATCH 3/6] better transformer --- src/synthcity/metrics/eval.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/synthcity/metrics/eval.py b/src/synthcity/metrics/eval.py index 06f2072f..ca78f372 100644 --- a/src/synthcity/metrics/eval.py +++ b/src/synthcity/metrics/eval.py @@ -207,19 +207,12 @@ def evaluate( We need to encode the categorical data in the real and synthetic data. To ensure each category in the two datasets are mapped to the same index, we merge X_syn into X_gt for computing the encoder. 
""" - len_x_gt = len(X_gt.data) - if isinstance(X_gt.data, pd.DataFrame): - X_gt.data = pd.concat([X_gt.data, X_syn.data],axis=0) - elif isinstance(X_gt.data, torch.Tensor): - X_gt.data = torch.cat([X_gt.data, X_syn.data],axis=0) - elif isinstance(X_gt.data, np.ndarray): - X_gt.data = np.concatenate([X_gt.data, X_syn.data],axis=0) + X_gt_df = X_gt.dataframe() + X_syn_df = X_syn.dataframe() + X_enc = create_from_info(pd.concat([X_gt_df, X_syn_df]), X_gt.info()) + _, encoders = X_enc.encode() - X_gt, encoders = X_gt.encode() - # Reset the data to the original length, to remove the synthetic data - X_gt.data = X_gt.data.iloc[:len_x_gt] - - # Encode the synthetic data and other datasets + X_gt, _ = X_gt.encode(encoders) X_syn, _ = X_syn.encode(encoders) # TODO: Check whether the below also need to share the same encoders From 8d50c2c958982eb9babe5078d548367c2a0d16e6 Mon Sep 17 00:00:00 2001 From: Boris van Breugel Date: Wed, 21 Feb 2024 14:42:02 +0000 Subject: [PATCH 4/6] remove unnecessary imports --- src/synthcity/metrics/eval.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/synthcity/metrics/eval.py b/src/synthcity/metrics/eval.py index ca78f372..80c1c42b 100644 --- a/src/synthcity/metrics/eval.py +++ b/src/synthcity/metrics/eval.py @@ -1,12 +1,9 @@ # stdlib -import copy from pathlib import Path from typing import Dict, List, Optional, Union # third party -import numpy as np import pandas as pd -import torch from pydantic import validate_arguments # synthcity absolute From e86d2e289da3ffc0a3f42c1f7d5ab6eb33127ec6 Mon Sep 17 00:00:00 2001 From: Boris van Breugel Date: Wed, 21 Feb 2024 15:09:21 +0000 Subject: [PATCH 5/6] better error message --- src/synthcity/metrics/eval.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/synthcity/metrics/eval.py b/src/synthcity/metrics/eval.py index 80c1c42b..07b3898d 100644 --- a/src/synthcity/metrics/eval.py +++ b/src/synthcity/metrics/eval.py @@ -201,7 +201,7 @@ def evaluate( metrics = 
Metrics.list() """ - We need to encode the categorical data in the real and synthetic data. + We need to encode the categorical data in the real and synthetic data. To ensure each category in the two datasets are mapped to the same index, we merge X_syn into X_gt for computing the encoder. """ X_gt_df = X_gt.dataframe() @@ -212,7 +212,8 @@ def evaluate( X_gt, _ = X_gt.encode(encoders) X_syn, _ = X_syn.encode(encoders) - # TODO: Check whether the below also need to share the same encoders + # TODO: Check whether the below also need to share the same encoders, and whether it's necessary to warn the user when + # there are classes that are not present in the training data but are present in one of these if X_train: X_train, _ = X_train.encode(encoders) if X_ref_syn: From d21d00891775e73d5f95510e0e5a66ba9709a5c1 Mon Sep 17 00:00:00 2001 From: Boris van Breugel Date: Thu, 22 Feb 2024 11:38:19 +0000 Subject: [PATCH 6/6] compatbility with DDIM --- src/synthcity/metrics/eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/synthcity/metrics/eval.py b/src/synthcity/metrics/eval.py index 07b3898d..c6d0fbd3 100644 --- a/src/synthcity/metrics/eval.py +++ b/src/synthcity/metrics/eval.py @@ -202,18 +202,18 @@ def evaluate( """ We need to encode the categorical data in the real and synthetic data. - To ensure each category in the two datasets are mapped to the same index, we merge X_syn into X_gt for computing the encoder. + To ensure each category in the two datasets is mapped to the same one-hot vector, we merge X_syn into X_gt for computing the encoder. + TODO: Check whether the optional datasets also need to be taken into account when getting the encoder. 
""" X_gt_df = X_gt.dataframe() X_syn_df = X_syn.dataframe() X_enc = create_from_info(pd.concat([X_gt_df, X_syn_df]), X_gt.info()) _, encoders = X_enc.encode() + # now we encode the data X_gt, _ = X_gt.encode(encoders) X_syn, _ = X_syn.encode(encoders) - # TODO: Check whether the below also need to share the same encoders, and whether it's necessary to warn the user when - # there are classes that are not present in the training data but are present in one of these if X_train: X_train, _ = X_train.encode(encoders) if X_ref_syn: