Update depends (#98)
* Update depends

* bump version

* cleanup metrics

* fix rdt version

* update docs
bcebere authored Jan 11, 2023
1 parent 3b7047a commit c7d3bc9
Showing 7 changed files with 5 additions and 76 deletions.
4 changes: 1 addition & 3 deletions setup.cfg
@@ -40,6 +40,7 @@ install_requires =
lifelines>=0.27
opacus>=1.3
decaf-synthetic-data>=0.1.5
rdt>=1.2.1
diffprivlib
shap
tqdm
@@ -48,11 +49,8 @@ install_requires =
cloudpickle
scipy
xgboost
copulas
dython
geomloss
ctgan
rdt
deepecho
pgmpy
optuna
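For context: the first hunk adds the rdt>=1.2.1 pin, and the second drops three of the previously unpinned entries (the file summary above reports 1 addition and 3 deletions), evidently including the old unpinned rdt line. A quick, illustrative check of what actually resolved locally after upgrading; this snippet is not part of the commit:

```python
# Illustrative check, not part of the commit: confirm which versions
# actually resolved in the local environment after upgrading synthcity.
from importlib.metadata import version

print(version("synthcity"))  # "0.1.2" after this commit
print(version("rdt"))        # should satisfy the new rdt>=1.2.1 pin
```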
47 changes: 0 additions & 47 deletions src/synthcity/metrics/eval_statistical.py
@@ -7,7 +7,6 @@
import numpy as np
import pandas as pd
import torch
from copulas.univariate.base import Univariate
from dython.nominal import associations
from geomloss import SamplesLoss
from pydantic import validate_arguments
@@ -258,52 +257,6 @@ def _evaluate(
return {"joint": float(score)}


class InverseCDFDistance(StatisticalEvaluator):
"""
.. inheritance-diagram:: synthcity.metrics.eval_statistical.InverseCDFDistance
:parts: 1
Evaluate the distance between continuous features."""

@validate_arguments(config=dict(arbitrary_types_allowed=True))
def __init__(self, p: int = 2, **kwargs: Any) -> None:
super().__init__(default_metric="marginal", **kwargs)

self.p = p

@staticmethod
def name() -> str:
return "inv_cdf_dist"

@staticmethod
def direction() -> str:
return "minimize"

@validate_arguments(config=dict(arbitrary_types_allowed=True))
def _evaluate(
self,
X_gt: DataLoader,
X_syn: DataLoader,
) -> Dict:
distances = []
for col in X_syn.columns:
if len(X_syn[col].unique()) < 15:
continue
syn_col = X_syn[col]
gt_col = X_gt[col]

predictor = Univariate()
predictor.fit(syn_col)

syn_percentiles = predictor.cdf(np.array(syn_col))
gt_percentiles = predictor.cdf(np.array(gt_col))
distances.append(
np.mean(abs(syn_percentiles - gt_percentiles[1]) ** self.p)
)

return {"marginal": float(self.reduction()(distances))}


class JensenShannonDistance(StatisticalEvaluator):
"""Evaluate the average Jensen-Shannon distance (metric) between two probability arrays."""

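For context, the deleted InverseCDFDistance evaluator was the only user of the copulas import removed above: it fit a copulas Univariate model to each continuous synthetic column and compared CDF values for real and synthetic data. A rough standalone sketch of the same idea, using an empirical CDF instead of copulas; this is an approximation for illustration, not the deleted implementation:

```python
# Rough, self-contained approximation of the removed metric's idea:
# evaluate the synthetic column's CDF at matched quantiles of the real
# and synthetic samples and average the p-th power of the gap.
# Uses an empirical CDF instead of copulas.univariate.Univariate.
import numpy as np


def inv_cdf_distance(gt_col: np.ndarray, syn_col: np.ndarray, p: int = 2) -> float:
    sorted_syn = np.sort(syn_col)

    def ecdf(values: np.ndarray) -> np.ndarray:
        # Fraction of synthetic samples <= each value.
        return np.searchsorted(sorted_syn, values, side="right") / len(sorted_syn)

    quantiles = np.linspace(0, 1, 101)
    gt_q = np.quantile(gt_col, quantiles)
    syn_q = np.quantile(syn_col, quantiles)
    # Note: the deleted code compared against gt_percentiles[1] (a single
    # element), which looks unintended; an elementwise comparison is used here.
    return float(np.mean(np.abs(ecdf(gt_q) - ecdf(syn_q)) ** p))


rng = np.random.default_rng(0)
print(inv_cdf_distance(rng.normal(size=500), rng.normal(size=500)))       # near 0
print(inv_cdf_distance(rng.normal(size=500), rng.normal(2.0, size=500)))  # larger
```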
2 changes: 1 addition & 1 deletion src/synthcity/plugins/core/models/tabular_gan.py
@@ -100,7 +100,7 @@ class TabularGAN(torch.nn.Module):
device: Any = DEVICE
CUDA/CPU
adjust_inference_sampling: bool
- Adjust the conditional probabilities to the ones in the training set. Active only with the ConditionalSampler
+ Adjust the marginal probabilities in the synthetic data to closer match the training set. Active only with the ConditionalSampler
# privacy settings
dp_enabled: bool
Train the discriminator with Differential Privacy guarantees
2 changes: 1 addition & 1 deletion src/synthcity/plugins/generic/plugin_ctgan.py
@@ -71,7 +71,7 @@ class CTGANPlugin(Plugin):
encoder_max_clusters: int
The max number of clusters to create for continuous columns when encoding
adjust_inference_sampling: bool
- Adjust the conditional probabilities to the ones in the training set. Active only with the ConditionalSampler
+ Adjust the marginal probabilities in the synthetic data to closer match the training set. Active only with the ConditionalSampler
# early stopping
n_iter_print: int
Number of iterations after which to print updates and check the validation loss.
2 changes: 1 addition & 1 deletion src/synthcity/plugins/privacy/plugin_adsgan.py
@@ -72,7 +72,7 @@ class AdsGANPlugin(Plugin):
encoder_max_clusters: int
The max number of clusters to create for continuous columns when encoding
adjust_inference_sampling: bool
- Adjust the conditional probabilities to the ones in the training set. Active only with the ConditionalSampler
+ Adjust the marginal probabilities in the synthetic data to closer match the training set. Active only with the ConditionalSampler
# early stopping
n_iter_print: int
Number of iterations after which to print updates and check the validation loss.
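The same docstring fix appears in TabularGAN, CTGANPlugin, and AdsGANPlugin above. A hedged usage sketch of the flag being documented, assuming Plugins().get() forwards keyword arguments to the plugin constructor; the snippet is illustrative and not taken from the commit:

```python
# Hedged sketch of the documented flag, not part of the commit.
# Assumes Plugins().get() forwards keyword arguments to CTGANPlugin.
from sklearn.datasets import load_iris

from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

X, y = load_iris(return_X_y=True, as_frame=True)
X["target"] = y
loader = GenericDataLoader(X)

# With adjust_inference_sampling enabled, generated marginals are nudged
# toward the training-set frequencies (active only with the ConditionalSampler).
model = Plugins().get("ctgan", adjust_inference_sampling=True)
model.fit(loader)
X_syn = model.generate(1000)
```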
2 changes: 1 addition & 1 deletion src/synthcity/version.py
@@ -1,4 +1,4 @@
__version__ = "0.1.1"
__version__ = "0.1.2"

MAJOR_VERSION = ".".join(__version__.split(".")[:-1])
MINOR_VERSION = __version__.split(".")[-1]
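For illustration, the two helpers above split the bumped version string as follows:

```python
# Illustration of the split for the new version string.
__version__ = "0.1.2"
assert ".".join(__version__.split(".")[:-1]) == "0.1"  # MAJOR_VERSION
assert __version__.split(".")[-1] == "2"               # MINOR_VERSION
```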
22 changes: 0 additions & 22 deletions tests/metrics/test_statistical.py
@@ -13,7 +13,6 @@
AlphaPrecision,
ChiSquaredTest,
FeatureCorrelation,
InverseCDFDistance,
InverseKLDivergence,
JensenShannonDistance,
KolmogorovSmirnovTest,
@@ -141,27 +140,6 @@ def test_evaluate_maximum_mean_discrepancy(kernel: str, test_plugin: Plugin) ->
assert MaximumMeanDiscrepancy.direction() == "minimize"


@pytest.mark.parametrize("test_plugin", [Plugins().get("dummy_sampler")])
def test_evaluate_inv_cdf_function(test_plugin: Plugin) -> None:
X, y = load_iris(return_X_y=True, as_frame=True)
X["target"] = y
Xloader = GenericDataLoader(X)

test_plugin.fit(Xloader)
X_gen = test_plugin.generate(1000)

syn_score, rnd_score = _eval_plugin(InverseCDFDistance, Xloader, X_gen)

for key in syn_score:
assert syn_score[key] > 0
assert rnd_score[key] > 0
assert syn_score[key] < rnd_score[key]

assert InverseCDFDistance.name() == "inv_cdf_dist"
assert InverseCDFDistance.type() == "stats"
assert InverseCDFDistance.direction() == "minimize"


@pytest.mark.parametrize("test_plugin", [Plugins().get("dummy_sampler")])
def test_evaluate_avg_jensenshannon_distance(test_plugin: Plugin) -> None:
X, y = load_iris(return_X_y=True, as_frame=True)
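The deleted test exercised InverseCDFDistance through the file's shared _eval_plugin helper; the surviving metrics keep the same pattern. A hedged sketch of scoring one of the remaining metrics directly, assuming the StatisticalEvaluator subclasses expose a public evaluate() wrapper around the _evaluate() shown earlier:

```python
# Hedged sketch: score a surviving statistical metric directly.
# Assumes evaluators expose evaluate(X_gt, X_syn) mirroring _evaluate().
from sklearn.datasets import load_iris

from synthcity.metrics.eval_statistical import JensenShannonDistance
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

X, y = load_iris(return_X_y=True, as_frame=True)
X["target"] = y
loader = GenericDataLoader(X)

plugin = Plugins().get("dummy_sampler")
plugin.fit(loader)
X_gen = plugin.generate(1000)

scores = JensenShannonDistance().evaluate(loader, X_gen)
print(scores)  # smaller values indicate a closer match between marginals
```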
