Update depends (#98)
* Update depends

* bump version

* cleanup metrics

* fix rdt version

* update docs
bcebere authored Jan 11, 2023
1 parent 3b7047a commit c7d3bc9
Showing 7 changed files with 5 additions and 76 deletions.
4 changes: 1 addition & 3 deletions setup.cfg
@@ -40,6 +40,7 @@ install_requires =
lifelines>=0.27
opacus>=1.3
decaf-synthetic-data>=0.1.5
rdt>=1.2.1
diffprivlib
shap
tqdm
@@ -48,11 +49,8 @@ install_requires =
cloudpickle
scipy
xgboost
copulas
dython
geomloss
ctgan
rdt
deepecho
pgmpy
optuna
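For context: the first hunk adds the rdt>=1.2.1 pin, and the second drops three of the previously unpinned entries (the file summary above reports 1 addition and 3 deletions), evidently including the old unpinned rdt line. A quick, illustrative check of what actually resolved locally after upgrading; this snippet is not part of the commit:

```python
# Illustrative check, not part of the commit: confirm which versions
# actually resolved in the local environment after upgrading synthcity.
from importlib.metadata import version

print(version("synthcity"))  # "0.1.2" after this commit
print(version("rdt"))        # should satisfy the new rdt>=1.2.1 pin
```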
47 changes: 0 additions & 47 deletions src/synthcity/metrics/eval_statistical.py
@@ -7,7 +7,6 @@
import numpy as np
import pandas as pd
import torch
from copulas.univariate.base import Univariate
from dython.nominal import associations
from geomloss import SamplesLoss
from pydantic import validate_arguments
@@ -258,52 +257,6 @@ def _evaluate(
return {"joint": float(score)}


class InverseCDFDistance(StatisticalEvaluator):
"""
.. inheritance-diagram:: synthcity.metrics.eval_statistical.InverseCDFDistance
:parts: 1
Evaluate the distance between continuous features."""

@validate_arguments(config=dict(arbitrary_types_allowed=True))
def __init__(self, p: int = 2, **kwargs: Any) -> None:
super().__init__(default_metric="marginal", **kwargs)

self.p = p

@staticmethod
def name() -> str:
return "inv_cdf_dist"

@staticmethod
def direction() -> str:
return "minimize"

@validate_arguments(config=dict(arbitrary_types_allowed=True))
def _evaluate(
self,
X_gt: DataLoader,
X_syn: DataLoader,
) -> Dict:
distances = []
for col in X_syn.columns:
if len(X_syn[col].unique()) < 15:
continue
syn_col = X_syn[col]
gt_col = X_gt[col]

predictor = Univariate()
predictor.fit(syn_col)

syn_percentiles = predictor.cdf(np.array(syn_col))
gt_percentiles = predictor.cdf(np.array(gt_col))
distances.append(
np.mean(abs(syn_percentiles - gt_percentiles[1]) ** self.p)
)

return {"marginal": float(self.reduction()(distances))}


class JensenShannonDistance(StatisticalEvaluator):
"""Evaluate the average Jensen-Shannon distance (metric) between two probability arrays."""

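For context, the deleted InverseCDFDistance evaluator was the only user of the copulas import removed above: it fit a copulas Univariate model to each continuous synthetic column and compared CDF values for real and synthetic data. A rough standalone sketch of the same idea, using an empirical CDF instead of copulas; this is an approximation for illustration, not the deleted implementation:

```python
# Rough, self-contained approximation of the removed metric's idea:
# evaluate the synthetic column's CDF at matched quantiles of the real
# and synthetic samples and average the p-th power of the gap.
# Uses an empirical CDF instead of copulas.univariate.Univariate.
import numpy as np


def inv_cdf_distance(gt_col: np.ndarray, syn_col: np.ndarray, p: int = 2) -> float:
    sorted_syn = np.sort(syn_col)

    def ecdf(values: np.ndarray) -> np.ndarray:
        # Fraction of synthetic samples <= each value.
        return np.searchsorted(sorted_syn, values, side="right") / len(sorted_syn)

    quantiles = np.linspace(0, 1, 101)
    gt_q = np.quantile(gt_col, quantiles)
    syn_q = np.quantile(syn_col, quantiles)
    # Note: the deleted code compared against gt_percentiles[1] (a single
    # element), which looks unintended; an elementwise comparison is used here.
    return float(np.mean(np.abs(ecdf(gt_q) - ecdf(syn_q)) ** p))


rng = np.random.default_rng(0)
print(inv_cdf_distance(rng.normal(size=500), rng.normal(size=500)))       # near 0
print(inv_cdf_distance(rng.normal(size=500), rng.normal(2.0, size=500)))  # larger
```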
2 changes: 1 addition & 1 deletion src/synthcity/plugins/core/models/tabular_gan.py
@@ -100,7 +100,7 @@ class TabularGAN(torch.nn.Module):
device: Any = DEVICE
CUDA/CPU
adjust_inference_sampling: bool
- Adjust the conditional probabilities to the ones in the training set. Active only with the ConditionalSampler
+ Adjust the marginal probabilities in the synthetic data to closer match the training set. Active only with the ConditionalSampler
# privacy settings
dp_enabled: bool
Train the discriminator with Differential Privacy guarantees
2 changes: 1 addition & 1 deletion src/synthcity/plugins/generic/plugin_ctgan.py
@@ -71,7 +71,7 @@ class CTGANPlugin(Plugin):
encoder_max_clusters: int
The max number of clusters to create for continuous columns when encoding
adjust_inference_sampling: bool
- Adjust the conditional probabilities to the ones in the training set. Active only with the ConditionalSampler
+ Adjust the marginal probabilities in the synthetic data to closer match the training set. Active only with the ConditionalSampler
# early stopping
n_iter_print: int
Number of iterations after which to print updates and check the validation loss.
2 changes: 1 addition & 1 deletion src/synthcity/plugins/privacy/plugin_adsgan.py
@@ -72,7 +72,7 @@ class AdsGANPlugin(Plugin):
encoder_max_clusters: int
The max number of clusters to create for continuous columns when encoding
adjust_inference_sampling: bool
- Adjust the conditional probabilities to the ones in the training set. Active only with the ConditionalSampler
+ Adjust the marginal probabilities in the synthetic data to closer match the training set. Active only with the ConditionalSampler
# early stopping
n_iter_print: int
Number of iterations after which to print updates and check the validation loss.
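The same docstring fix appears in TabularGAN, CTGANPlugin, and AdsGANPlugin above. A hedged usage sketch of the flag being documented, assuming Plugins().get() forwards keyword arguments to the plugin constructor; the snippet is illustrative and not taken from the commit:

```python
# Hedged sketch of the documented flag, not part of the commit.
# Assumes Plugins().get() forwards keyword arguments to CTGANPlugin.
from sklearn.datasets import load_iris

from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

X, y = load_iris(return_X_y=True, as_frame=True)
X["target"] = y
loader = GenericDataLoader(X)

# With adjust_inference_sampling enabled, generated marginals are nudged
# toward the training-set frequencies (active only with the ConditionalSampler).
model = Plugins().get("ctgan", adjust_inference_sampling=True)
model.fit(loader)
X_syn = model.generate(1000)
```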
2 changes: 1 addition & 1 deletion src/synthcity/version.py
@@ -1,4 +1,4 @@
__version__ = "0.1.1"
__version__ = "0.1.2"

MAJOR_VERSION = ".".join(__version__.split(".")[:-1])
MINOR_VERSION = __version__.split(".")[-1]
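For illustration, the two helpers above split the bumped version string as follows:

```python
# Illustration of the split for the new version string.
__version__ = "0.1.2"
assert ".".join(__version__.split(".")[:-1]) == "0.1"  # MAJOR_VERSION
assert __version__.split(".")[-1] == "2"               # MINOR_VERSION
```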
22 changes: 0 additions & 22 deletions tests/metrics/test_statistical.py
@@ -13,7 +13,6 @@
AlphaPrecision,
ChiSquaredTest,
FeatureCorrelation,
InverseCDFDistance,
InverseKLDivergence,
JensenShannonDistance,
KolmogorovSmirnovTest,
@@ -141,27 +140,6 @@ def test_evaluate_maximum_mean_discrepancy(kernel: str, test_plugin: Plugin) ->
assert MaximumMeanDiscrepancy.direction() == "minimize"


@pytest.mark.parametrize("test_plugin", [Plugins().get("dummy_sampler")])
def test_evaluate_inv_cdf_function(test_plugin: Plugin) -> None:
X, y = load_iris(return_X_y=True, as_frame=True)
X["target"] = y
Xloader = GenericDataLoader(X)

test_plugin.fit(Xloader)
X_gen = test_plugin.generate(1000)

syn_score, rnd_score = _eval_plugin(InverseCDFDistance, Xloader, X_gen)

for key in syn_score:
assert syn_score[key] > 0
assert rnd_score[key] > 0
assert syn_score[key] < rnd_score[key]

assert InverseCDFDistance.name() == "inv_cdf_dist"
assert InverseCDFDistance.type() == "stats"
assert InverseCDFDistance.direction() == "minimize"


@pytest.mark.parametrize("test_plugin", [Plugins().get("dummy_sampler")])
def test_evaluate_avg_jensenshannon_distance(test_plugin: Plugin) -> None:
X, y = load_iris(return_X_y=True, as_frame=True)
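The deleted test exercised InverseCDFDistance through the file's shared _eval_plugin helper; the surviving metrics keep the same pattern. A hedged sketch of scoring one of the remaining metrics directly, assuming the StatisticalEvaluator subclasses expose a public evaluate() wrapper around the _evaluate() shown earlier:

```python
# Hedged sketch: score a surviving statistical metric directly.
# Assumes evaluators expose evaluate(X_gt, X_syn) mirroring _evaluate().
from sklearn.datasets import load_iris

from synthcity.metrics.eval_statistical import JensenShannonDistance
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

X, y = load_iris(return_X_y=True, as_frame=True)
X["target"] = y
loader = GenericDataLoader(X)

plugin = Plugins().get("dummy_sampler")
plugin.fit(loader)
X_gen = plugin.generate(1000)

scores = JensenShannonDistance().evaluate(loader, X_gen)
print(scores)  # smaller values indicate a closer match between marginals
```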
