Optuna hyperparameter optimization tutorial #178

Merged: 115 commits, Apr 25, 2023

Commits
2ecba4f
first commit for the addition of the TabDDPM plugin
Mar 1, 2023
fed898b
Add DDPM test script and update DDPM plugin
Mar 3, 2023
34979cf
add TabDDPM class and refactor
Mar 5, 2023
0abdc01
handle discrete cols and label generation
Mar 7, 2023
405a052
add hparam space and update tests of DDPM
Mar 7, 2023
0e36041
debug and test DDPM
Mar 7, 2023
fc9cee0
update TensorDataLoader and training loop
Mar 7, 2023
d8b57ad
clear bugs
Mar 7, 2023
92dcc32
debug for regression tasks
Mar 7, 2023
0b9d0e3
debug for regression tasks; ALL TESTS PASSED
Mar 7, 2023
6e6fe41
Merge branch 'tab_ddpm' of https://github.com/TZTsai/synthcity into t…
Mar 7, 2023
bb98229
remove the official repo of TabDDPM
Mar 7, 2023
b4486a4
passed all pre-commit checks
Mar 8, 2023
2a9aa2a
convert assert to conditional AssertionErrors
Mar 8, 2023
246cd5b
added an auto annotation tool
Mar 10, 2023
f458bb4
update auto-anno and generate annotations
Mar 10, 2023
137c176
remove auto-anno and flake8 noqa
Mar 10, 2023
6c4af11
add python<3.9 compatible annotations
Mar 10, 2023
191cdcc
remove star import
Mar 10, 2023
9349a66
replace builtin type annos to typing annos
Mar 12, 2023
02579e9
resolve py38 compatibility issue
Mar 12, 2023
f930bc0
tests/plugins/generic/test_ddpm.py
Mar 12, 2023
3cf73d7
change TabDDPM method signatures
Mar 13, 2023
5d37c4b
remove Iterator subscription
Mar 13, 2023
681ba60
update AssertionErrors, add EarlyStop callback, removed additional ML…
Mar 15, 2023
bcbc131
Merge branch 'main' into tab_ddpm
bcebere Mar 15, 2023
a9438dc
remove TensorDataLoader, update test_ddpm
Mar 16, 2023
52be80f
update EarlyStopping
Mar 16, 2023
794ebd6
add TabDDPM tutorial, update TabDDPM plugin and encoders
Mar 27, 2023
bcdce4b
add TabDDPM tutorial
Mar 27, 2023
8120e97
major update of FeatureEncoder and TabularEncoder
Mar 30, 2023
2750791
add LogDistribution and LogIntDistribution
Mar 30, 2023
52011d3
update DDPM to use TabularEncoder
Mar 30, 2023
0ee6c8b
update test_tabular_encoder and debug
Mar 30, 2023
244854d
debug and DDPM tutorial OK
Mar 30, 2023
e336d3c
Merge branch 'main' of https://github.com/vanderschaarlab/synthcity
Mar 30, 2023
c847c95
Merge branch 'main' into tab_ddpm
Mar 30, 2023
428177b
debug LogDistribution and LogIntDistribution
Mar 31, 2023
3377a95
Merge branch 'main' into tab_ddpm
Mar 31, 2023
4705319
change discrete encoding of BinEncoder to passthrough; passed all te…
Apr 1, 2023
d9d73f1
add tabnet to plugins/core/models
Apr 2, 2023
d29ef37
add factory.py, let DDPM use TabNet, refactor
Apr 2, 2023
6e58cf3
update docstrings and refactor
Apr 2, 2023
2a6ca6f
fix type annotation compatibility
Apr 2, 2023
36acaa0
make SkipConnection serializable
Apr 3, 2023
de15b9b
fix TabularEncoder.activation_layout
Apr 3, 2023
694cd22
remove unnecessary code
Apr 3, 2023
a459785
fix minor bug and add more nn models in factory
Apr 6, 2023
57816b6
update pandas and torch version requirement
Apr 6, 2023
cc7e8fb
update pandas and torch version requirement
Apr 6, 2023
f20db25
Merge branch 'main' into tabnet
Apr 6, 2023
7b0c19a
Merge branch 'main' into tab_ddpm
Apr 6, 2023
8a58996
update ddpm tutorial
Apr 6, 2023
31b5f13
Merge branch 'tab_ddpm' of https://github.com/TZTsai/synthcity into t…
Apr 6, 2023
cef348e
restore setup.cfg
Apr 6, 2023
9cb5da1
restore setup.cfg
Apr 6, 2023
fe5ff25
replace LabelEncoder with OrdinalEncoder
Apr 7, 2023
2922a1d
update setup.cfg
Apr 7, 2023
11fb825
update setup.cfg
Apr 7, 2023
9222b4e
debug datetimeDistribution
Apr 7, 2023
7d55c65
Merge branch 'tab_ddpm' into tabnet
Apr 7, 2023
95302b9
clean
Apr 7, 2023
785db82
update setup.cfg and goggle test
Apr 7, 2023
44ead6d
Merge branch 'tab_ddpm' into tabnet
Apr 7, 2023
27cc95c
move DDPM tutorial to tutorials/plugins
Apr 7, 2023
1d7c77c
update tabnet.py reference
Apr 7, 2023
6c25377
update tab_ddpm
Apr 7, 2023
3623d37
update distribution, add optuna utils and tutorial
Apr 8, 2023
2fb8508
update
Apr 8, 2023
5adfabf
Fix plugin type of static_model of fflows
Apr 8, 2023
a2a88c5
update intlogdistribution and tutorial
Apr 8, 2023
4a7e73b
try fixing goggle
Apr 8, 2023
8051caa
add more activations
Apr 8, 2023
3cd9917
minor fix
Apr 8, 2023
42cbe8c
update
Apr 9, 2023
101c76f
Merge branch 'tab_ddpm' into tabnet
Apr 9, 2023
7c58f2d
update
Apr 9, 2023
104e3a3
update
Apr 9, 2023
7b4e04a
update
Apr 9, 2023
fede549
Update tabular_encoder.py
Apr 10, 2023
539effa
Update test_goggle.py
Apr 10, 2023
0cb9f25
Update tabular_encoder.py
Apr 10, 2023
42c6941
update
Apr 10, 2023
d7d966d
update tutorial 8
Apr 10, 2023
e20e581
update
Apr 10, 2023
472ad52
default cat nonlin of goggle <- gumbel_softmax
Apr 10, 2023
5dbe666
get_nonlin('softmax') <- GumbelSoftmax()
Apr 10, 2023
74e897b
remove debug logging
Apr 10, 2023
27553e9
update
Apr 10, 2023
b5eb2e7
Merge branch 'tab_ddpm' into tabnet
Apr 10, 2023
7fc5ce4
update
Apr 10, 2023
7aeba49
Merge branch 'main' into optuna_tutorial
Apr 16, 2023
8af4966
Merge branch 'main' into tabnet
robsdavis Apr 18, 2023
b8c9522
fix merge
Apr 18, 2023
ecc9d08
fix merge
Apr 18, 2023
c2775ba
update pip upgrade commands in workflows
Apr 19, 2023
1d9c7a4
update pip upgrade commands in workflows
Apr 19, 2023
385d2ed
keep pip version to 23.0.1 in workflows
Apr 19, 2023
81fb12b
keep pip version to 23.0.1 in workflows
Apr 19, 2023
68d6911
Merge branch 'tabnet' into optuna_tutorial
Apr 20, 2023
3884fc4
update
Apr 20, 2023
90f60be
Merge branch 'main' into optuna_tutorial
Apr 20, 2023
7640f35
update
Apr 20, 2023
c91246b
update
Apr 20, 2023
38fc796
update
Apr 20, 2023
899a9d8
update
Apr 20, 2023
60fa08d
update
Apr 20, 2023
50a77c5
fix distribution
Apr 20, 2023
7ed4ab2
Merge branch 'main' into tabnet
Apr 20, 2023
b0036e9
Merge branch 'tabnet' into optuna_tutorial
Apr 20, 2023
05eee67
resolve merge conflicts
Apr 20, 2023
fbf5aad
Merge branch 'tab_ddpm' into optuna_tutorial
Apr 20, 2023
727662f
update
Apr 20, 2023
212d7cb
move upgrading of wheel to prereq.txt
Apr 24, 2023
d8e63c3
update
Apr 24, 2023
132 changes: 71 additions & 61 deletions src/synthcity/plugins/core/distribution.py
@@ -111,17 +111,25 @@ def as_constraint(self) -> Constraints:

@abstractmethod
def min(self) -> Any:
"Get the min value of the distribution"
"""Get the min value of the distribution."""
...

@abstractmethod
def max(self) -> Any:
"Get the max value of the distribution"
"""Get the max value of the distribution."""
...

@abstractmethod
def __eq__(self, other: Any) -> bool:
...
return type(self) == type(other) and self.get() == other.get()

def __contains__(self, item: Any) -> bool:
"""
Example:
>>> dist = CategoricalDistribution(name="foo", choices=["a", "b", "c"])
>>> "a" in dist
True
"""
return self.has(item)

@abstractmethod
def dtype(self) -> str:
@@ -146,7 +154,7 @@ def _validate_choices(cls: Any, v: List, values: Dict) -> List:
raise ValueError(
"Invalid choices for CategoricalDistribution. Provide data or choices params"
)
return v
return sorted(set(v))

def get(self) -> List[Any]:
return [self.name, self.choices]
@@ -176,12 +184,6 @@ def min(self) -> Any:
def max(self) -> Any:
return max(self.choices)

def __eq__(self, other: Any) -> bool:
if not isinstance(other, CategoricalDistribution):
return False

return self.name == other.name and set(self.choices) == set(other.choices)

def dtype(self) -> str:
types = {
"object": 0,
@@ -259,20 +261,26 @@ def min(self) -> Any:
def max(self) -> Any:
return self.high

def __eq__(self, other: Any) -> bool:
if not isinstance(other, FloatDistribution):
return False

return (
self.name == other.name
and self.low == other.low
and self.high == other.high
)

def dtype(self) -> str:
return "float"


class LogDistribution(FloatDistribution):
low: float = np.finfo(np.float64).tiny
high: float = np.finfo(np.float64).max

def get(self) -> List[Any]:
return [self.name, self.low, self.high]

def sample(self, count: int = 1) -> Any:
np.random.seed(self.random_state)
msamples = self.sample_marginal(count)
if msamples is not None:
return msamples
lo, hi = np.log2(self.low), np.log2(self.high)
return 2.0 ** np.random.uniform(lo, hi, count)


class IntegerDistribution(Distribution):
"""
.. inheritance-diagram:: synthcity.plugins.core.distribution.IntegerDistribution
@@ -298,6 +306,12 @@ def _validate_high_thresh(cls: Any, v: int, values: Dict) -> int:
return int(values[mkey].index.max())
return v

@validator("step", always=True)
def _validate_step(cls: Any, v: int, values: Dict) -> int:
if v < 1:
raise ValueError("Step must be greater than 0")
return v

def get(self) -> List[Any]:
return [self.name, self.low, self.high, self.step]

@@ -307,8 +321,9 @@ def sample(self, count: int = 1) -> Any:
if msamples is not None:
return msamples

choices = [val for val in range(self.low, self.high + 1, self.step)]
return np.random.choice(choices, count).tolist()
steps = (self.high - self.low) // self.step
samples = np.random.choice(steps + 1, count)
return samples * self.step + self.low

def has(self, val: Any) -> bool:
return self.low <= val and val <= self.high
@@ -331,21 +346,31 @@ def min(self) -> Any:
def max(self) -> Any:
return self.high

def __eq__(self, other: Any) -> bool:
if not isinstance(other, IntegerDistribution):
return False

return (
self.name == other.name
and self.low == other.low
and self.high == other.high
)

def dtype(self) -> str:
return "int"


OFFSET = 120
class IntLogDistribution(IntegerDistribution):
low: int = 1
high: int = np.iinfo(np.int64).max

@validator("step", always=True)
def _validate_step(cls: Any, v: int, values: Dict) -> int:
if v != 1:
raise ValueError("Step must be 1 for IntLogDistribution")
return v

def get(self) -> List[Any]:
return [self.name, self.low, self.high]

def sample(self, count: int = 1) -> Any:
np.random.seed(self.random_state)
msamples = self.sample_marginal(count)
if msamples is not None:
return msamples
lo, hi = np.log2(self.low), np.log2(self.high)
samples = 2.0 ** np.random.uniform(lo, hi, count)
return samples.astype(int)


class DatetimeDistribution(Distribution):
@@ -356,49 +381,44 @@ class DatetimeDistribution(Distribution):

low: datetime = datetime.utcfromtimestamp(0)
high: datetime = datetime.now()
step: timedelta = timedelta(microseconds=1)
offset: timedelta = timedelta(seconds=120)

@validator("low", always=True)
def _validate_low_thresh(cls: Any, v: datetime, values: Dict) -> datetime:
mkey = "marginal_distribution"
if mkey in values and values[mkey] is not None:
v = values[mkey].index.min()

return v - timedelta(seconds=OFFSET)
return v

@validator("high", always=True)
def _validate_high_thresh(cls: Any, v: datetime, values: Dict) -> datetime:
mkey = "marginal_distribution"
if mkey in values and values[mkey] is not None:
v = values[mkey].index.max()

return v + timedelta(seconds=OFFSET)
return v

def get(self) -> List[Any]:
return [self.name, self.low, self.high]
return [self.name, self.low, self.high, self.step, self.offset]

def sample(self, count: int = 1) -> Any:
np.random.seed(self.random_state)
msamples = self.sample_marginal(count)
if msamples is not None:
return msamples

samples = np.random.uniform(
datetime.timestamp(self.low), datetime.timestamp(self.high), count
)

samples_dt = []
for s in samples:
samples_dt.append(datetime.fromtimestamp(s))

return samples_dt
n = (self.high - self.low) // self.step + 1
samples = np.round(np.random.rand(count) * n - 0.5)
return self.low + samples * self.step

def has(self, val: datetime) -> bool:
return self.low <= val and val <= self.high

def includes(self, other: "Distribution") -> bool:
return self.min() - timedelta(
seconds=OFFSET
) <= other.min() and other.max() <= self.max() + timedelta(seconds=OFFSET)
return (
self.min() - self.offset <= other.min()
and other.max() <= self.max() + self.offset
)

def as_constraint(self) -> Constraints:
return Constraints(
@@ -415,16 +435,6 @@ def min(self) -> Any:
def max(self) -> Any:
return self.high

def __eq__(self, other: Any) -> bool:
if not isinstance(other, DatetimeDistribution):
return False

return (
self.name == other.name
and self.low == other.low
and self.high == other.high
)

def dtype(self) -> str:
return "datetime"

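For context, a minimal usage sketch (not part of the diff) of how the new log-scaled distributions behave; it assumes only the constructor arguments and sample() method shown above.

    # Hypothetical usage sketch, not code from this PR.
    from synthcity.plugins.core.distribution import IntLogDistribution, LogDistribution

    lr_dist = LogDistribution(name="lr", low=1e-5, high=1e-1)
    batch_dist = IntLogDistribution(name="batch_size", low=256, high=4096)

    # Both classes draw uniformly in log2-space, so samples cover small and large
    # magnitudes evenly instead of clustering near the upper bound.
    print(lr_dist.sample(5))     # e.g. floats spread across 1e-5 .. 1e-1
    print(batch_dist.sample(5))  # e.g. integers spread across 256 .. 4096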
20 changes: 12 additions & 8 deletions src/synthcity/plugins/generic/plugin_ddpm.py
@@ -15,7 +15,12 @@

# synthcity absolute
from synthcity.plugins.core.dataloader import DataLoader
from synthcity.plugins.core.distribution import CategoricalDistribution, Distribution
from synthcity.plugins.core.distribution import (
Distribution,
IntegerDistribution,
IntLogDistribution,
LogDistribution,
)
from synthcity.plugins.core.models.tabular_ddpm import TabDDPM
from synthcity.plugins.core.plugin import Plugin
from synthcity.plugins.core.schema import Schema
@@ -174,13 +179,12 @@ def hyperparameter_space(**kwargs: Any) -> List[Distribution]:
Gaussian diffusion loss MSE
"""
return [
# TODO: change to loguniform distribution
CategoricalDistribution(name="lr", choices=[1e-5, 1e-4, 1e-3, 2e-3, 3e-3]),
CategoricalDistribution(name="batch_size", choices=[256, 4096]),
CategoricalDistribution(name="num_timesteps", choices=[100, 1000]),
CategoricalDistribution(name="n_iter", choices=[5000, 10000, 20000]),
CategoricalDistribution(name="n_layers_hidden", choices=[2, 4, 6, 8]),
CategoricalDistribution(name="dim_hidden", choices=[128, 256, 512, 1024]),
LogDistribution(name="lr", low=1e-5, high=1e-1),
IntLogDistribution(name="batch_size", low=256, high=4096),
IntegerDistribution(name="num_timesteps", low=10, high=1000),
IntLogDistribution(name="n_iter", low=1000, high=10000),
IntegerDistribution(name="n_layers_hidden", low=2, high=8),
IntLogDistribution(name="dim_hidden", low=128, high=1024),
]

def _fit(self, X: DataLoader, *args: Any, **kwargs: Any) -> "TabDDPMPlugin":
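To illustrate the effect of this change, here is a short hedged sketch (not from the PR) that draws one random configuration straight from the plugin's revised search space; it assumes hyperparameter_space() can be called on the class as a static method, as its signature above suggests.

    # Hypothetical sketch: one random-search draw over the DDPM search space, no Optuna needed.
    from synthcity.plugins.generic.plugin_ddpm import TabDDPMPlugin

    space = TabDDPMPlugin.hyperparameter_space()
    params = {dist.name: dist.sample(1)[0] for dist in space}
    print(params)  # e.g. {"lr": 0.00042, "batch_size": 1024, "num_timesteps": 350, ...}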
6 changes: 2 additions & 4 deletions src/synthcity/plugins/time_series/plugin_fflows.py
@@ -11,6 +11,7 @@
from fflows import FourierFlow

# synthcity absolute
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import DataLoader
from synthcity.plugins.core.distribution import (
CategoricalDistribution,
@@ -24,7 +25,6 @@
from synthcity.plugins.core.models.ts_model import TimeSeriesModel
from synthcity.plugins.core.plugin import Plugin
from synthcity.plugins.core.schema import Schema
from synthcity.plugins.generic import GenericPlugins
from synthcity.utils.constants import DEVICE


@@ -134,9 +134,7 @@ def __init__(
normalize=normalize,
).to(device)

self.static_model = GenericPlugins().get(
self.static_model_name, device=self.device
)
self.static_model = Plugins().get(self.static_model_name, device=self.device)

self.temporal_encoder = TimeSeriesTabularEncoder(
max_clusters=encoder_max_clusters
27 changes: 27 additions & 0 deletions src/synthcity/utils/optuna_sample.py
@@ -0,0 +1,27 @@
# stdlib
from typing import Any, Dict, List

# third party
import optuna

# synthcity absolute
import synthcity.plugins.core.distribution as D


def suggest(trial: optuna.Trial, dist: D.Distribution) -> Any:
if isinstance(dist, D.FloatDistribution):
return trial.suggest_float(dist.name, dist.low, dist.high)
elif isinstance(dist, D.LogDistribution):
return trial.suggest_float(dist.name, dist.low, dist.high, log=True)
elif isinstance(dist, D.IntegerDistribution):
return trial.suggest_int(dist.name, dist.low, dist.high, dist.step)
elif isinstance(dist, D.IntLogDistribution):
return trial.suggest_int(dist.name, dist.low, dist.high, log=True)
elif isinstance(dist, D.CategoricalDistribution):
return trial.suggest_categorical(dist.name, dist.choices)
else:
raise ValueError(f"Unknown dist: {dist}")


def suggest_all(trial: optuna.Trial, distributions: List[D.Distribution]) -> Dict:
return {dist.name: suggest(trial, dist) for dist in distributions}
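The helpers above map each synthcity Distribution onto the matching Optuna suggestion. A minimal end-to-end sketch of how they might drive a study (not part of this PR) follows; the plugin name "ddpm", the GenericDataLoader usage, and the toy scoring function are assumptions rather than code shown in the diff.

    # Hypothetical tuning sketch; the plugin name "ddpm" and the toy metric are assumptions.
    import numpy as np
    import optuna
    import pandas as pd

    from synthcity.plugins import Plugins
    from synthcity.plugins.core.dataloader import GenericDataLoader
    from synthcity.plugins.generic.plugin_ddpm import TabDDPMPlugin
    from synthcity.utils.optuna_sample import suggest_all

    # Toy dataset standing in for real training data.
    df = pd.DataFrame({"a": np.random.randn(200), "b": np.random.randint(0, 3, 200)})
    loader = GenericDataLoader(df)

    def objective(trial: optuna.Trial) -> float:
        params = suggest_all(trial, TabDDPMPlugin.hyperparameter_space())
        model = Plugins().get("ddpm", **params)  # assumes the plugin is registered as "ddpm"
        model.fit(loader)
        synth_df = model.generate(count=len(df)).dataframe()
        # Toy score: negative mean absolute gap between real and synthetic column means.
        return -float((synth_df.mean() - df.mean()).abs().mean())

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)
    print(study.best_params)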