Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup build and functions #14

Merged
merged 9 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,18 @@ import syndat
real = pd.read_csv("real.csv")
synthetic = pd.read_csv("synthetic.csv")

jsd = syndat.quality.jsd(real, synthetic)
auc = syndat.quality.auc(real, synthetic)
norm = syndat.quality.correlation(real, synthetic)
# How similar are the statistical distributions of real and synthetic features
distribution_similarity_score = syndat.scores.distribution(real, synthetic)

# How hard is it for a classifier to discriminate real and synthetic data
discrimination_score = syndat.scores.discrimination(real, synthetic)

# How well are pairwise feature correlations preserved
correlation_score = syndat.scores.correlation(real, synthetic)
```

Scores are defined in a range of 0-100, with a higher score corresponding to better data fidelity.

## Visualization

Visualize real vs. synthetic data distributions and summary statistics for each feature:
Expand Down
27 changes: 27 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,31 @@
author_email='[email protected]',
description=DESCRIPTION,
long_description=DESCRIPTION,
long_description_content_type='text/markdown',
install_requires=[
'pandas~=2.1.4',
'numpy~=1.26.2',
'scipy~=1.11.4',
'scikit-learn~=1.5.1',
'matplotlib~=3.8.2',
'seaborn~=0.13.0',
'setuptools==70.0.0'
],
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Developers',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
],
include_package_data=True, # Ensure non-Python files are included
python_requires='>=3.9', # Specify minimum Python version
keywords='synthetic-data, data-quality, data-visualization',
project_urls={
'Documentation': 'https://github.com/SCAI-BIO/syndat#readme',
'Source': 'https://github.com/SCAI-BIO/syndat',
'Tracker': 'https://github.com/SCAI-BIO/syndat/issues',
},
)
4 changes: 2 additions & 2 deletions syndat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from syndat import domain
from syndat import quality
from syndat import visualization
from syndat import scores
from syndat import visualization
63 changes: 42 additions & 21 deletions syndat/quality.py → syndat/scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@

from syndat.domain import AggregationMethod

logger = logging.getLogger(__name__)


def auc(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
drop_na_threshold=0.9, score: bool = True) -> float:
"""
Computes the Differentiation Complexity Score / ROC AUC score of a classifier trained to differentiate between real
Computes the Discrimination Complexity Score / ROC AUC score of a classifier trained to differentiate between real
and synthetic data.

:param real: The real data.
Expand All @@ -27,18 +29,43 @@ def auc(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
:param score: Return result in a normalized score in [0,100]. Default is True.
:return: Differentiation Complexity Score / AUC ROC Score
"""

warnings.warn(
"old_function is deprecated and will be removed in a future version. Please use discrimination_score instead.",
"auc is deprecated and will be removed in a future version. Please use discrimination instead.",
DeprecationWarning,
stacklevel=2
)
return discrimination_score(real, synthetic, n_folds=n_folds, drop_na_threshold=drop_na_threshold, score=score)
return discrimination(real, synthetic, n_folds=n_folds, drop_na_threshold=drop_na_threshold, score=score)


def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
drop_na_threshold=0.9, score: bool = True) -> float:
def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True,
aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True,
n_unique_threshold=10) -> Union[List[float], float]:
"""
Computes the Differentiation Complexity Score / ROC AUC score of a classifier trained to differentiate between real
Computes the feature distribution similarity using the Jensen-Shannon distance of real and synthetic data.

:param real: The real data.
:param synthetic: The synthetic data.
:param aggregate_results: Compute a single aggregated score for all features. Default is True.
:param aggregation_method: How the scores are aggregated. Default is using the median of all feature scores.
:param score: Return result in a normalized score in [0,100]. Default is True.
:param n_unique_threshold: Threshold to determine at which number of unique values bins will span over several
values.
:return: Distribution Similarity / JSD
"""

warnings.warn(
"auc is deprecated and will be removed in a future version. Please use discrimination instead.",
DeprecationWarning,
stacklevel=2
)
return distribution(real, synthetic, aggregate_results, aggregation_method, score, n_unique_threshold)


def discrimination(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
drop_na_threshold=0.9, score: bool = True) -> float:
"""
Computes the Discrimination Complexity Score / ROC AUC score of a classifier trained to differentiate between real
and synthetic data.

:param real: The real data.
Expand All @@ -52,10 +79,10 @@ def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_
real_filtered, synthetic_filtered = __filter_rows_with_common_categories(real, synthetic)
# check for missing values in real data
real_clean = real_filtered.dropna(thresh=int(drop_na_threshold * len(real_filtered)), axis=1)
logging.info(f'Dropped {real_clean.shape[1] - real_clean.shape[1]} '
logger.info(f'Dropped {real_clean.shape[1] - real_clean.shape[1]} '
f'due to high missingness (threshold is {drop_na_threshold}).')
real_clean = real_clean.dropna()
logging.info(f'Removed {len(real) - len(real_clean)} entries due to missing values.')
logger.info(f'Removed {len(real) - len(real_clean)} entries due to missing values.')
# assert that both real and synthetic have same columns
synthetic_clean = synthetic_filtered[real_clean.columns]
# one-hot-encode categorical columns
Expand All @@ -74,9 +101,9 @@ def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_
return auc_score


def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True,
aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True,
n_unique_threshold=10) -> Union[List[float], float]:
def distribution(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True,
aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True,
n_unique_threshold=10) -> Union[List[float], float]:
"""
Computes the feature distribution similarity using the Jensen-Shannon distance of real and synthetic data.

Expand All @@ -100,7 +127,7 @@ def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = T
col_dtype_real = real[col].dtype
col_dtype_synthetic = synthetic[col].dtype
if col_dtype_real != col_dtype_synthetic:
logging.warning(f'Real data at col {col} is dtype {col_dtype_real} but synthetic is {col_dtype_synthetic}. '
logger.warning(f'Real data at col {col} is dtype {col_dtype_real} but synthetic is {col_dtype_synthetic}. '
f'Evaluation will be done based on the assumed data type of the real data.')
synthetic[col] = synthetic[col].astype(col_dtype_real)
# categorical column
Expand Down Expand Up @@ -216,21 +243,15 @@ def __filter_rows_with_common_categories(real: pd.DataFrame, synthetic: pd.DataF
synthetic_categorical_cols = synthetic.select_dtypes(include=['object', 'category']).columns
# Identify common categorical columns
common_categorical_cols = set(real_categorical_cols) & set(synthetic_categorical_cols)
if not common_categorical_cols:
logging.warning("No common categorical columns found. Correlation will be computed on numeric data only.")
# Filter rows with common categories in each column
for col in common_categorical_cols:
real_categories = set(real[col].unique())
synthetic_categories = set(synthetic[col].unique())
common_categories = real_categories & synthetic_categories
if len(real_categories - common_categories) > 0:
logging.warning(
f"Categories {real_categories - common_categories} in column '{col}' "
f"are in real data but not in synthetic data and will be excluded.")
if len(synthetic_categories - common_categories) > 0:
logging.warning(
f"Categories {synthetic_categories - common_categories} in column '{col}' "
f"are in synthetic data but not in real data and will be excluded.")
logger.warning(
f"Categories {real_categories - common_categories} in column '{col}' are in real data but not in "
f"synthetic data. They will not be considered in the score computation.")
# Filter rows to keep only common categories
real = real[real[col].isin(common_categories)]
synthetic = synthetic[synthetic[col].isin(common_categories)]
Expand Down
12 changes: 6 additions & 6 deletions tests/test_auc.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,41 +43,41 @@ def preprocess_categorical_data(self, real_data, synthetic_data):
return real_data_encoded, synthetic_data_encoded

def test_auc_score(self):
auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
self.assertTrue(isinstance(auc_score, float))
self.assertGreaterEqual(auc_score, 0.0)
self.assertLessEqual(auc_score, 100.0)

def test_auc_score_normalized(self):
auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
self.assertTrue(isinstance(auc_score, float))
self.assertGreaterEqual(auc_score, 0.0)
self.assertLessEqual(auc_score, 100.0)

def test_auc_score_with_missing_values(self):
# Introduce missing values in real data
self.real_data.iloc[::10, 0] = np.nan # 10% missing data
auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
self.assertTrue(isinstance(auc_score, float))
self.assertGreaterEqual(auc_score, 0.0)
self.assertLessEqual(auc_score, 100.0)

def test_auc_score_with_missing_values_drop_col(self):
# Introduce missing values in real data
self.real_data.iloc[::2, 0] = np.nan # 50% missing data -> col drop
auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
self.assertTrue(isinstance(auc_score, float))
self.assertGreaterEqual(auc_score, 0.0)
self.assertLessEqual(auc_score, 100.0)

def test_auc_score_with_custom_folds(self):
auc_score = syndat.quality.auc(self.real_data, self.synthetic_data, n_folds=5)
auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data, n_folds=5)
self.assertTrue(isinstance(auc_score, float))
self.assertGreaterEqual(auc_score, 0.0)
self.assertLessEqual(auc_score, 100.0)

def test_auc_score_with_categorical_data(self):
auc_score = syndat.quality.auc(self.real_data_cat, self.synthetic_data_cat)
auc_score = syndat.scores.discrimination(self.real_data_cat, self.synthetic_data_cat)
self.assertTrue(isinstance(auc_score, float))
self.assertGreaterEqual(auc_score, 0.0)
self.assertLessEqual(auc_score, 100.0)
2 changes: 1 addition & 1 deletion tests/test_correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from syndat.quality import correlation
from syndat.scores import correlation


class TestCorrelation(unittest.TestCase):
Expand Down
48 changes: 24 additions & 24 deletions tests/test_jsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def test_jsd_zero_int64(self):
'feature1': [6, 7, 8, 9, 10],
'feature2': [15, 16, 17, 18, 19]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertEqual(0, jsd)
distribution = syndat.scores.distribution(real, synthetic)
self.assertEqual(0, distribution)

def test_jsd_zero_int64_float64(self):
synthetic = pd.DataFrame({
Expand All @@ -30,8 +30,8 @@ def test_jsd_zero_int64_float64(self):
'feature1': [6, 7, 8, 9, 10],
'feature2': [0.6, 0.7, 0.8, 0.9, 1.0]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertEqual(jsd, 0)
distribution = syndat.scores.distribution(real, synthetic)
self.assertEqual(distribution, 0)

def test_jsd_perfect_int64(self):
synthetic = pd.DataFrame({
Expand All @@ -43,8 +43,8 @@ def test_jsd_perfect_int64(self):
'feature1': [1, 2, 1, 2, 3],
'feature2': [11, 12, 13, 14, 15]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertEqual(jsd, 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertEqual(distribution, 100)

def test_jsd_perfect_int64_and_float64(self):
synthetic = pd.DataFrame({
Expand All @@ -56,8 +56,8 @@ def test_jsd_perfect_int64_and_float64(self):
'feature1': [1, 2, 1, 2, 3],
'feature2': [0.1, 0.2, 0.3, 0.4, 0.5]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertEqual(jsd, 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertEqual(distribution, 100)

def test_jsd_different_col_types(self):
synthetic = pd.DataFrame({
Expand All @@ -69,7 +69,7 @@ def test_jsd_different_col_types(self):
'feature1': [1.2, 2.1, 1.1, 2.1, 3.1],
'feature2': [1, 2, 3, 4, 5]
})
jsd = syndat.quality.jsd(real, synthetic, score=False)
distribution = syndat.scores.distribution(real, synthetic, score=False)

def test_jsd_negative_int64(self):
synthetic = pd.DataFrame({
Expand All @@ -81,7 +81,7 @@ def test_jsd_negative_int64(self):
'feature1': [-1, 2, 3, 4, 5],
'feature2': [1, 2, 3, 4, 5]
})
jsd = syndat.quality.jsd(real, synthetic)
distribution = syndat.scores.distribution(real, synthetic)

def test_jsd_single_outlier(self):
synthetic = pd.DataFrame({
Expand All @@ -93,8 +93,8 @@ def test_jsd_single_outlier(self):
'feature1': [1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9],
'feature2': [1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 100],
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertTrue(jsd < 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertTrue(distribution < 100)

def test_jsd_categorical_equal(self):
synthetic = pd.DataFrame({
Expand All @@ -106,8 +106,8 @@ def test_jsd_categorical_equal(self):
'feature1': ['A', 'B', 'A', 'B', 'C'],
'feature2': ['X', 'Y', 'Y', 'X', 'Z']
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertEqual(jsd, 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertEqual(distribution, 100)

def test_jsd_categorical_different(self):
synthetic = pd.DataFrame({
Expand All @@ -119,8 +119,8 @@ def test_jsd_categorical_different(self):
'feature1': ['A', 'B', 'A', 'B', 'D'],
'feature2': ['X', 'Y', 'Z', 'X', 'W']
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertTrue(jsd < 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertTrue(distribution < 100)

def test_jsd_categorical_mixed(self):
synthetic = pd.DataFrame({
Expand All @@ -132,8 +132,8 @@ def test_jsd_categorical_mixed(self):
'feature1': ['A', 'B', 'C', 'F', 'G'],
'feature2': [1.0, 2.0, 3.0, 6.0, 7.0]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertTrue(jsd < 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertTrue(distribution < 100)

def test_jsd_categorical_with_numerical(self):
synthetic = pd.DataFrame({
Expand All @@ -145,8 +145,8 @@ def test_jsd_categorical_with_numerical(self):
'feature1': ['A', 'B', 'C', 'A', 'D'],
'feature2': [1.0, 2.0, 3.0, 4.0, 6.0]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertTrue(jsd < 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertTrue(distribution < 100)

def test_jsd_categorical_with_nan(self):
synthetic = pd.DataFrame({
Expand All @@ -158,8 +158,8 @@ def test_jsd_categorical_with_nan(self):
'feature1': ['A', 'B', 'C', 'D', None],
'feature2': [1.0, 2.0, None, 4.0, 5.0]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertTrue(jsd < 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertTrue(distribution < 100)

def test_jsd_categorical_all_nan(self):
synthetic = pd.DataFrame({
Expand All @@ -171,5 +171,5 @@ def test_jsd_categorical_all_nan(self):
'feature1': [None, None, None, None, None],
'feature2': [None, None, None, None, None]
})
jsd = syndat.quality.jsd(real, synthetic)
self.assertEqual(jsd, 100)
distribution = syndat.scores.distribution(real, synthetic)
self.assertEqual(distribution, 100)
Loading