Commit aa78a90: resolve conflicts
Hanna Imshenetska authored and Hanna Imshenetska committed Nov 5, 2024
2 parents 29a984b + 71175b8 commit aa78a90
Showing 13 changed files with 93 additions and 49 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/action-build-deploy.yml
@@ -41,9 +41,9 @@ jobs:
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
@@ -2,7 +2,7 @@ name: Databricks-compatibility
on: [push]

jobs:
Databricks-compatibility-test:
Databricks-compatibility-library:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
@@ -18,6 +18,9 @@ jobs:
- name: Install dependencies
run: |
cp -n databricks/requirements-databricks-15.4-LTS.txt .
cat databricks/setup.cfg > setup.cfg
cat databricks/pyproject.toml > pyproject.toml
sed -i 's/name="syngen"/name="syngen-databricks"/' src/setup.py
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements-databricks-15.4-LTS.txt ]; then pip install -r requirements-databricks-15.4-LTS.txt; fi
@@ -39,28 +42,14 @@ jobs:
cp -n databricks/databricks.dockerfile .
docker build -t databricks-test-image -f databricks.dockerfile .
build-and-publish:
needs: Databricks-compatibility-test
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install build dependencies
- name: Build Package
run: |
cp -n databricks/setup.cfg .
cp -n databricks/pyproject.toml .
sed -i 's/$/rc1+dbx/' src/syngen/VERSION
cat src/syngen/VERSION
python -m pip install --upgrade pip
pip install build
- name: Build Package
run: python -m build .
python -m build .
- name: Publish package
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_TEST_TOKEN }} # For release: use secrets.PYPI_TOKEN
user: __token__
password: ${{ secrets.PYPI_TOKEN }} # For test release: use secrets.PYPI_TEST_TOKEN and add 'repository_url: https://test.pypi.org/legacy/'
verbose: true
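The Databricks build path above rewrites the package metadata in place before `python -m build`: the `sed` call in the compatibility job renames the distribution to `syngen-databricks`, and the one in the build job appends a local `rc1+dbx` suffix to `src/syngen/VERSION`, so the Databricks wheel carries a distinct name and version. A minimal Python sketch of what those two substitutions do (paths and strings are taken from the workflow; the helper function itself is illustrative only):

```python
from pathlib import Path

def prepare_databricks_metadata(repo_root: str = ".") -> None:
    """Illustrative equivalent of the two sed commands in this workflow."""
    setup_py = Path(repo_root, "src", "setup.py")
    version_file = Path(repo_root, "src", "syngen", "VERSION")

    # sed -i 's/name="syngen"/name="syngen-databricks"/' src/setup.py
    setup_py.write_text(
        setup_py.read_text().replace('name="syngen"', 'name="syngen-databricks"')
    )

    # sed -i 's/$/rc1+dbx/' src/syngen/VERSION  (append the suffix to the version line)
    version_file.write_text(version_file.read_text().rstrip("\n") + "rc1+dbx\n")
```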
File renamed without changes.
5 changes: 2 additions & 3 deletions databricks/setup.cfg
@@ -1,20 +1,19 @@
[metadata]
name = syngen
name = syngen-databricks
version = file: src/syngen/VERSION
description = file: DESCRIPTION
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/tdspora/syngen
author = EPAM Systems, Inc.
maintainer = Pavel Bobyrev
maintainer = Hanna Imshenetska
license = GPLv3 License
keywords = data, generation, synthetic, vae, tabular
classifiers =
Development Status :: 5 - Production/Stable
Operating System :: POSIX :: Linux
Operating System :: Microsoft :: Windows
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11


4 changes: 2 additions & 2 deletions setup.cfg
@@ -6,7 +6,7 @@ long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/tdspora/syngen
author = EPAM Systems, Inc.
maintainer = Pavel Bobyrev
maintainer = Hanna Imshenetska
license = GPLv3 License
keywords = data, generation, synthetic, vae, tabular
classifiers =
@@ -20,7 +20,7 @@ classifiers =

[options]
package_dir =
= src
= src
packages = find:
include_package_data = True
python_requires = >3.9, <3.12
2 changes: 1 addition & 1 deletion src/syngen/VERSION
@@ -1 +1 @@
0.9.51rc0
0.9.51rc1
9 changes: 6 additions & 3 deletions src/syngen/ml/config/validation.py
@@ -257,12 +257,12 @@ def _check_existence_of_referenced_columns(self, table_name: str):
]
if non_existed_columns:
message = (
f"The 'referenced.columns' of the {config_of_key['type']} '{key}' - "
f"The 'references.columns' of the {config_of_key['type']} '{key}' - "
f"{', '.join(non_existed_columns)} "
f"don't exist in the referenced table - '{referenced_table}'"
)
self.errors[
"check existence of the key columns in 'referenced.columns'"
"check existence of the key columns in 'references.columns'"
][key] = message

def _fetch_existed_columns(self, table_name: str) -> List[str]:
@@ -293,9 +293,12 @@ def _run(self):
self.merged_metadata.pop("global", None)
self.metadata.pop("global", None)

if self.type_of_process == "train" and self.validation_source:
for table_name in self.merged_metadata.keys():
self._gather_existed_columns(table_name)

for table_name in self.merged_metadata.keys():
if self.type_of_process == "train" and self.validation_source:
self._gather_existed_columns(table_name)
self._check_existence_of_source(table_name)
self._check_existence_of_key_columns(table_name)
self._check_existence_of_referenced_columns(table_name)
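The restructured `_run` above splits column gathering into its own first pass, so by the time the per-table checks run, the existing columns of every table are already known. That matters when a table holding a foreign key appears in the metadata before the table it references, which is exactly the ordering exercised by the reordered test metadata further below. A rough sketch of the two-pass flow (method names from the diff; the surrounding class is elided):

```python
def _run(self):
    # Pass 1 (new): gather existing columns for every table before any check runs
    if self.type_of_process == "train" and self.validation_source:
        for table_name in self.merged_metadata.keys():
            self._gather_existed_columns(table_name)

    # Pass 2: per-table checks; referenced columns of any table are already collected
    for table_name in self.merged_metadata.keys():
        if self.type_of_process == "train" and self.validation_source:
            self._check_existence_of_source(table_name)
        self._check_existence_of_key_columns(table_name)
        self._check_existence_of_referenced_columns(table_name)
```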
17 changes: 17 additions & 0 deletions src/syngen/ml/handlers/handlers.py
@@ -19,6 +19,7 @@

from syngen.ml.vae import * # noqa: F403
from syngen.ml.data_loaders import DataLoader, DataFrameFetcher
from syngen.ml.reporters import Report
from syngen.ml.vae.models.dataset import Dataset
from syngen.ml.utils import (
fetch_config,
@@ -492,6 +493,22 @@ def handle(self, **kwargs):
else pd.DataFrame()
)
prepared_data = self._restore_empty_columns(prepared_data)
# workaround for the case when all columns are dropped
# with technical column
tech_columns = list(self.dataset.tech_columns)
if tech_columns:
prepared_data = prepared_data.drop(tech_columns, axis=1)
logger.debug(
"Technical columns "
f"{tech_columns} were removed "
"from the generated table."
)
Report().unregister_reporters(self.table_name)
logger.info(
"Since there were no columns suitable for training, "
"reports will not be generated "
f"for the table '{self.table_name}'."
)

is_pk = self._is_pk()

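Condensed, the new post-generation branch above does two things once the empty-column workaround was used: it strips the helper column(s) from the generated frame and drops the table's reporters so no report is attempted for data that was never really trained on. A sketch without the logging, assuming `prepared_data` is the generated `pandas.DataFrame`:

```python
tech_columns = list(self.dataset.tech_columns)
if tech_columns:
    # remove the synthetic helper column(s) so they never reach the output table
    prepared_data = prepared_data.drop(tech_columns, axis=1)
    # reports would be meaningless for a table with no trainable columns
    Report().unregister_reporters(self.table_name)
```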
18 changes: 16 additions & 2 deletions src/syngen/ml/reporters/reporters.py
@@ -1,5 +1,12 @@
from abc import abstractmethod
from typing import Dict, Tuple, Optional, Callable
from typing import (
Dict,
Tuple,
Optional,
Callable,
Union,
List
)
import itertools
from collections import defaultdict

@@ -189,7 +196,7 @@ class Report:
Singleton metaclass for registration all needed reporters
"""

_reporters: Dict[str, Reporter] = {}
_reporters: Dict[str, Union[Reporter, List]] = {}

def __new__(cls):
if not hasattr(cls, "instance"):
@@ -205,6 +212,13 @@ def register_reporter(cls, table: str, reporter: Reporter):
list_of_reporters.append(reporter)
cls._reporters[table] = list_of_reporters

@classmethod
def unregister_reporters(cls, table: str):
"""
Unregister all reporters for a table
"""
cls._reporters[table] = list()

@classmethod
def clear_report(cls):
"""
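`Report` is a process-wide singleton, so the registry touched by `unregister_reporters` in handlers.py is the same one the report generation step reads later. A small sketch of that behavior, assuming `__new__` returns the cached `instance` as the singleton docstring implies; the table name is made up for the example:

```python
a = Report()
b = Report()
assert a is b                                  # one shared registry per process

# New in this commit: reset the reporter list for a table whose reports are skipped
Report().unregister_reporters("table_with_no_trainable_columns")
assert Report()._reporters["table_with_no_trainable_columns"] == []
```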
8 changes: 5 additions & 3 deletions src/syngen/ml/utils/utils.py
@@ -366,9 +366,11 @@ def file_sink(message):
"""
Save logs to the log file
"""
with open(os.getenv("SUCCESS_LOG_FILE"), "a") as log_file:
log_message = fetch_log_message(message)
log_file.write(log_message + "\n")
path_to_logs = os.getenv("SUCCESS_LOG_FILE")
if path_to_logs is not None:
with open(path_to_logs, "a") as log_file:
log_message = fetch_log_message(message)
log_file.write(log_message + "\n")


def console_sink(record):
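The added guard turns the sink into a no-op when `SUCCESS_LOG_FILE` is not configured; previously `open(None, "a")` would raise a `TypeError`. The pattern in isolation (illustrative, outside the actual sink):

```python
import os

path_to_logs = os.getenv("SUCCESS_LOG_FILE")   # None when the variable is unset
if path_to_logs is not None:
    # only write the success log when a destination is actually configured
    with open(path_to_logs, "a") as log_file:
        log_file.write("log line\n")
```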
19 changes: 19 additions & 0 deletions src/syngen/ml/vae/models/dataset.py
@@ -60,6 +60,7 @@ def __init__(
self.uuid_columns: Set = set()
self.uuid_columns_types: Dict = dict()
self.dropped_columns: Set = set()
self.tech_columns: Set = set()
self.order_of_columns: List = list()
self.custom_categorical_columns: Set = set()
self.categorical_columns: Set = set()
@@ -71,6 +72,7 @@ def __init__(
self.binary_columns: Set = set()
self.email_columns: Set = set()
self.long_text_columns: Set = set()
self.tech_columns: Set = set()
self.primary_keys_mapping: Dict = dict()
self.primary_keys_list: List = list()
self.primary_key_name: Optional[str] = None
@@ -1326,6 +1328,23 @@ def pipeline(self) -> pd.DataFrame:
elif column in self.uuid_columns:
logger.info(f"Column '{column}' defined as UUID column")
self._assign_uuid_null_feature(column)

# workaround for the case when all columns are dropped
# add a technical column to proceed with the training process
if not self.features:
tech_column = "syngen_tech_column"
logger.info(
f"Since all columns in the table '{self.table_name}' "
"are uuid/key/long text columns, "
"there are no suitable columns to train on. "
f"A technical column '{tech_column}' will be added "
"to proceed with the training process "
"and will be removed afterwards."
)
self.df[tech_column] = 1
self._assign_float_feature(tech_column)
self.tech_columns.add(tech_column)

self.fit()

# The end of the run related to the preprocessing stage
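Together with the handlers.py change above, the workaround is a round trip: when preprocessing leaves no trainable features (every column is a key, UUID, or long-text column), a constant technical column keeps the training pipeline alive, and it is removed from the generated table afterwards. A standalone sketch of the idea in plain pandas (the column name is taken from the diff; the helper functions are illustrative):

```python
import pandas as pd

TECH_COLUMN = "syngen_tech_column"

def add_tech_column_if_needed(df: pd.DataFrame, features: dict) -> set:
    """If nothing survived preprocessing, inject a constant column to train on."""
    tech_columns = set()
    if not features:
        df[TECH_COLUMN] = 1            # constant value; the real code registers it as a float feature
        tech_columns.add(TECH_COLUMN)
    return tech_columns

def drop_tech_columns(generated: pd.DataFrame, tech_columns: set) -> pd.DataFrame:
    """Strip the helper column(s) from the generated table before it is saved."""
    if not tech_columns:
        return generated
    return generated.drop(list(tech_columns), axis=1)
```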
1 change: 1 addition & 0 deletions src/tests/unit/dataset/test_dataset.py
@@ -150,6 +150,7 @@ def test_save_dataset(rp_logger):
"float_columns",
"int_columns",
"date_columns",
"tech_columns",
"date_mapping",
"binary_columns",
"email_columns",
28 changes: 14 additions & 14 deletions src/tests/unit/validation_metadata/test_validation_metadata.py
@@ -237,17 +237,6 @@ def test_validate_metadata_of_related_tables_with_fk_key_in_train_process(
"contained only the primary key and the foreign key used in the training process"
)
test_metadata = {
"table_a": {
"train_settings": {
"source": "path/to/table_a.csv"
},
"keys": {
"pk_id": {
"type": "PK",
"columns": ["id"]
}
}
},
"table_b": {
"train_settings": {
"source": "path/to/table_b.csv"
Expand All @@ -266,7 +255,18 @@ def test_validate_metadata_of_related_tables_with_fk_key_in_train_process(
}
}
}
}
},
"table_a": {
"train_settings": {
"source": "path/to/table_a.csv"
},
"keys": {
"pk_id": {
"type": "PK",
"columns": ["id"]
}
}
},
}
validator = Validator(
metadata=test_metadata,
@@ -2021,8 +2021,8 @@ def test_check_not_existent_referenced_columns_in_fk(rp_logger):
assert validator.merged_metadata == test_metadata
assert str(error.value) == (
"The validation of the metadata has been failed. The error(s) found in - \n"
"\"check existence of the key columns in 'referenced.columns'\": {\n \"fk_id\": "
"\"The 'referenced.columns' of the FK 'fk_id' - 'non-existent column' don't exist "
"\"check existence of the key columns in 'references.columns'\": {\n \"fk_id\": "
"\"The 'references.columns' of the FK 'fk_id' - 'non-existent column' don't exist "
"in the referenced table - 'table_b'\"}"
)
rp_logger.info(SUCCESSFUL_MESSAGE)
