Commit aa78a90: resolve conflicts
Hanna Imshenetska authored and Hanna Imshenetska committed Nov 5, 2024
2 parents 29a984b + 71175b8 commit aa78a90
Showing 13 changed files with 93 additions and 49 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/action-build-deploy.yml
@@ -41,9 +41,9 @@ jobs:
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
@@ -2,7 +2,7 @@ name: Databricks-compatibility
on: [push]

jobs:
Databricks-compatibility-test:
Databricks-compatibility-library:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
@@ -18,6 +18,9 @@ jobs:
- name: Install dependencies
run: |
cp -n databricks/requirements-databricks-15.4-LTS.txt .
cat databricks/setup.cfg > setup.cfg
cat databricks/pyproject.toml > pyproject.toml
sed -i 's/name="syngen"/name="syngen-databricks"/' src/setup.py
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements-databricks-15.4-LTS.txt ]; then pip install -r requirements-databricks-15.4-LTS.txt; fi
@@ -39,28 +42,14 @@ jobs:
cp -n databricks/databricks.dockerfile .
docker build -t databricks-test-image -f databricks.dockerfile .
build-and-publish:
needs: Databricks-compatibility-test
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install build dependencies
- name: Build Package
run: |
cp -n databricks/setup.cfg .
cp -n databricks/pyproject.toml .
sed -i 's/$/rc1+dbx/' src/syngen/VERSION
cat src/syngen/VERSION
python -m pip install --upgrade pip
pip install build
- name: Build Package
run: python -m build .
python -m build .
- name: Publish package
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_TEST_TOKEN }} # For release: use secrets.PYPI_TOKEN
user: __token__
password: ${{ secrets.PYPI_TOKEN }} # For test release: use secrets.PYPI_TEST_TOKEN and add 'repository_url: https://test.pypi.org/legacy/'
verbose: true
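The Databricks build path above rewrites the package metadata in place before `python -m build`: the `sed` call in the compatibility job renames the distribution to `syngen-databricks`, and the one in the build job appends a local `rc1+dbx` suffix to `src/syngen/VERSION`, so the Databricks wheel carries a distinct name and version. A minimal Python sketch of what those two substitutions do (paths and strings are taken from the workflow; the helper function itself is illustrative only):

```python
from pathlib import Path

def prepare_databricks_metadata(repo_root: str = ".") -> None:
    """Illustrative equivalent of the two sed commands in this workflow."""
    setup_py = Path(repo_root, "src", "setup.py")
    version_file = Path(repo_root, "src", "syngen", "VERSION")

    # sed -i 's/name="syngen"/name="syngen-databricks"/' src/setup.py
    setup_py.write_text(
        setup_py.read_text().replace('name="syngen"', 'name="syngen-databricks"')
    )

    # sed -i 's/$/rc1+dbx/' src/syngen/VERSION  (append the suffix to the version line)
    version_file.write_text(version_file.read_text().rstrip("\n") + "rc1+dbx\n")
```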
File renamed without changes.
5 changes: 2 additions & 3 deletions databricks/setup.cfg
@@ -1,20 +1,19 @@
[metadata]
name = syngen
name = syngen-databricks
version = file: src/syngen/VERSION
description = file: DESCRIPTION
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/tdspora/syngen
author = EPAM Systems, Inc.
maintainer = Pavel Bobyrev
maintainer = Hanna Imshenetska
license = GPLv3 License
keywords = data, generation, synthetic, vae, tabular
classifiers =
Development Status :: 5 - Production/Stable
Operating System :: POSIX :: Linux
Operating System :: Microsoft :: Windows
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11


4 changes: 2 additions & 2 deletions setup.cfg
@@ -6,7 +6,7 @@ long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/tdspora/syngen
author = EPAM Systems, Inc.
maintainer = Pavel Bobyrev
maintainer = Hanna Imshenetska
license = GPLv3 License
keywords = data, generation, synthetic, vae, tabular
classifiers =
@@ -20,7 +20,7 @@ classifiers =

[options]
package_dir =
= src
= src
packages = find:
include_package_data = True
python_requires = >3.9, <3.12
2 changes: 1 addition & 1 deletion src/syngen/VERSION
@@ -1 +1 @@
0.9.51rc0
0.9.51rc1
9 changes: 6 additions & 3 deletions src/syngen/ml/config/validation.py
@@ -257,12 +257,12 @@ def _check_existence_of_referenced_columns(self, table_name: str):
]
if non_existed_columns:
message = (
f"The 'referenced.columns' of the {config_of_key['type']} '{key}' - "
f"The 'references.columns' of the {config_of_key['type']} '{key}' - "
f"{', '.join(non_existed_columns)} "
f"don't exist in the referenced table - '{referenced_table}'"
)
self.errors[
"check existence of the key columns in 'referenced.columns'"
"check existence of the key columns in 'references.columns'"
][key] = message

def _fetch_existed_columns(self, table_name: str) -> List[str]:
@@ -293,9 +293,12 @@ def _run(self):
self.merged_metadata.pop("global", None)
self.metadata.pop("global", None)

if self.type_of_process == "train" and self.validation_source:
for table_name in self.merged_metadata.keys():
self._gather_existed_columns(table_name)

for table_name in self.merged_metadata.keys():
if self.type_of_process == "train" and self.validation_source:
self._gather_existed_columns(table_name)
self._check_existence_of_source(table_name)
self._check_existence_of_key_columns(table_name)
self._check_existence_of_referenced_columns(table_name)
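The restructured `_run` above splits column gathering into its own first pass, so by the time the per-table checks run, the existing columns of every table are already known. That matters when a table holding a foreign key appears in the metadata before the table it references, which is exactly the ordering exercised by the reordered test metadata further below. A rough sketch of the two-pass flow (method names from the diff; the surrounding class is elided):

```python
def _run(self):
    # Pass 1 (new): gather existing columns for every table before any check runs
    if self.type_of_process == "train" and self.validation_source:
        for table_name in self.merged_metadata.keys():
            self._gather_existed_columns(table_name)

    # Pass 2: per-table checks; referenced columns of any table are already collected
    for table_name in self.merged_metadata.keys():
        if self.type_of_process == "train" and self.validation_source:
            self._check_existence_of_source(table_name)
        self._check_existence_of_key_columns(table_name)
        self._check_existence_of_referenced_columns(table_name)
```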
17 changes: 17 additions & 0 deletions src/syngen/ml/handlers/handlers.py
@@ -19,6 +19,7 @@

from syngen.ml.vae import * # noqa: F403
from syngen.ml.data_loaders import DataLoader, DataFrameFetcher
from syngen.ml.reporters import Report
from syngen.ml.vae.models.dataset import Dataset
from syngen.ml.utils import (
fetch_config,
@@ -492,6 +493,22 @@ def handle(self, **kwargs):
else pd.DataFrame()
)
prepared_data = self._restore_empty_columns(prepared_data)
# workaround for the case when all columns are dropped
# with technical column
tech_columns = list(self.dataset.tech_columns)
if tech_columns:
prepared_data = prepared_data.drop(tech_columns, axis=1)
logger.debug(
"Technical columns "
f"{tech_columns} were removed "
"from the generated table."
)
Report().unregister_reporters(self.table_name)
logger.info(
"Since there were no columns suitable for training, "
"reports will not be generated "
f"for the table '{self.table_name}'."
)

is_pk = self._is_pk()

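Condensed, the new post-generation branch above does two things once the empty-column workaround was used: it strips the helper column(s) from the generated frame and drops the table's reporters so no report is attempted for data that was never really trained on. A sketch without the logging, assuming `prepared_data` is the generated `pandas.DataFrame`:

```python
tech_columns = list(self.dataset.tech_columns)
if tech_columns:
    # remove the synthetic helper column(s) so they never reach the output table
    prepared_data = prepared_data.drop(tech_columns, axis=1)
    # reports would be meaningless for a table with no trainable columns
    Report().unregister_reporters(self.table_name)
```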
18 changes: 16 additions & 2 deletions src/syngen/ml/reporters/reporters.py
@@ -1,5 +1,12 @@
from abc import abstractmethod
from typing import Dict, Tuple, Optional, Callable
from typing import (
Dict,
Tuple,
Optional,
Callable,
Union,
List
)
import itertools
from collections import defaultdict

@@ -189,7 +196,7 @@ class Report:
Singleton metaclass for registration all needed reporters
"""

_reporters: Dict[str, Reporter] = {}
_reporters: Dict[str, Union[Reporter, List]] = {}

def __new__(cls):
if not hasattr(cls, "instance"):
@@ -205,6 +212,13 @@ def register_reporter(cls, table: str, reporter: Reporter):
list_of_reporters.append(reporter)
cls._reporters[table] = list_of_reporters

@classmethod
def unregister_reporters(cls, table: str):
"""
Unregister all reporters for a table
"""
cls._reporters[table] = list()

@classmethod
def clear_report(cls):
"""
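`Report` is a process-wide singleton, so the registry touched by `unregister_reporters` in handlers.py is the same one the report generation step reads later. A small sketch of that behavior, assuming `__new__` returns the cached `instance` as the singleton docstring implies; the table name is made up for the example:

```python
a = Report()
b = Report()
assert a is b                                  # one shared registry per process

# New in this commit: reset the reporter list for a table whose reports are skipped
Report().unregister_reporters("table_with_no_trainable_columns")
assert Report()._reporters["table_with_no_trainable_columns"] == []
```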
8 changes: 5 additions & 3 deletions src/syngen/ml/utils/utils.py
@@ -366,9 +366,11 @@ def file_sink(message):
"""
Save logs to the log file
"""
with open(os.getenv("SUCCESS_LOG_FILE"), "a") as log_file:
log_message = fetch_log_message(message)
log_file.write(log_message + "\n")
path_to_logs = os.getenv("SUCCESS_LOG_FILE")
if path_to_logs is not None:
with open(path_to_logs, "a") as log_file:
log_message = fetch_log_message(message)
log_file.write(log_message + "\n")


def console_sink(record):
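The added guard turns the sink into a no-op when `SUCCESS_LOG_FILE` is not configured; previously `open(None, "a")` would raise a `TypeError`. The pattern in isolation (illustrative, outside the actual sink):

```python
import os

path_to_logs = os.getenv("SUCCESS_LOG_FILE")   # None when the variable is unset
if path_to_logs is not None:
    # only write the success log when a destination is actually configured
    with open(path_to_logs, "a") as log_file:
        log_file.write("log line\n")
```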
19 changes: 19 additions & 0 deletions src/syngen/ml/vae/models/dataset.py
@@ -60,6 +60,7 @@ def __init__(
self.uuid_columns: Set = set()
self.uuid_columns_types: Dict = dict()
self.dropped_columns: Set = set()
self.tech_columns: Set = set()
self.order_of_columns: List = list()
self.custom_categorical_columns: Set = set()
self.categorical_columns: Set = set()
@@ -71,6 +72,7 @@ def __init__(
self.binary_columns: Set = set()
self.email_columns: Set = set()
self.long_text_columns: Set = set()
self.tech_columns: Set = set()
self.primary_keys_mapping: Dict = dict()
self.primary_keys_list: List = list()
self.primary_key_name: Optional[str] = None
@@ -1326,6 +1328,23 @@ def pipeline(self) -> pd.DataFrame:
elif column in self.uuid_columns:
logger.info(f"Column '{column}' defined as UUID column")
self._assign_uuid_null_feature(column)

# workaround for the case when all columns are dropped
# add a technical column to proceed with the training process
if not self.features:
tech_column = "syngen_tech_column"
logger.info(
f"Since all columns in the table '{self.table_name}' "
"are uuid/key/long text columns, "
"there are no suitable columns to train on. "
f"A technical column '{tech_column}' will be added "
"to proceed with the training process "
"and will be removed afterwards."
)
self.df[tech_column] = 1
self._assign_float_feature(tech_column)
self.tech_columns.add(tech_column)

self.fit()

# The end of the run related to the preprocessing stage
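Together with the handlers.py change above, the workaround is a round trip: when preprocessing leaves no trainable features (every column is a key, UUID, or long-text column), a constant technical column keeps the training pipeline alive, and it is removed from the generated table afterwards. A standalone sketch of the idea in plain pandas (the column name is taken from the diff; the helper functions are illustrative):

```python
import pandas as pd

TECH_COLUMN = "syngen_tech_column"

def add_tech_column_if_needed(df: pd.DataFrame, features: dict) -> set:
    """If nothing survived preprocessing, inject a constant column to train on."""
    tech_columns = set()
    if not features:
        df[TECH_COLUMN] = 1            # constant value; the real code registers it as a float feature
        tech_columns.add(TECH_COLUMN)
    return tech_columns

def drop_tech_columns(generated: pd.DataFrame, tech_columns: set) -> pd.DataFrame:
    """Strip the helper column(s) from the generated table before it is saved."""
    if not tech_columns:
        return generated
    return generated.drop(list(tech_columns), axis=1)
```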
1 change: 1 addition & 0 deletions src/tests/unit/dataset/test_dataset.py
@@ -150,6 +150,7 @@ def test_save_dataset(rp_logger):
"float_columns",
"int_columns",
"date_columns",
"tech_columns",
"date_mapping",
"binary_columns",
"email_columns",
28 changes: 14 additions & 14 deletions src/tests/unit/validation_metadata/test_validation_metadata.py
@@ -237,17 +237,6 @@ def test_validate_metadata_of_related_tables_with_fk_key_in_train_process(
"contained only the primary key and the foreign key used in the training process"
)
test_metadata = {
"table_a": {
"train_settings": {
"source": "path/to/table_a.csv"
},
"keys": {
"pk_id": {
"type": "PK",
"columns": ["id"]
}
}
},
"table_b": {
"train_settings": {
"source": "path/to/table_b.csv"
Expand All @@ -266,7 +255,18 @@ def test_validate_metadata_of_related_tables_with_fk_key_in_train_process(
}
}
}
}
},
"table_a": {
"train_settings": {
"source": "path/to/table_a.csv"
},
"keys": {
"pk_id": {
"type": "PK",
"columns": ["id"]
}
}
},
}
validator = Validator(
metadata=test_metadata,
@@ -2021,8 +2021,8 @@ def test_check_not_existent_referenced_columns_in_fk(rp_logger):
assert validator.merged_metadata == test_metadata
assert str(error.value) == (
"The validation of the metadata has been failed. The error(s) found in - \n"
"\"check existence of the key columns in 'referenced.columns'\": {\n \"fk_id\": "
"\"The 'referenced.columns' of the FK 'fk_id' - 'non-existent column' don't exist "
"\"check existence of the key columns in 'references.columns'\": {\n \"fk_id\": "
"\"The 'references.columns' of the FK 'fk_id' - 'non-existent column' don't exist "
"in the referenced table - 'table_b'\"}"
)
rp_logger.info(SUCCESSFUL_MESSAGE)
