add the class ReportTypes in 'ml/validation_schema'

tdspora · Nov 14, 2024 · 7d591c7 · 7d591c7
1 parent 490cec8
commit 7d591c7
Show file tree

Hide file tree

Showing 9 changed files with 50 additions and 24 deletions.
diff --git a/src/syngen/infer.py b/src/syngen/infer.py
@@ -12,12 +12,12 @@
     check_if_logs_available
 )
 from syngen.ml.utils import validate_parameter_reports
-from syngen.ml.validation_schema import INFER_REPORT_TYPES
+from syngen.ml.validation_schema import ReportTypes
 
 
 validate_reports = validate_parameter_reports(
-    report_types=INFER_REPORT_TYPES,
-    full_list=["accuracy"]
+    report_types=ReportTypes().infer_report_types,
+    full_list=ReportTypes().full_list_of_infer_report_types
 )
 
 

diff --git a/src/syngen/ml/config/validation.py b/src/syngen/ml/config/validation.py
@@ -8,7 +8,7 @@
 from slugify import slugify
 from loguru import logger
 from syngen.ml.data_loaders import MetadataLoader, DataLoader
-from syngen.ml.validation_schema import ValidationSchema, INFER_REPORT_TYPES
+from syngen.ml.validation_schema import ValidationSchema, ReportTypes
 
 
 @dataclass
@@ -69,7 +69,7 @@ def _check_conditions(self, metadata: Dict) -> bool:
             self.type_of_process == "infer"
             or (
                 self.type_of_process == "train" and
-                any([item in INFER_REPORT_TYPES for item in reports])
+                any([item in ReportTypes().infer_report_types for item in reports])
             )
         )
 

diff --git a/src/syngen/ml/data_loaders/data_loaders.py b/src/syngen/ml/data_loaders/data_loaders.py
@@ -24,6 +24,7 @@
 from syngen.ml.validation_schema import (
     ExcelFormatSettingsSchema,
     CSVFormatSettingsSchema,
+    ReportTypes
 )
 
 DELIMITERS = {"\\t": "\t"}
@@ -409,8 +410,8 @@ class YAMLLoader(BaseDataLoader):
     Class for loading and saving data in YAML format
     """
     metadata_sections = ["train_settings", "infer_settings", "format", "keys"]
-    infer_reports = ["accuracy"]
-    train_reports = infer_reports + ["sample"]
+    infer_reports = ReportTypes().full_list_of_infer_report_types
+    train_reports = ReportTypes().full_list_of_train_report_types
 
     def __init__(self, path: str):
         super().__init__(path)

diff --git a/src/syngen/ml/validation_schema/__init__.py b/src/syngen/ml/validation_schema/__init__.py
@@ -8,6 +8,5 @@
     KeysSchema,
     ValidationSchema,
     SUPPORTED_EXCEL_EXTENSIONS,
-    TRAIN_REPORT_TYPES,
-    INFER_REPORT_TYPES
+    ReportTypes
 )
diff --git a/src/syngen/ml/validation_schema/validation_schema.py b/src/syngen/ml/validation_schema/validation_schema.py
@@ -13,8 +13,30 @@
 from loguru import logger
 
 SUPPORTED_EXCEL_EXTENSIONS = [".xls", ".xlsx"]
-INFER_REPORT_TYPES = ["accuracy", "metrics_only"]
-TRAIN_REPORT_TYPES = INFER_REPORT_TYPES + ["sample"]
+
+
+class ReportTypes:
+    def __init__(self):
+        self.infer_report_types = ["accuracy", "metrics_only"]
+        self.train_report_types = self.infer_report_types + ["sample"]
+        self.excluded_reports = ["metrics_only"]
+        self.full_list_of_train_report_types = self.get_list_of_report_types("train")
+        self.full_list_of_infer_report_types = self.get_list_of_report_types("infer")
+
+    def get_list_of_report_types(self, type_of_process: Literal["train", "infer"]):
+        """
+        Get the full list of reports that should be generated
+        if the parameter 'reports' is set to 'all'
+        """
+        report_types = (
+            self.train_report_types
+            if type_of_process == "train"
+            else self.infer_report_types
+        )
+        full_list = report_types.copy()
+        for report in self.excluded_reports:
+            full_list.remove(report)
+        return full_list
 
 
 class ReferenceSchema(Schema):
@@ -82,7 +104,7 @@ class TrainingSettingsSchema(Schema):
         required=False,
         validate=(
             lambda x: isinstance(x, list) and
-            all(isinstance(elem, str) and elem in TRAIN_REPORT_TYPES for elem in x)
+            all(isinstance(elem, str) and elem in ReportTypes().train_report_types for elem in x)
         )
     )
 
@@ -109,7 +131,7 @@ class InferSettingsSchema(Schema):
         required=False,
         validate=(
             lambda x: isinstance(x, list) and
-            all(isinstance(elem, str) and elem in INFER_REPORT_TYPES for elem in x)
+            all(isinstance(elem, str) and elem in ReportTypes().infer_report_types for elem in x)
         )
     )
 

diff --git a/src/syngen/ml/worker/worker.py b/src/syngen/ml/worker/worker.py
@@ -16,7 +16,7 @@
 from syngen.ml.context.context import global_context
 from syngen.ml.utils import ProgressBarHandler
 from syngen.ml.mlflow_tracker import MlflowTracker
-from syngen.ml.validation_schema import INFER_REPORT_TYPES
+from syngen.ml.validation_schema import ReportTypes
 
 
 @define
@@ -245,8 +245,7 @@ def _split_pk_fk_metadata(self, config, tables):
     @staticmethod
     def _should_generate_data(
         config_of_tables: Dict,
-        type_of_process: str,
-        list_of_reports: List[str]
+        type_of_process: str
     ):
         """
         Determine whether the synthetic data should be generated
@@ -255,7 +254,8 @@ def _should_generate_data(
         return any(
             [
                 report in config.get(f"{type_of_process}_settings", {}).get("reports", [])
-                for report in list_of_reports for config in config_of_tables.values()
+                for report in ReportTypes().infer_report_types
+                for config in config_of_tables.values()
             ]
         )
 
@@ -486,7 +486,8 @@ def launch_train(self):
         ) = metadata_for_inference
 
         generation_of_reports = self._should_generate_data(
-            metadata_for_training, "train", list_of_reports=INFER_REPORT_TYPES
+            metadata_for_training,
+            "train"
         )
 
         self.__train_tables(
@@ -521,7 +522,8 @@ def launch_infer(self):
         tables, config_of_tables = self._prepare_metadata_for_process(type_of_process="infer")
 
         generation_of_reports = self._should_generate_data(
-            config_of_tables, "infer", list_of_reports=INFER_REPORT_TYPES
+            config_of_tables,
+            "infer"
         )
         delta = 0.25 / len(tables) if generation_of_reports else 0.5 / len(tables)
 

diff --git a/src/syngen/train.py b/src/syngen/train.py
@@ -13,12 +13,12 @@
     check_if_logs_available
 )
 from syngen.ml.utils import validate_parameter_reports
-from syngen.ml.validation_schema import TRAIN_REPORT_TYPES
+from syngen.ml.validation_schema import ReportTypes
 
 
 validate_reports = validate_parameter_reports(
-    report_types=TRAIN_REPORT_TYPES,
-    full_list=["accuracy", "sample"]
+    report_types=ReportTypes().train_report_types,
+    full_list=ReportTypes().full_list_of_train_report_types
 )
 
 

diff --git a/src/tests/unit/launchers/test_launch_infer.py b/src/tests/unit/launchers/test_launch_infer.py
@@ -4,12 +4,13 @@
 
 from syngen.infer import launch_infer
 from syngen.ml.worker import Worker
-from syngen.ml.validation_schema import INFER_REPORT_TYPES
+from syngen.ml.validation_schema import ReportTypes
 from tests.conftest import SUCCESSFUL_MESSAGE, DIR_NAME
 
 
 TABLE_NAME = "test_table"
 PATH_TO_METADATA = f"{DIR_NAME}/unit/launchers/fixtures/metadata.yaml"
+INFER_REPORT_TYPES = ReportTypes().infer_report_types
 
 
 @patch.object(Worker, "launch_infer")

diff --git a/src/tests/unit/launchers/test_launch_train.py b/src/tests/unit/launchers/test_launch_train.py
@@ -4,12 +4,13 @@
 
 from syngen.train import launch_train
 from syngen.ml.worker import Worker
-from syngen.ml.validation_schema import TRAIN_REPORT_TYPES
+from syngen.ml.validation_schema import ReportTypes
 from tests.conftest import SUCCESSFUL_MESSAGE, DIR_NAME
 
 TABLE_NAME = "test_table"
 PATH_TO_TABLE = f"{DIR_NAME}/unit/launchers/fixtures/table_with_data.csv"
 PATH_TO_METADATA = f"{DIR_NAME}/unit/launchers/fixtures/metadata.yaml"
+TRAIN_REPORT_TYPES = ReportTypes().train_report_types
 
 
 @patch.object(Worker, "launch_train")