provide the section 'reports' in the metadata field instead of the parameter 'print_report' (#471)

* provide the parameter 'reports' instead of 'print_report', refactor the code, update unit tests

* refactor the code, update unit tests

* update 'README.md'

* refactor the code, update unit tests

* update 'README.md'

* update unit tests

* update unit tests

* update unit tests

* update 'README.md'

* fix issues raised by 'flake8'

* refactor the method 'validate_parameter_reports' of 'ml/utils'

* minor changes in help strings provided for parameters in 'ml/train.py', 'ml/infer.py'

* refactor the class Worker

* refactor the class Worker

* refactor the code in 'ml/config', 'ml/strategies', 'ml/worker', update unit tests

* refactor the code

* fix the method 'run' of the class InferStrategy

* update 'VERSION'

* update unit tests

* update unit tests, fix issues raised by 'flake8'

* update unit tests in 'tests/unit/validation_schema'

* refactor the code

* refactor the code

* refactor the code, update unit tests

* refactor the class YAMLLoader, update 'README.md', 'VERSION'

* update 'README.md'

* refactor the code

* minor changes

* refactor the class InferConfig

* update 'VERSION'

* minor changes in 'train.py', 'infer.py'

* minor changes in the class Validator

* add the class ReportTypes in 'ml/validation_schema'

* update 'VERSION'

* refactor the class Report

* refactor the class BaseTest

* refactor the method '_should_generate_data' of the class Worker

* update 'tests/unit/launchers'

* update 'tests/unit/launchers/test_launch-train.py'

* refactor the log message in the method '_set_up_reporting' of the class InferConfig

* refactor the class ReportTypes in 'ml/validation_schema'

* refactor the method '_should_generate_data' of the class Worker

* update 'VERSION'

* refactor the class Validator

* refactor the method 'validate_parameter_reports' of 'ml/utils'

* refactor the code

* minor changes in the class YAMLLoader

* minor changes in 'ml/reporters'

* update the method '_update_dataset' of the class VAEWrapper

* refactor the class InferConfig

* refactor the code

* refactor the class InferConfig

* refactor the class InferConfig

* update 'VERSION'

* refactor the code

* refactor 'ml/strategies'

* update 'VERSION'

* refactor the code

* refactor the code

* refactor the code related to support of the streamlit app

* minor changes in 'syngen/streamlit_app/handlers/handlers.py'

* upgrade the library 'aiohttp' in 'requirements.txt'

* refactor the code

* fix issues raised by 'flake8'

* update unit tests in 'tests/unit/test_worker/test_worker.py'

* update unit tests in 'tests/unit/test_worker'

* refactor unit tests in 'tests/unit/test_worker'

* fix the vulnerability, update 'requirements.txt', 'setup.cfg', add the warning message in the method 'generate_report' of the class Report

* refactor the code

* fix issues raised by flake8

* update unit tests

* fix issues raised by 'flake8'

* update the log message in the method 'generate_report' of the class Report

* minor changes in 'ml/config', 'ml/mlflow_tracker'

* changes in the documentation

* minor changes in 'README.md', 'train.py'

* refactor 'README.md'

* minor changes in 'train.py', 'infer.py'

* refactor the class YAMLLoader, the code in 'ml/validation_schema', update 'README.md', update unit tests

* minor changes in 'ml/validation_schema', update 'VERSION'

* minor changes in 'ml/handlers'

* update 'README.md'

* refactor the code in the class Dataset, minor changes in 'ml/validation_schema'

* update 'VERSION'

* minor changes in 'README.md'

---------

Co-authored-by: Hanna Imshenetska <[email protected]@EVZZAMZSA0021.epam.com>
Anna050689 and Hanna Imshenetska authored Dec 10, 2024
1 parent ae15d29 commit 0436a15
Showing 52 changed files with 3,274 additions and 1,374 deletions.
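In practice, this change moves report configuration from the boolean 'print_report' flag to a list-valued 'reports' setting. The metadata snippet below is a hypothetical sketch rather than content of this commit: the table name is illustrative, the 'train_settings'/'infer_settings' keys follow the project's documented metadata layout, and the accepted values are inferred from the help text and report-check logic in the diffs below ('sample' applies to training only):

my_table:
  train_settings:
    reports:
      - accuracy        # other accepted values: 'sample', 'metrics_only', 'all', 'none'
  infer_settings:
    reports:
      - accuracy
      - metrics_only    # other accepted values: 'all', 'none'

Previously, the closest equivalent was the single boolean setting 'print_report: True'.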
133 changes: 106 additions & 27 deletions README.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-aiohttp>=3.9.0
+aiohttp>=3.10.11
 attrs
 avro
 base32-crockford
@@ -31,6 +31,7 @@ scipy==1.14.*
 seaborn==0.13.*
 setuptools==74.1.*
 tensorflow==2.15.*
+tornado==6.4.*
 tqdm==4.66.3
 Werkzeug==3.1.2
 xlrd
3 changes: 2 additions & 1 deletion setup.cfg
@@ -25,7 +25,7 @@ packages = find:
 include_package_data = True
 python_requires = >3.9, <3.12
 install_requires =
-    aiohttp>=3.9.0
+    aiohttp>=3.10.11
     attrs
     avro
     base32-crockford
@@ -58,6 +58,7 @@ install_requires =
     seaborn==0.13.*
     setuptools==74.1.*
     tensorflow==2.15.*
+    tornado==6.4.*
     tqdm==4.66.3
     Werkzeug==3.1.2
     xlrd
2 changes: 1 addition & 1 deletion src/syngen/VERSION
@@ -1 +1 @@
-0.9.52
+0.10.0
36 changes: 25 additions & 11 deletions src/syngen/infer.py
@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import Optional, List
 import traceback
 
 import click
@@ -11,6 +11,14 @@
     set_log_path,
     check_if_logs_available
 )
+from syngen.ml.utils import validate_parameter_reports
+from syngen.ml.validation_schema import ReportTypes
+
+
+validate_reports = validate_parameter_reports(
+    report_types=ReportTypes().infer_report_types,
+    full_list=ReportTypes().full_list_of_infer_report_types
+)
 
 
 @click.command()
@@ -48,11 +56,18 @@
     "use the same int in this command.",
 )
 @click.option(
-    "--print_report",
-    default=False,
-    type=click.BOOL,
-    help="Whether to print quality report. Might require significant time "
-    "for big generated tables (>1000 rows). If absent, it's defaulted to False",
+    "--reports",
+    default=("none",),
+    type=click.UNPROCESSED,
+    multiple=True,
+    callback=validate_reports,
+    help="Controls the generation of quality reports. "
+    "Might require significant time for big generated tables (>10000 rows). "
+    "If set to 'accuracy', generates an accuracy report. "
+    "If set to 'metrics_only', outputs the metrics information "
+    "only to standard output without generation of a report. "
+    "If set to 'all', generates an accuracy report. "
+    "If it's absent or set to 'none', no reports are generated.",
 )
 @click.option(
     "--log_level",
@@ -67,7 +82,7 @@ def launch_infer(
     table_name: Optional[str],
     run_parallel: bool,
     batch_size: Optional[int],
-    print_report: bool,
+    reports: List[str],
     random_seed: Optional[int],
     log_level: str,
 ):
@@ -80,7 +95,7 @@
     table_name
     run_parallel
     batch_size
-    print_report
+    reports
     random_seed
     log_level
     -------
@@ -111,9 +126,8 @@
         "size": size,
         "run_parallel": run_parallel,
         "batch_size": batch_size,
-        "print_report": print_report,
-        "random_seed": random_seed,
-        "get_infer_metrics": False
+        "reports": reports,
+        "random_seed": random_seed
     }
     worker = Worker(
         table_name=table_name,
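Because the new '--reports' option is declared with 'multiple=True', several report types can be requested in one run by repeating the flag. A hypothetical invocation, assuming the 'infer' console entry point installed by the package and an illustrative table name:

infer --table_name my_table --reports accuracy --reports metrics_only

The pre-change equivalent was limited to the single boolean flag '--print_report True'.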
134 changes: 89 additions & 45 deletions src/syngen/ml/config/configurations.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
-from typing import Optional, Dict, Tuple, Set, List, Callable
+from typing import Optional, Dict, Tuple, Set, List, Callable, Literal
 import os
+from copy import deepcopy
 import shutil
 from datetime import datetime
 
@@ -23,11 +24,12 @@ class TrainConfig:
     drop_null: bool
     row_limit: Optional[int]
     table_name: Optional[str]
-    metadata_path: Optional[str]
-    print_report: bool
+    metadata: Dict
+    reports: List[str]
     batch_size: int
     loader: Optional[Callable[[str], pd.DataFrame]]
     data: pd.DataFrame = field(init=False)
+    initial_data_shape: Tuple[int, int] = field(init=False)
     paths: Dict = field(init=False)
     row_subset: int = field(init=False)
     schema: Dict = field(init=False)
@@ -37,7 +39,7 @@
     dropped_columns: Set = field(init=False)
 
     def __post_init__(self):
-        self.paths = self._get_paths()
+        self._set_paths()
         self._remove_existed_artifacts()
         self._prepare_dirs()
 
@@ -59,6 +61,7 @@ def preprocess_data(self):
         self._remove_empty_columns()
         self._mark_removed_columns()
         self._prepare_data()
+        self._check_reports()
 
     def to_dict(self) -> Dict:
         """
@@ -69,7 +72,7 @@ def to_dict(self) -> Dict:
             "drop_null": self.drop_null,
             "row_subset": self.row_subset,
             "batch_size": self.batch_size,
-            "print_report": self.print_report
+            "reports": self.reports
         }
 
     def _set_batch_size(self):
@@ -78,6 +81,25 @@ def _set_batch_size(self):
         """
         self.batch_size = min(self.batch_size, self.row_subset)
 
+    def _check_sample_report(self):
+        """
+        Check whether it is necessary to generate a certain report
+        """
+        if "sample" in self.reports and self.initial_data_shape[0] == self.row_subset:
+            logger.warning(
+                "The generation of the sample report is unnecessary and won't be produced "
+                "as the source data and sampled data sizes are identical"
+            )
+            reports = deepcopy(self.reports)
+            reports.remove("sample")
+            self.reports = reports
+
+    def _check_reports(self):
+        """
+        Check whether it is necessary to generate a certain report
+        """
+        self._check_sample_report()
+
     def _remove_existed_artifacts(self):
         """
         Remove existed artifacts from previous train process
@@ -166,6 +188,7 @@ def _extract_data(self):
         Extract data and schema necessary for training process
         """
         self.data, self.schema = self._load_source()
+        self.initial_data_shape = self.data.shape
         self._check_if_data_is_empty()
 
     def _preprocess_data(self):
@@ -243,15 +266,15 @@ def _prepare_data(self):
         self._save_input_data()
 
     @slugify_attribute(table_name="slugify_table_name")
-    def _get_paths(self) -> Dict:
+    def _set_paths(self):
         """
         Create the paths which used in training process
         """
         losses_file_name = (
             f"losses_{self.table_name}_"
             f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
         )
-        return {
+        self.paths = {
             "model_artifacts_path": "model_artifacts/",
             "resources_path": f"model_artifacts/resources/{self.slugify_table_name}/",
             "tmp_store_path": f"model_artifacts/tmp_store/{self.slugify_table_name}/",
@@ -285,74 +308,93 @@ class InferConfig:
     """
 
     destination: Optional[str]
+    metadata: Dict
+    metadata_path: Optional[str]
     size: Optional[int]
     table_name: Optional[str]
     run_parallel: bool
     batch_size: Optional[int]
-    metadata_path: Optional[str]
     random_seed: Optional[int]
-    print_report: bool
-    get_infer_metrics: bool
+    reports: List[str]
     both_keys: bool
     log_level: str
     loader: Optional[Callable[[str], pd.DataFrame]]
+    type_of_process: Literal["train", "infer"]
    slugify_table_name: str = field(init=False)
 
     def __post_init__(self):
-        self.paths = self._get_paths()
-        self._set_up_reporting()
+        self._set_paths()
+        self._remove_artifacts()
         self._set_infer_parameters()
 
     def _set_infer_parameters(self):
+        self._check_reports()
         self._set_up_size()
         self._set_up_batch_size()
 
+    def _remove_reports(self):
+        path_to_reports = self.paths["reports_path"]
+        if os.path.exists(path_to_reports):
+            shutil.rmtree(path_to_reports)
+            logger.info(
+                f"The reports generated in the previous run of an inference process "
+                f"and located in the path - '{path_to_reports}' were removed"
+            )
+
+    def _remove_generated_data(self):
+        default_path_to_synth_data = self.paths["default_path_to_merged_infer"]
+        if os.path.exists(default_path_to_synth_data):
+            os.remove(default_path_to_synth_data)
+            logger.info(
+                f"The synthetic data generated in the previous run of an inference process and "
+                f"located in the path - '{default_path_to_synth_data}' was removed"
+            )
+
+    def _remove_artifacts(self):
+        """
+        Remove artifacts related to the previous generation process
+        """
+        self._remove_reports()
+        self._remove_generated_data()
+
     def to_dict(self) -> Dict:
         """
         Return the values of the settings of inference process
         :return:
         """
         return {
             "size": self.size,
             "run_parallel": self.run_parallel,
             "batch_size": self.batch_size,
             "random_seed": self.random_seed,
-            "print_report": self.print_report,
-            "get_infer_metrics": self.get_infer_metrics,
+            "reports": self.reports,
         }
 
-    def _set_up_reporting(self):
+    def _check_required_artifacts(self):
         """
-        Check whether it is possible to generate the report
+        Check whether required artifacts exists
         """
         if (
-            (self.print_report or self.get_infer_metrics)
+            self.reports
             and (
-                not DataLoader(self.paths["input_data_path"]).has_existed_path
-                and not self.loader
+                DataLoader(self.paths["input_data_path"]).has_existed_path is False
+                or self.loader is not None
             )
         ):
-            message = (
-                f"It seems that the path to the sample of the original data "
-                f"of the table '{self.table_name}' - '{self.paths['input_data_path']}' "
-                f"doesn't exist."
-            )
-            logger.warning(message)
-            if self.print_report:
-                self.print_report = False
-                log_message = (
-                    "As a result, the accuracy report of the table - "
-                    f"'{self.table_name}' won't be generated. "
-                    "The parameter '--print_report' of the table - "
-                    f"'{self.table_name}' has been set to False"
-                )
-                logger.warning(log_message)
-            if self.get_infer_metrics:
-                self.get_infer_metrics = False
-                log_message = (
-                    "As a result, the infer metrics related to the table - "
-                    f"'{self.table_name}' won't be fetched. "
-                    "The parameter '--get_infer_metrics' of the table - "
-                    f"'{self.table_name}' has been set to False"
-                )
-                logger.warning(log_message)
+            self.reports = list()
+            log_message = (
+                f"It seems that the path to the sample of the original data for the table "
+                f"'{self.table_name}' at '{self.paths['input_data_path']}' does not exist. "
+                f"As a result, no reports for the table '{self.table_name}' will be generated. "
+                f"The 'reports' parameter for the table '{self.table_name}' "
+                f"has been set to 'none'."
+            )
+            logger.warning(log_message)
+
+    def _check_reports(self):
+        """
+        Check whether it is possible to generate reports
+        """
+        self._check_required_artifacts()
 
     def _set_up_size(self):
         """
@@ -379,17 +421,19 @@ def _set_up_batch_size(self):
         )
 
     @slugify_attribute(table_name="slugify_table_name")
-    def _get_paths(self) -> Dict:
+    def _set_paths(self):
         """
         Create the paths which used in inference process
         """
         dynamic_name = self.slugify_table_name[:-3] if self.both_keys else self.slugify_table_name
-        return {
+        self.paths = {
             "original_data_path":
                 f"model_artifacts/tmp_store/{dynamic_name}/input_data_{dynamic_name}.pkl",
+            "reports_path": f"model_artifacts/tmp_store/{dynamic_name}/reports",
             "input_data_path":
                 f"model_artifacts/tmp_store/{dynamic_name}/input_data_{dynamic_name}.pkl",
+            "default_path_to_merged_infer": f"model_artifacts/tmp_store/{dynamic_name}/"
+                                            f"merged_infer_{dynamic_name}.csv",
             "path_to_merged_infer": self.destination
             if self.destination is not None
             else f"model_artifacts/tmp_store/{dynamic_name}/merged_infer_{dynamic_name}.csv",
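The 'validate_reports' callback wired into 'infer.py' above is produced by 'validate_parameter_reports' from 'ml/utils', which is among the changed files not shown in this excerpt. Below is a minimal sketch of what such a click callback factory could look like; the body is an assumption inferred from the option declaration (multiple values, default '("none",)') and the help text, not the actual implementation:

from typing import List, Tuple

import click


def validate_parameter_reports(report_types: List[str], full_list: List[str]):
    # Hypothetical factory returning a click callback that validates
    # the multiple-valued '--reports' option.
    def callback(ctx: click.Context, param: click.Parameter, value: Tuple[str, ...]) -> List[str]:
        reports = [str(item).lower() for item in value]
        if reports == ["none"]:
            # The default ("none",) disables report generation entirely
            return []
        if "all" in reports:
            # 'all' expands to every report type supported by the process
            return list(full_list)
        unknown = [item for item in reports if item not in report_types]
        if unknown:
            raise click.BadParameter(
                f"Unknown report type(s): {', '.join(unknown)}. "
                f"Allowed values: {', '.join([*report_types, 'none', 'all'])}"
            )
        return reports

    return callback

Building the callback through a factory lets 'train.py' and 'infer.py' share one validator while passing process-specific report types, which matches how 'ReportTypes().infer_report_types' is supplied in 'infer.py'.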