provide the section 'reports' in the metadata field instead of the parameter 'print_report' (#471)

* provide the parameter 'reports' instead of 'print_report', refactor the code, update unit tests

* refactor the code, update unit tests

* update 'README.md'

* refactor the code, update unit tests

* update 'README.md'

* update unit tests

* update unit tests

* update unit tests

* update 'README.md'

* fix issues raised by 'flake8'

* refactor the method 'validate_parameter_reports' of 'ml/utils'

* minor changes in help strings provided for parameters in 'ml/train.py', 'ml/infer.py'

* refactor the class Worker

* refactor the class Worker

* refactor the code in 'ml/config', 'ml/strategies', 'ml/worker', update unit tests

* refactor the code

* fix the method 'run' of the class InferStrategy

* update 'VERSION'

* update unit tests

* update unit tests, fix issues raised by 'flake8'

* update unit tests in 'tests/unit/validation_schema'

* refactor the code

* refactor the code

* refactor the code, update unit tests

* refactor the class YAMLLoader, update 'README.md', 'VERSION'

* update 'README.md'

* refactor the code

* minor changes

* refactor the class InferConfig

* update 'VERSION'

* minor changes in 'train.py', 'infer.py'

* minor changes in the class Validator

* add the class ReportTypes in 'ml/validation_schema'

* update 'VERSION'

* refactor the class Report

* refactor the class BaseTest

* refactor the method '_should_generate_data' of the class Worker

* update 'tests/unit/launchers'

* update 'tests/unit/launchers/test_launch-train.py'

* refactor the log message in the method '_set_up_reporting' of the class InferConfig

* refactor the class ReportTypes in 'ml/validation_schema'

* refactor the method '_should_generate_data' of the class Worker

* update 'VERSION'

* refactor the class Validator

* refactor the method 'validate_parameter_reports' of 'ml/utils'

* refactor the code

* minor changes in the class YAMLLoader

* minor changes in 'ml/reporters'

* update the method '_update_dataset' of the class VAEWrapper

* refactor the class InferConfig

* refactor the code

* refactor the class InferConfig

* refactor the class InferConfig

* update 'VERSION'

* refactor the code

* refactor 'ml/strategies'

* update 'VERSION'

* refactor the code

* refactor the code

* refactor the code related to support of the streamlit app

* minor changes in 'syngen/streamlit_app/handlers/handlers.py'

* upgrade the library 'aiohttp' in 'requirements.txt'

* refactor the code

* fix issues raised by 'flake8'

* update unit tests in 'tests/unit/test_worker/test_worker.py'

* update unit tests in 'tests/unit/test_worker'

* refactor unit tests in 'tests/unit/test_worker'

* fix the vulnerability, update 'requirements.txt', 'setup.cfg', add the warning message in the method 'generate_report' of the class Report

* refactor the code

* fix issues raised by flake8

* update unit tests

* fix issues raised by 'flake8'

* update the log message in the method 'generate_report' of the class Report

* minor changes in 'ml/config', 'ml/mlflow_tracker'

* changes in the documentation

* minor changes in 'README.md', 'train.py'

* refactor 'README.md'

* minor changes in 'train.py', 'infer.py'

* refactor the class YAMLLoader, the code in 'ml/validation_schema', update 'README.md', update unit tests

* minor changes in 'ml/validation_schema', update 'VERSION'

* minor changes in 'ml/handlers'

* update 'README.md'

* refactor the code in the class Dataset, minor changes in 'ml/validation_schema'

* update 'VERSION'

* minor changes in 'README.md'

---------

Co-authored-by: Hanna Imshenetska <[email protected]@EVZZAMZSA0021.epam.com>
Anna050689 and Hanna Imshenetska authored Dec 10, 2024
1 parent ae15d29 commit 0436a15
Showing 52 changed files with 3,274 additions and 1,374 deletions.
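In practice, this change moves report configuration from the boolean 'print_report' flag to a list-valued 'reports' setting. The metadata snippet below is a hypothetical sketch rather than content of this commit: the table name is illustrative, the 'train_settings'/'infer_settings' keys follow the project's documented metadata layout, and the accepted values are inferred from the help text and report-check logic in the diffs below ('sample' applies to training only):

my_table:
  train_settings:
    reports:
      - accuracy        # other accepted values: 'sample', 'metrics_only', 'all', 'none'
  infer_settings:
    reports:
      - accuracy
      - metrics_only    # other accepted values: 'all', 'none'

Previously, the closest equivalent was the single boolean setting 'print_report: True'.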
133 changes: 106 additions & 27 deletions README.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-aiohttp>=3.9.0
+aiohttp>=3.10.11
 attrs
 avro
 base32-crockford
@@ -31,6 +31,7 @@ scipy==1.14.*
 seaborn==0.13.*
 setuptools==74.1.*
 tensorflow==2.15.*
+tornado==6.4.*
 tqdm==4.66.3
 Werkzeug==3.1.2
 xlrd
3 changes: 2 additions & 1 deletion setup.cfg
@@ -25,7 +25,7 @@ packages = find:
 include_package_data = True
 python_requires = >3.9, <3.12
 install_requires =
-    aiohttp>=3.9.0
+    aiohttp>=3.10.11
     attrs
     avro
     base32-crockford
@@ -58,6 +58,7 @@ install_requires =
     seaborn==0.13.*
     setuptools==74.1.*
     tensorflow==2.15.*
+    tornado==6.4.*
     tqdm==4.66.3
     Werkzeug==3.1.2
     xlrd
2 changes: 1 addition & 1 deletion src/syngen/VERSION
@@ -1 +1 @@
-0.9.52
+0.10.0
36 changes: 25 additions & 11 deletions src/syngen/infer.py
@@ -1,5 +1,5 @@
 import os
-from typing import Optional
+from typing import Optional, List
 import traceback
 
 import click
@@ -11,6 +11,14 @@
     set_log_path,
     check_if_logs_available
 )
+from syngen.ml.utils import validate_parameter_reports
+from syngen.ml.validation_schema import ReportTypes
+
+
+validate_reports = validate_parameter_reports(
+    report_types=ReportTypes().infer_report_types,
+    full_list=ReportTypes().full_list_of_infer_report_types
+)
 
 
 @click.command()
@@ -48,11 +56,18 @@
     "use the same int in this command.",
 )
 @click.option(
-    "--print_report",
-    default=False,
-    type=click.BOOL,
-    help="Whether to print quality report. Might require significant time "
-    "for big generated tables (>1000 rows). If absent, it's defaulted to False",
+    "--reports",
+    default=("none",),
+    type=click.UNPROCESSED,
+    multiple=True,
+    callback=validate_reports,
+    help="Controls the generation of quality reports. "
+    "Might require significant time for big generated tables (>10000 rows). "
+    "If set to 'accuracy', generates an accuracy report. "
+    "If set to 'metrics_only', outputs the metrics information "
+    "only to standard output without generation of a report. "
+    "If set to 'all', generates an accuracy report. "
+    "If it's absent or set to 'none', no reports are generated.",
 )
 @click.option(
     "--log_level",
@@ -67,7 +82,7 @@ def launch_infer(
     table_name: Optional[str],
     run_parallel: bool,
     batch_size: Optional[int],
-    print_report: bool,
+    reports: List[str],
     random_seed: Optional[int],
     log_level: str,
 ):
@@ -80,7 +95,7 @@
     table_name
     run_parallel
     batch_size
-    print_report
+    reports
     random_seed
     log_level
     -------
@@ -111,9 +126,8 @@
         "size": size,
         "run_parallel": run_parallel,
         "batch_size": batch_size,
-        "print_report": print_report,
-        "random_seed": random_seed,
-        "get_infer_metrics": False
+        "reports": reports,
+        "random_seed": random_seed
     }
     worker = Worker(
         table_name=table_name,
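Because the new '--reports' option is declared with 'multiple=True', several report types can be requested in one run by repeating the flag. A hypothetical invocation, assuming the 'infer' console entry point installed by the package and an illustrative table name:

infer --table_name my_table --reports accuracy --reports metrics_only

The pre-change equivalent was limited to the single boolean flag '--print_report True'.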
134 changes: 89 additions & 45 deletions src/syngen/ml/config/configurations.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
-from typing import Optional, Dict, Tuple, Set, List, Callable
+from typing import Optional, Dict, Tuple, Set, List, Callable, Literal
 import os
+from copy import deepcopy
 import shutil
 from datetime import datetime
 
@@ -23,11 +24,12 @@ class TrainConfig:
     drop_null: bool
     row_limit: Optional[int]
     table_name: Optional[str]
-    metadata_path: Optional[str]
-    print_report: bool
+    metadata: Dict
+    reports: List[str]
     batch_size: int
     loader: Optional[Callable[[str], pd.DataFrame]]
     data: pd.DataFrame = field(init=False)
+    initial_data_shape: Tuple[int, int] = field(init=False)
     paths: Dict = field(init=False)
     row_subset: int = field(init=False)
     schema: Dict = field(init=False)
@@ -37,7 +39,7 @@
     dropped_columns: Set = field(init=False)
 
     def __post_init__(self):
-        self.paths = self._get_paths()
+        self._set_paths()
         self._remove_existed_artifacts()
         self._prepare_dirs()
 
@@ -59,6 +61,7 @@ def preprocess_data(self):
         self._remove_empty_columns()
         self._mark_removed_columns()
         self._prepare_data()
+        self._check_reports()
 
     def to_dict(self) -> Dict:
         """
@@ -69,7 +72,7 @@ def to_dict(self) -> Dict:
             "drop_null": self.drop_null,
             "row_subset": self.row_subset,
             "batch_size": self.batch_size,
-            "print_report": self.print_report
+            "reports": self.reports
         }
 
     def _set_batch_size(self):
@@ -78,6 +81,25 @@ def _set_batch_size(self):
         """
         self.batch_size = min(self.batch_size, self.row_subset)
 
+    def _check_sample_report(self):
+        """
+        Check whether it is necessary to generate a certain report
+        """
+        if "sample" in self.reports and self.initial_data_shape[0] == self.row_subset:
+            logger.warning(
+                "The generation of the sample report is unnecessary and won't be produced "
+                "as the source data and sampled data sizes are identical"
+            )
+            reports = deepcopy(self.reports)
+            reports.remove("sample")
+            self.reports = reports
+
+    def _check_reports(self):
+        """
+        Check whether it is necessary to generate a certain report
+        """
+        self._check_sample_report()
+
     def _remove_existed_artifacts(self):
         """
         Remove existed artifacts from previous train process
@@ -166,6 +188,7 @@ def _extract_data(self):
         Extract data and schema necessary for training process
         """
         self.data, self.schema = self._load_source()
+        self.initial_data_shape = self.data.shape
         self._check_if_data_is_empty()
 
     def _preprocess_data(self):
@@ -243,15 +266,15 @@ def _prepare_data(self):
         self._save_input_data()
 
     @slugify_attribute(table_name="slugify_table_name")
-    def _get_paths(self) -> Dict:
+    def _set_paths(self):
         """
         Create the paths which used in training process
         """
         losses_file_name = (
             f"losses_{self.table_name}_"
             f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
         )
-        return {
+        self.paths = {
             "model_artifacts_path": "model_artifacts/",
             "resources_path": f"model_artifacts/resources/{self.slugify_table_name}/",
             "tmp_store_path": f"model_artifacts/tmp_store/{self.slugify_table_name}/",
@@ -285,74 +308,93 @@ class InferConfig:
     """
 
     destination: Optional[str]
+    metadata: Dict
+    metadata_path: Optional[str]
     size: Optional[int]
     table_name: Optional[str]
     run_parallel: bool
     batch_size: Optional[int]
-    metadata_path: Optional[str]
     random_seed: Optional[int]
-    print_report: bool
-    get_infer_metrics: bool
+    reports: List[str]
     both_keys: bool
     log_level: str
     loader: Optional[Callable[[str], pd.DataFrame]]
+    type_of_process: Literal["train", "infer"]
    slugify_table_name: str = field(init=False)
 
     def __post_init__(self):
-        self.paths = self._get_paths()
-        self._set_up_reporting()
+        self._set_paths()
+        self._remove_artifacts()
         self._set_infer_parameters()
 
     def _set_infer_parameters(self):
+        self._check_reports()
         self._set_up_size()
         self._set_up_batch_size()
 
+    def _remove_reports(self):
+        path_to_reports = self.paths["reports_path"]
+        if os.path.exists(path_to_reports):
+            shutil.rmtree(path_to_reports)
+            logger.info(
+                f"The reports generated in the previous run of an inference process "
+                f"and located in the path - '{path_to_reports}' were removed"
+            )
+
+    def _remove_generated_data(self):
+        default_path_to_synth_data = self.paths["default_path_to_merged_infer"]
+        if os.path.exists(default_path_to_synth_data):
+            os.remove(default_path_to_synth_data)
+            logger.info(
+                f"The synthetic data generated in the previous run of an inference process and "
+                f"located in the path - '{default_path_to_synth_data}' was removed"
+            )
+
+    def _remove_artifacts(self):
+        """
+        Remove artifacts related to the previous generation process
+        """
+        self._remove_reports()
+        self._remove_generated_data()
+
     def to_dict(self) -> Dict:
         """
         Return the values of the settings of inference process
         :return:
         """
         return {
             "size": self.size,
             "run_parallel": self.run_parallel,
             "batch_size": self.batch_size,
             "random_seed": self.random_seed,
-            "print_report": self.print_report,
-            "get_infer_metrics": self.get_infer_metrics,
+            "reports": self.reports,
         }
 
-    def _set_up_reporting(self):
+    def _check_required_artifacts(self):
         """
-        Check whether it is possible to generate the report
+        Check whether required artifacts exists
         """
         if (
-            (self.print_report or self.get_infer_metrics)
+            self.reports
             and (
-                not DataLoader(self.paths["input_data_path"]).has_existed_path
-                and not self.loader
+                DataLoader(self.paths["input_data_path"]).has_existed_path is False
+                or self.loader is not None
             )
         ):
-            message = (
-                f"It seems that the path to the sample of the original data "
-                f"of the table '{self.table_name}' - '{self.paths['input_data_path']}' "
-                f"doesn't exist."
-            )
-            logger.warning(message)
-            if self.print_report:
-                self.print_report = False
-                log_message = (
-                    "As a result, the accuracy report of the table - "
-                    f"'{self.table_name}' won't be generated. "
-                    "The parameter '--print_report' of the table - "
-                    f"'{self.table_name}' has been set to False"
-                )
-                logger.warning(log_message)
-            if self.get_infer_metrics:
-                self.get_infer_metrics = False
-                log_message = (
-                    "As a result, the infer metrics related to the table - "
-                    f"'{self.table_name}' won't be fetched. "
-                    "The parameter '--get_infer_metrics' of the table - "
-                    f"'{self.table_name}' has been set to False"
-                )
-                logger.warning(log_message)
+            self.reports = list()
+            log_message = (
+                f"It seems that the path to the sample of the original data for the table "
+                f"'{self.table_name}' at '{self.paths['input_data_path']}' does not exist. "
+                f"As a result, no reports for the table '{self.table_name}' will be generated. "
+                f"The 'reports' parameter for the table '{self.table_name}' "
+                f"has been set to 'none'."
+            )
+            logger.warning(log_message)
+
+    def _check_reports(self):
+        """
+        Check whether it is possible to generate reports
+        """
+        self._check_required_artifacts()
 
     def _set_up_size(self):
         """
@@ -379,17 +421,19 @@ def _set_up_batch_size(self):
         )
 
     @slugify_attribute(table_name="slugify_table_name")
-    def _get_paths(self) -> Dict:
+    def _set_paths(self):
         """
         Create the paths which used in inference process
         """
         dynamic_name = self.slugify_table_name[:-3] if self.both_keys else self.slugify_table_name
-        return {
+        self.paths = {
             "original_data_path":
                 f"model_artifacts/tmp_store/{dynamic_name}/input_data_{dynamic_name}.pkl",
+            "reports_path": f"model_artifacts/tmp_store/{dynamic_name}/reports",
             "input_data_path":
                 f"model_artifacts/tmp_store/{dynamic_name}/input_data_{dynamic_name}.pkl",
+            "default_path_to_merged_infer": f"model_artifacts/tmp_store/{dynamic_name}/"
+                                            f"merged_infer_{dynamic_name}.csv",
             "path_to_merged_infer": self.destination
             if self.destination is not None
             else f"model_artifacts/tmp_store/{dynamic_name}/merged_infer_{dynamic_name}.csv",
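The 'validate_reports' callback wired into 'infer.py' above is produced by 'validate_parameter_reports' from 'ml/utils', which is among the changed files not shown in this excerpt. Below is a minimal sketch of what such a click callback factory could look like; the body is an assumption inferred from the option declaration (multiple values, default '("none",)') and the help text, not the actual implementation:

from typing import List, Tuple

import click


def validate_parameter_reports(report_types: List[str], full_list: List[str]):
    # Hypothetical factory returning a click callback that validates
    # the multiple-valued '--reports' option.
    def callback(ctx: click.Context, param: click.Parameter, value: Tuple[str, ...]) -> List[str]:
        reports = [str(item).lower() for item in value]
        if reports == ["none"]:
            # The default ("none",) disables report generation entirely
            return []
        if "all" in reports:
            # 'all' expands to every report type supported by the process
            return list(full_list)
        unknown = [item for item in reports if item not in report_types]
        if unknown:
            raise click.BadParameter(
                f"Unknown report type(s): {', '.join(unknown)}. "
                f"Allowed values: {', '.join([*report_types, 'none', 'all'])}"
            )
        return reports

    return callback

Building the callback through a factory lets 'train.py' and 'infer.py' share one validator while passing process-specific report types, which matches how 'ReportTypes().infer_report_types' is supplied in 'infer.py'.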