Float rounding is adjusted & case sensitivity in eval metrics is removed #12

Merged · 3 commits · Dec 10, 2024
118 changes: 70 additions & 48 deletions flexml/_model_tuner.py
@@ -14,6 +14,7 @@
f1_score)

from flexml.logger.logger import get_logger
from flexml.helpers import eval_metric_checker


class ModelTuner:
@@ -66,6 +67,25 @@ def __init__(self,

self.logger = get_logger(__name__, "PROD", logging_to_file)

@staticmethod
def __eval_metric_revieser(eval_metric: str) -> str:
"""
Scikit-learn based hyperparameter optimization methods (GridSearchCV & RandomizedSearchCV) require specific names for evaluation metrics

This method is used to revise the evaluation metric name for the optimization process

Parameters
----------
eval_metric : str
The evaluation metric

Returns
-------
str
The revised evaluation metric name, e.g. 'R2' to 'r2', 'Accuracy' to 'accuracy', 'F1 Score' to 'f1_weighted', etc.
"""
return eval_metric.lower() if eval_metric != 'F1 Score' else 'f1_weighted'
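For reference, a minimal standalone sketch of the same name revision (hypothetical helper name revise_eval_metric, not part of flexml), assuming 'F1 Score' -> 'f1_weighted' is the only special case:

    # Hypothetical standalone sketch of the revision above; not part of this PR.
    def revise_eval_metric(eval_metric: str) -> str:
        # GridSearchCV / RandomizedSearchCV expect lowercase scorer names; 'F1 Score' maps to the weighted F1 scorer
        return "f1_weighted" if eval_metric == "F1 Score" else eval_metric.lower()

    assert revise_eval_metric("R2") == "r2"
    assert revise_eval_metric("Accuracy") == "accuracy"
    assert revise_eval_metric("F1 Score") == "f1_weighted"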

def _param_grid_validator(self,
model_available_params: dict,
param_grid: dict) -> dict:
@@ -185,38 +205,38 @@ def _model_evaluator(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score
"""
eval_metric = eval_metric.lower()
eval_metric = eval_metric_checker(self.ml_problem_type, eval_metric)

if eval_metric == 'r2':
return r2_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'mae':
return mean_absolute_error(self.y_test, model.predict(self.X_test))
elif eval_metric == 'mse':
return mean_squared_error(self.y_test, model.predict(self.X_test))
elif eval_metric == 'accuracy':
return accuracy_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'precision':
return precision_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'recall':
return recall_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'f1':
return f1_score(self.y_test, model.predict(self.X_test))
if eval_metric == 'R2':
return round(r2_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'MAE':
return round(mean_absolute_error(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'MSE':
return round(mean_squared_error(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'Accuracy':
return round(accuracy_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'Precision':
return round(precision_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'Recall':
return round(recall_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'F1 Score':
return round(f1_score(self.y_test, model.predict(self.X_test)), 6)
else:
error_msg = "Error while evaluating the current model during the model tuning process. The eval_metric should be one of the following: 'r2', 'mae', 'mse', 'accuracy', 'precision', 'recall', 'f1'"
error_msg = "Error while evaluating the current model during the model tuning process. The eval_metric should be one of the following: 'R2', 'MAE', 'MSE', 'Accuracy', 'Precision', 'Recall', 'F1 Score'"
self.logger.error(error_msg)
raise ValueError(error_msg)

@@ -241,19 +261,19 @@ def grid_search(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score

cv : int (default=3)
The number of cross-validation splits. The default is 3.
@@ -291,10 +311,11 @@
"""
model_stats = self._setup_tuning("GridSearchCV", model, param_grid, n_iter=None, cv=cv, n_jobs=n_jobs)
param_grid = model_stats['tuning_param_grid']
scoring_eval_metric = self.__eval_metric_revieser(eval_metric)

try:
t_start = time()
search_result = GridSearchCV(model, param_grid, scoring=eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
search_result = GridSearchCV(model, param_grid, scoring=scoring_eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
t_end = time()
time_taken = round(t_end - t_start, 2)

@@ -330,19 +351,19 @@ def random_search(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score

n_iter : int, optional (default=10)
The number of trials. The default is 10.
@@ -374,10 +395,11 @@
"""
model_stats = self._setup_tuning("randomized_search", model, param_grid, n_iter=n_iter, cv=cv, n_jobs=n_jobs)
param_grid = model_stats['tuning_param_grid']
scoring_eval_metric = self.__eval_metric_revieser(eval_metric)

try:
t_start = time()
search_result = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter, scoring=eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
search_result = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter, scoring=scoring_eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
t_end = time()
time_taken = round(t_end - t_start, 2)

@@ -413,19 +435,19 @@ def optuna_search(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score

n_iter : int, optional (default=100)
The number of trials. The default is 100.
@@ -483,7 +505,7 @@
elif verbose == 4:
optuna.logging.set_verbosity(optuna.logging.DEBUG)

study_direction = "maximize" if eval_metric in ['r2', 'accuracy', 'precision', 'recall', 'f1'] else "minimize"
study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score'] else "minimize"

def objective(trial):
"""
@@ -526,7 +548,7 @@ def objective(trial):

# Update the best score and best hyperparameters if the current score is better than the best one
if model_stats['tuned_model_score'] is None or score > model_stats['tuned_model_score']:
model_stats['tuned_model_score'] = round(score, 4)
model_stats['tuned_model_score'] = round(score, 6)
model_stats['tuned_model'] = test_model

return score
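To show how the revised scoring name is consumed downstream, here is a minimal standalone sketch using plain scikit-learn (a toy dataset and hypothetical parameter grid, not the flexml API):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = load_iris(return_X_y=True)
    scoring = "f1_weighted"  # what the PR's reviser produces for 'F1 Score'
    search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0]},
                          scoring=scoring, cv=3).fit(X, y)
    print(round(search.best_score_, 6))  # scores are reported to 6 decimals, matching the PR's rounding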
8 changes: 4 additions & 4 deletions flexml/config/supervised_config.py
@@ -11,9 +11,9 @@

# Regression & Classification Evaluation Metrics
EVALUATION_METRICS = {
"Regression": {"DEFAULT": "r2",
"ALL": ["r2", "mae", "mse", "rmse"]},
"Regression": {"DEFAULT": "R2",
"ALL": ["R2", "MAE", "MSE", "RMSE"]},

"Classification": {"DEFAULT": "accuracy",
"ALL": ["accuracy", "precision", "recall", "f1_score"]}
"Classification": {"DEFAULT": "Accuracy",
"ALL": ["Accuracy", "Precision", "Recall", "F1 Score"]}
}
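For clarity, these entries are consumed by simple dictionary lookups, e.g.:

    from flexml.config.supervised_config import EVALUATION_METRICS

    EVALUATION_METRICS["Regression"]["DEFAULT"]    # 'R2'
    EVALUATION_METRICS["Classification"]["ALL"]    # ['Accuracy', 'Precision', 'Recall', 'F1 Score']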
1 change: 1 addition & 0 deletions flexml/helpers/__init__.py
@@ -0,0 +1 @@
from flexml.helpers.validators import eval_metric_checker
62 changes: 62 additions & 0 deletions flexml/helpers/validators.py
@@ -0,0 +1,62 @@
from typing import Optional, List
from flexml.config.supervised_config import EVALUATION_METRICS
from flexml.logger.logger import get_logger

def eval_metric_checker(ml_task_type: str,
eval_metric: Optional[str] = None,
all_evaluation_metrics: Optional[List[str]] = None,
default_evaluation_metric: Optional[str] = None) -> str:
"""
Since setting and validating the eval_metric is a common process for both Regression and Classification tasks,
this function is used to set and validate the evaluation metric.

Parameters
----------
ml_task_type : str
The type of ML task ('Regression' or 'Classification')

eval_metric : str, (default=None)
The evaluation metric to use for model evaluation

If passed as None, the default evaluation metric of the corresponding ml_task_type will be used

all_evaluation_metrics : List[str], (default=None)
All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE'] for Regression

If passed as None, they will be fetched from the config file

default_evaluation_metric : str, (default=None)
The default evaluation metric to use for the current task (Regression or Classification) e.g. 'R2' for Regression, 'Accuracy' for Classification

If passed as None, it will be fetched from the config file

Returns
-------
str
The evaluation metric to use for model evaluation for the current task (Regression or Classification)
"""
logger = get_logger(__name__, "PROD", False)

if default_evaluation_metric is None or all_evaluation_metrics is None:
default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"]
all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"]

if eval_metric is None:
return default_evaluation_metric

if not isinstance(eval_metric, str):
error_msg = f"eval_metric expected to be a string, got {type(eval_metric)}"
logger.error(error_msg)
raise TypeError(error_msg)

if ml_task_type == "Regression":
eval_metric = eval_metric.upper()
else:
# Title-case each word so inputs like 'f1 score' normalize to 'F1 Score' (capitalize() alone would miss multi-word names)
eval_metric = eval_metric.title()

if eval_metric not in all_evaluation_metrics:
error_msg = f"{eval_metric} is not a valid evaluation metric for {ml_task_type}, expected one of the following: {all_evaluation_metrics}"
logger.error(error_msg)
raise ValueError(error_msg)

return eval_metric
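A minimal usage sketch of the checker, assuming the config values shown above:

    from flexml.helpers import eval_metric_checker

    eval_metric_checker("Regression")                  # 'R2'  (falls back to the default metric)
    eval_metric_checker("Regression", "mae")           # 'MAE' (case-insensitive)
    eval_metric_checker("Classification", "accuracy")  # 'Accuracy'
    eval_metric_checker("Classification", "rmse")      # raises ValueError: not valid for Classification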