Float rounding is adjusted & case sensitivity in eval metrics is removed #12

Merged · 3 commits · Dec 10, 2024
118 changes: 70 additions & 48 deletions flexml/_model_tuner.py
@@ -14,6 +14,7 @@
f1_score)

from flexml.logger.logger import get_logger
from flexml.helpers import eval_metric_checker


class ModelTuner:
@@ -66,6 +67,25 @@ def __init__(self,

self.logger = get_logger(__name__, "PROD", logging_to_file)

@staticmethod
def __eval_metric_revieser(eval_metric: str) -> str:
"""
Scikit-learn based hyperparameter optimization methods (GridSearchCV & RandomizedSearchCV) require specific names for evaluation metrics

This method is used to revise the evaluation metric name for the optimization process

Parameters
----------
eval_metric : str
The evaluation metric

Returns
-------
str
The revised evaluation metric name, e.g. 'R2' to 'r2', 'Accuracy' to 'accuracy', 'F1 Score' to 'f1_weighted', etc.
"""
return eval_metric.lower() if eval_metric != 'F1 Score' else 'f1_weighted'
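For reference, a minimal standalone sketch of the same name revision (hypothetical helper name revise_eval_metric, not part of flexml), assuming 'F1 Score' -> 'f1_weighted' is the only special case:

    # Hypothetical standalone sketch of the revision above; not part of this PR.
    def revise_eval_metric(eval_metric: str) -> str:
        # GridSearchCV / RandomizedSearchCV expect lowercase scorer names; 'F1 Score' maps to the weighted F1 scorer
        return "f1_weighted" if eval_metric == "F1 Score" else eval_metric.lower()

    assert revise_eval_metric("R2") == "r2"
    assert revise_eval_metric("Accuracy") == "accuracy"
    assert revise_eval_metric("F1 Score") == "f1_weighted"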

def _param_grid_validator(self,
model_available_params: dict,
param_grid: dict) -> dict:
@@ -185,38 +205,38 @@ def _model_evaluator(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score
"""
eval_metric = eval_metric.lower()
eval_metric = eval_metric_checker(self.ml_problem_type, eval_metric)

if eval_metric == 'r2':
return r2_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'mae':
return mean_absolute_error(self.y_test, model.predict(self.X_test))
elif eval_metric == 'mse':
return mean_squared_error(self.y_test, model.predict(self.X_test))
elif eval_metric == 'accuracy':
return accuracy_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'precision':
return precision_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'recall':
return recall_score(self.y_test, model.predict(self.X_test))
elif eval_metric == 'f1':
return f1_score(self.y_test, model.predict(self.X_test))
if eval_metric == 'R2':
return round(r2_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'MAE':
return round(mean_absolute_error(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'MSE':
return round(mean_squared_error(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'Accuracy':
return round(accuracy_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'Precision':
return round(precision_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'Recall':
return round(recall_score(self.y_test, model.predict(self.X_test)), 6)
elif eval_metric == 'F1 Score':
return round(f1_score(self.y_test, model.predict(self.X_test)), 6)
else:
error_msg = "Error while evaluating the current model during the model tuning process. The eval_metric should be one of the following: 'r2', 'mae', 'mse', 'accuracy', 'precision', 'recall', 'f1'"
error_msg = "Error while evaluating the current model during the model tuning process. The eval_metric should be one of the following: 'R2', 'MAE', 'MSE', 'Accuracy', 'Precision', 'Recall', 'F1 Score'"
self.logger.error(error_msg)
raise ValueError(error_msg)

@@ -241,19 +261,19 @@ def grid_search(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score

cv : int (default=3)
The number of cross-validation splits. The default is 3.
@@ -291,10 +311,11 @@
"""
model_stats = self._setup_tuning("GridSearchCV", model, param_grid, n_iter=None, cv=cv, n_jobs=n_jobs)
param_grid = model_stats['tuning_param_grid']
scoring_eval_metric = self.__eval_metric_revieser(eval_metric)

try:
t_start = time()
search_result = GridSearchCV(model, param_grid, scoring=eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
search_result = GridSearchCV(model, param_grid, scoring=scoring_eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
t_end = time()
time_taken = round(t_end - t_start, 2)

@@ -330,19 +351,19 @@ def random_search(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score

n_iter : int, optional (default=10)
The number of trials. The default is 10.
@@ -374,10 +395,11 @@
"""
model_stats = self._setup_tuning("randomized_search", model, param_grid, n_iter=n_iter, cv=cv, n_jobs=n_jobs)
param_grid = model_stats['tuning_param_grid']
scoring_eval_metric = self.__eval_metric_revieser(eval_metric)

try:
t_start = time()
search_result = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter, scoring=eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
search_result = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter, scoring=scoring_eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train)
t_end = time()
time_taken = round(t_end - t_start, 2)

@@ -413,19 +435,19 @@ def optuna_search(self,
eval_metric : str
The evaluation metric that will be used to evaluate the model. It can be one of the following:

* 'r2' for R^2 score
* 'R2' for R^2 score

* 'mae' for Mean Absolute Error
* 'MAE' for Mean Absolute Error

* 'mse' for Mean Squared Error
* 'MSE' for Mean Squared Error

* 'accuracy' for Accuracy
* 'Accuracy' for Accuracy

* 'precision' for Precision
* 'Precision' for Precision

* 'recall' for Recall
* 'Recall' for Recall

* 'f1' for F1 score
* 'F1 Score' for F1 score

n_iter : int, optional (default=100)
The number of trials. The default is 100.
@@ -483,7 +505,7 @@
elif verbose == 4:
optuna.logging.set_verbosity(optuna.logging.DEBUG)

study_direction = "maximize" if eval_metric in ['r2', 'accuracy', 'precision', 'recall', 'f1'] else "minimize"
study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score'] else "minimize"

def objective(trial):
"""
@@ -526,7 +548,7 @@ def objective(trial):

# Update the best score and best hyperparameters if the current score is better than the best one
if model_stats['tuned_model_score'] is None or score > model_stats['tuned_model_score']:
model_stats['tuned_model_score'] = round(score, 4)
model_stats['tuned_model_score'] = round(score, 6)
model_stats['tuned_model'] = test_model

return score
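To show how the revised scoring name is consumed downstream, here is a minimal standalone sketch using plain scikit-learn (a toy dataset and hypothetical parameter grid, not the flexml API):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = load_iris(return_X_y=True)
    scoring = "f1_weighted"  # what the PR's reviser produces for 'F1 Score'
    search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0]},
                          scoring=scoring, cv=3).fit(X, y)
    print(round(search.best_score_, 6))  # scores are reported to 6 decimals, matching the PR's rounding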
8 changes: 4 additions & 4 deletions flexml/config/supervised_config.py
@@ -11,9 +11,9 @@

# Regression & Classification Evaluation Metrics
EVALUATION_METRICS = {
"Regression": {"DEFAULT": "r2",
"ALL": ["r2", "mae", "mse", "rmse"]},
"Regression": {"DEFAULT": "R2",
"ALL": ["R2", "MAE", "MSE", "RMSE"]},

"Classification": {"DEFAULT": "accuracy",
"ALL": ["accuracy", "precision", "recall", "f1_score"]}
"Classification": {"DEFAULT": "Accuracy",
"ALL": ["Accuracy", "Precision", "Recall", "F1 Score"]}
}
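For clarity, these entries are consumed by simple dictionary lookups, e.g.:

    from flexml.config.supervised_config import EVALUATION_METRICS

    EVALUATION_METRICS["Regression"]["DEFAULT"]    # 'R2'
    EVALUATION_METRICS["Classification"]["ALL"]    # ['Accuracy', 'Precision', 'Recall', 'F1 Score']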
1 change: 1 addition & 0 deletions flexml/helpers/__init__.py
@@ -0,0 +1 @@
from flexml.helpers.validators import eval_metric_checker
62 changes: 62 additions & 0 deletions flexml/helpers/validators.py
@@ -0,0 +1,62 @@
from typing import Optional, List
from flexml.config.supervised_config import EVALUATION_METRICS
from flexml.logger.logger import get_logger

def eval_metric_checker(ml_task_type: str,
eval_metric: Optional[str] = None,
all_evaluation_metrics: Optional[List[str]] = None,
default_evaluation_metric: Optional[str] = None) -> str:
"""
Since setting and validating the eval_metric is a common process for both Regression and Classification tasks,
this function is used to set and validate the evaluation metric.

Parameters
----------
ml_task_type : str
The type of ML task ('Regression' or 'Classification')

eval_metric : str, (default=None)
The evaluation metric to use for model evaluation

If passed as None, the default evaluation metric of the corresponding ml_task_type will be used

all_evaluation_metrics : List[str], (default=None)
All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE'] for Regression

If passed as None, they will be fetched from the config file

default_evaluation_metric : str, (default=None)
The default evaluation metric to use for the current task (Regression or Classification) e.g. 'R2' for Regression, 'Accuracy' for Classification

If passed as None, it will be fetched from the config file

Returns
-------
str
The evaluation metric to use for model evaluation for the current task (Regression or Classification)
"""
logger = get_logger(__name__, "PROD", False)

if default_evaluation_metric is None or all_evaluation_metrics is None:
default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"]
all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"]

if eval_metric is None:
return default_evaluation_metric

if not isinstance(eval_metric, str):
error_msg = f"eval_metric expected to be a string, got {type(eval_metric)}"
logger.error(error_msg)
raise TypeError(error_msg)

if ml_task_type == "Regression":
eval_metric = eval_metric.upper()
else:
# Title-case each word so inputs like 'f1 score' normalize to 'F1 Score' (capitalize() alone would miss multi-word names)
eval_metric = eval_metric.title()

if eval_metric not in all_evaluation_metrics:
error_msg = f"{eval_metric} is not a valid evaluation metric for {ml_task_type}, expected one of the following: {all_evaluation_metrics}"
logger.error(error_msg)
raise ValueError(error_msg)

return eval_metric
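A minimal usage sketch of the checker, assuming the config values shown above:

    from flexml.helpers import eval_metric_checker

    eval_metric_checker("Regression")                  # 'R2'  (falls back to the default metric)
    eval_metric_checker("Regression", "mae")           # 'MAE' (case-insensitive)
    eval_metric_checker("Classification", "accuracy")  # 'Accuracy'
    eval_metric_checker("Classification", "rmse")      # raises ValueError: not valid for Classification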