From 8e30c5a3635b04558d7db3934ecaaa6af0fb0dd2 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 7 Dec 2024 12:49:47 +0300 Subject: [PATCH 1/3] Float rounds are adjusted & case sensitivity in eval metrics is removed --- flexml/_model_tuner.py | 16 ++++++++-------- flexml/config/supervised_config.py | 8 ++++---- flexml/structures/supervised_base.py | 20 +++++++++++--------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py index 961a08a..f587f4b 100644 --- a/flexml/_model_tuner.py +++ b/flexml/_model_tuner.py @@ -202,19 +202,19 @@ def _model_evaluator(self, eval_metric = eval_metric.lower() if eval_metric == 'r2': - return r2_score(self.y_test, model.predict(self.X_test)) + return round(r2_score(self.y_test, model.predict(self.X_test)), 6) elif eval_metric == 'mae': - return mean_absolute_error(self.y_test, model.predict(self.X_test)) + return round(mean_absolute_error(self.y_test, model.predict(self.X_test)), 6) elif eval_metric == 'mse': - return mean_squared_error(self.y_test, model.predict(self.X_test)) + return round(mean_squared_error(self.y_test, model.predict(self.X_test)), 6) elif eval_metric == 'accuracy': - return accuracy_score(self.y_test, model.predict(self.X_test)) + return round(accuracy_score(self.y_test, model.predict(self.X_test)), 6) elif eval_metric == 'precision': - return precision_score(self.y_test, model.predict(self.X_test)) + return round(precision_score(self.y_test, model.predict(self.X_test)), 6) elif eval_metric == 'recall': - return recall_score(self.y_test, model.predict(self.X_test)) + return round(recall_score(self.y_test, model.predict(self.X_test)), 6) elif eval_metric == 'f1': - return f1_score(self.y_test, model.predict(self.X_test)) + return round(f1_score(self.y_test, model.predict(self.X_test)), 6) else: error_msg = "Error while evaluating the current model during the model tuning process. 
The eval_metric should be one of the following: 'r2', 'mae', 'mse', 'accuracy', 'precision', 'recall', 'f1'" self.logger.error(error_msg) @@ -526,7 +526,7 @@ def objective(trial): # Update the best score and best hyperparameters If the current score is better than the best one if model_stats['tuned_model_score'] is None or score > model_stats['tuned_model_score']: - model_stats['tuned_model_score'] = round(score, 4) + model_stats['tuned_model_score'] = round(score, 6) model_stats['tuned_model'] = test_model return score diff --git a/flexml/config/supervised_config.py b/flexml/config/supervised_config.py index 8ffea3a..45a7ac6 100644 --- a/flexml/config/supervised_config.py +++ b/flexml/config/supervised_config.py @@ -11,9 +11,9 @@ # Regression & Classification Evaluation Metrics EVALUATION_METRICS = { - "Regression": {"DEFAULT": "r2", - "ALL": ["r2", "mae", "mse", "rmse"]}, + "Regression": {"DEFAULT": "R2", + "ALL": ["R2", "MAE", "MSE", "RMSE"]}, - "Classification": {"DEFAULT": "accuracy", - "ALL": ["accuracy", "precision", "recall", "f1_score"]} + "Classification": {"DEFAULT": "Accuracy", + "ALL": ["Accuracy", "Precision", "Recall", "F1 Score"]} } diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index a87a24f..31b29af 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -188,7 +188,9 @@ def __eval_metric_checker(self, eval_metric: Optional[str] = None) -> str: self.__logger.error(error_msg) raise ValueError(error_msg) - if eval_metric not in self.__ALL_EVALUATION_METRICS: + if ((self.__ML_TASK_TYPE == "Classification" and eval_metric.lower().capitalize() not in self.__ALL_EVALUATION_METRICS) or + (self.__ML_TASK_TYPE == "Regression" and eval_metric.upper() not in self.__ALL_EVALUATION_METRICS)): + error_msg = f"{eval_metric} is not a valid evaluation metric for {self.__ML_TASK_TYPE}, expected one of the following: {self.__ALL_EVALUATION_METRICS}" self.__logger.error(error_msg) raise ValueError(error_msg) @@ -237,10 +239,10 @@ def __evaluate_model_perf(self, y_test, y_pred): """ if self.__ML_TASK_TYPE == "Regression": - r2 = round(r2_score(y_test, y_pred), 4) - mae = round(mean_absolute_error(y_test, y_pred), 4) - mse = round(mean_squared_error(y_test, y_pred), 4) - rmse = round(np.sqrt(mse), 4) + r2 = round(r2_score(y_test, y_pred), 6) + mae = round(mean_absolute_error(y_test, y_pred), 6) + mse = round(mean_squared_error(y_test, y_pred), 6) + rmse = round(np.sqrt(mse), 6) return { "r2": r2, "mae": mae, @@ -249,10 +251,10 @@ def __evaluate_model_perf(self, y_test, y_pred): } elif self.__ML_TASK_TYPE == "Classification": - accuracy = round(accuracy_score(y_test, y_pred), 4) - precision = round(precision_score(y_test, y_pred, average='weighted'), 4) - recall = round(recall_score(y_test, y_pred, average='weighted'), 4) - f1 = round(f1_score(y_test, y_pred, average='weighted'), 4) + accuracy = round(accuracy_score(y_test, y_pred), 6) + precision = round(precision_score(y_test, y_pred, average='weighted'), 6) + recall = round(recall_score(y_test, y_pred, average='weighted'), 6) + f1 = round(f1_score(y_test, y_pred, average='weighted'), 6) return { "accuracy": accuracy, "precision": precision, From b894e5e96a79ffdd084ed6fb3572efdf0fcf1d3f Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Tue, 10 Dec 2024 20:30:43 +0300 Subject: [PATCH 2/3] Error fix for #8e30c5a commit * Since custom metrics are needed for model tuning processes, custom eval_metric_reveiser function is developed * eval_metric_checker at 
SupervisedBase is moved to general validator module so that both model tuner and SupervisedBase can use it --- flexml/_model_tuner.py | 102 ++++++++++++++++----------- flexml/helpers/__init__.py | 1 + flexml/helpers/validators.py | 62 ++++++++++++++++ flexml/structures/supervised_base.py | 89 ++++++++--------------- 4 files changed, 154 insertions(+), 100 deletions(-) create mode 100644 flexml/helpers/__init__.py create mode 100644 flexml/helpers/validators.py diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py index f587f4b..be01339 100644 --- a/flexml/_model_tuner.py +++ b/flexml/_model_tuner.py @@ -14,6 +14,7 @@ f1_score) from flexml.logger.logger import get_logger +from flexml.helpers import eval_metric_checker class ModelTuner: @@ -66,6 +67,25 @@ def __init__(self, self.logger = get_logger(__name__, "PROD", logging_to_file) + @staticmethod + def __eval_metric_revieser(eval_metric: str) -> str: + """ + Scikit-learn based hyperparameter optimization methods (GridSearch & Randomized Search) require spesific namings for evaluation metrics + + This method is used to revise the evaluation metric name for the optimization process + + Parameters + ---------- + eval_metric : str + The evaluation metric + + Returns + ------- + str + The revised evaluation metric name. e.g. 'R2' to 'r2, 'Accuracy' to 'accuracy', 'F1 Score' to 'f1_weighted' etc. + """ + return eval_metric.lower() if eval_metric != 'F1 Score' else 'f1_weighted' + def _param_grid_validator(self, model_available_params: dict, param_grid: dict) -> dict: @@ -185,38 +205,38 @@ def _model_evaluator(self, eval_metric : str The evaluation metric that will be used to evaluate the model. It can be one of the following: - * 'r2' for R^2 score + * 'R2' for R^2 score - * 'mae' for Mean Absolute Error + * 'MAE' for Mean Absolute Error - * 'mse' for Mean Squared Error + * 'MSE' for Mean Squared Error - * 'accuracy' for Accuracy + * 'Accuracy' for Accuracy - * 'precision' for Precision + * 'Precision' for Precision - * 'recall' for Recall + * 'Recall' for Recall - * 'f1' for F1 score + * 'F1 Score' for F1 score """ - eval_metric = eval_metric.lower() + eval_metric = eval_metric_checker(self.ml_problem_type, eval_metric) - if eval_metric == 'r2': + if eval_metric == 'R2': return round(r2_score(self.y_test, model.predict(self.X_test)), 6) - elif eval_metric == 'mae': + elif eval_metric == 'MAE': return round(mean_absolute_error(self.y_test, model.predict(self.X_test)), 6) - elif eval_metric == 'mse': + elif eval_metric == 'MSE': return round(mean_squared_error(self.y_test, model.predict(self.X_test)), 6) - elif eval_metric == 'accuracy': + elif eval_metric == 'Accuracy': return round(accuracy_score(self.y_test, model.predict(self.X_test)), 6) - elif eval_metric == 'precision': + elif eval_metric == 'Precision': return round(precision_score(self.y_test, model.predict(self.X_test)), 6) - elif eval_metric == 'recall': + elif eval_metric == 'Recall': return round(recall_score(self.y_test, model.predict(self.X_test)), 6) - elif eval_metric == 'f1': + elif eval_metric == 'F1 Score': return round(f1_score(self.y_test, model.predict(self.X_test)), 6) else: - error_msg = "Error while evaluating the current model during the model tuning process. The eval_metric should be one of the following: 'r2', 'mae', 'mse', 'accuracy', 'precision', 'recall', 'f1'" + error_msg = "Error while evaluating the current model during the model tuning process. 
The eval_metric should be one of the following: 'R2', 'MAE', 'MSE', 'Accuracy', 'Precision', 'Recall', 'F1 Score'" self.logger.error(error_msg) raise ValueError(error_msg) @@ -241,19 +261,19 @@ def grid_search(self, eval_metric : str The evaluation metric that will be used to evaluate the model. It can be one of the following: - * 'r2' for R^2 score + * 'R2' for R^2 score - * 'mae' for Mean Absolute Error + * 'MAE' for Mean Absolute Error - * 'mse' for Mean Squared Error + * 'MSE' for Mean Squared Error - * 'accuracy' for Accuracy + * 'Accuracy' for Accuracy - * 'precision' for Precision + * 'Precision' for Precision - * 'recall' for Recall + * 'Recall' for Recall - * 'f1' for F1 score + * 'F1 Score' for F1 score cv : int (default=3) The number of cross-validation splits. The default is 3. @@ -291,10 +311,11 @@ def grid_search(self, """ model_stats = self._setup_tuning("GridSearchCV", model, param_grid, n_iter=None, cv=cv, n_jobs=n_jobs) param_grid = model_stats['tuning_param_grid'] + scoring_eval_metric = self.__eval_metric_revieser(eval_metric) try: t_start = time() - search_result = GridSearchCV(model, param_grid, scoring=eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train) + search_result = GridSearchCV(model, param_grid, scoring=scoring_eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train) t_end = time() time_taken = round(t_end - t_start, 2) @@ -330,19 +351,19 @@ def random_search(self, eval_metric : str The evaluation metric that will be used to evaluate the model. It can be one of the following: - * 'r2' for R^2 score + * 'R2' for R^2 score - * 'mae' for Mean Absolute Error + * 'MAE' for Mean Absolute Error - * 'mse' for Mean Squared Error + * 'MSE' for Mean Squared Error - * 'accuracy' for Accuracy + * 'Accuracy' for Accuracy - * 'precision' for Precision + * 'Precision' for Precision - * 'recall' for Recall + * 'Recall' for Recall - * 'f1' for F1 score + * 'F1 Score' for F1 score n_iter : int, optional (default=10) The number of trials. The default is 10. @@ -374,10 +395,11 @@ def random_search(self, """ model_stats = self._setup_tuning("randomized_search", model, param_grid, n_iter=n_iter, cv=cv, n_jobs=n_jobs) param_grid = model_stats['tuning_param_grid'] + scoring_eval_metric = self.__eval_metric_revieser(eval_metric) try: t_start = time() - search_result = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter, scoring=eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train) + search_result = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter, scoring=scoring_eval_metric, cv=cv, n_jobs=n_jobs, verbose=verbose).fit(self.X_train, self.y_train) t_end = time() time_taken = round(t_end - t_start, 2) @@ -413,19 +435,19 @@ def optuna_search(self, eval_metric : str The evaluation metric that will be used to evaluate the model. It can be one of the following: - * 'r2' for R^2 score + * 'R2' for R^2 score - * 'mae' for Mean Absolute Error + * 'MAE' for Mean Absolute Error - * 'mse' for Mean Squared Error + * 'MSE' for Mean Squared Error - * 'accuracy' for Accuracy + * 'Accuracy' for Accuracy - * 'precision' for Precision + * 'Precision' for Precision - * 'recall' for Recall + * 'Recall' for Recall - * 'f1' for F1 score + * 'F1 Score' for F1 score n_iter : int, optional (default=100) The number of trials. The default is 100. 
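For reference, the scorer-name revision that __eval_metric_revieser performs before handing eval_metric to GridSearchCV/RandomizedSearchCV amounts to the small mapping sketched below. This is a standalone illustration, not flexml code: the LogisticRegression estimator and its parameter grid are placeholders, and only classification scorer names ('accuracy', 'f1_weighted') are exercised since those are standard scikit-learn scoring strings.

    # Standalone sketch of the name revision described in __eval_metric_revieser's docstring.
    # The estimator and parameter grid are illustrative placeholders, not flexml internals.
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    def revise_eval_metric(eval_metric: str) -> str:
        # 'F1 Score' maps to scikit-learn's 'f1_weighted'; everything else is lower-cased,
        # e.g. 'R2' -> 'r2', 'Accuracy' -> 'accuracy'.
        return "f1_weighted" if eval_metric == "F1 Score" else eval_metric.lower()

    X, y = make_classification(n_samples=200, random_state=42)
    scoring = revise_eval_metric("F1 Score")  # -> 'f1_weighted'
    search = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={"C": [0.1, 1.0, 10.0]},
        scoring=scoring,
        cv=3,
    ).fit(X, y)
    print(scoring, round(search.best_score_, 6))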
@@ -483,7 +505,7 @@ def optuna_search(self, elif verbose == 4: optuna.logging.set_verbosity(optuna.logging.DEBUG) - study_direction = "maximize" if eval_metric in ['r2', 'accuracy', 'precision', 'recall', 'f1'] else "minimize" + study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score'] else "minimize" def objective(trial): """ diff --git a/flexml/helpers/__init__.py b/flexml/helpers/__init__.py new file mode 100644 index 0000000..98f5192 --- /dev/null +++ b/flexml/helpers/__init__.py @@ -0,0 +1 @@ +from flexml.helpers.validators import eval_metric_checker \ No newline at end of file diff --git a/flexml/helpers/validators.py b/flexml/helpers/validators.py new file mode 100644 index 0000000..5d51e53 --- /dev/null +++ b/flexml/helpers/validators.py @@ -0,0 +1,62 @@ +from typing import Optional, List +from flexml.config.supervised_config import EVALUATION_METRICS +from flexml.logger.logger import get_logger + +def eval_metric_checker(ml_task_type: str, + eval_metric: Optional[str] = None, + all_evaluation_metrics: Optional[List[str]] = None, + default_evaluation_metric: Optional[str] = None) -> str: + """ + Since eval_metric setting and validation is a common process for both Regression and Classification tasks... + this method is used to set and validate the evaluation metric. + + Parameters + ---------- + ml_task_type : str + The type of ML task ('Regression' or 'Classification') + + eval_metric : str, (default=None) + The evaluation metric to use for model evaluation + + If passed as None, the default evaluation metric of the corresponding ml_task_type will be used + + all_evaluation_metrics : List[str], (default=None) + All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE'] for Regression + + If passed as None, they will be fetched from the config file + + default_evaluation_metric : str, (default=None) + The default evaluation metric to use for the current task (Regression or Classification) e.g. 
'R2' for Regression, 'Accuracy' for Classification + + If passed as None, it will be fetched from the config file + + Returns + ------- + str + The evaluation metric to use for model evaluation for the current task (Regression or Classification) + """ + logger = get_logger(__name__, "PROD", False) + + if default_evaluation_metric is None or all_evaluation_metrics is None: + default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"] + all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"] + + if eval_metric is None: + return default_evaluation_metric + + if not isinstance(eval_metric, str): + error_msg = f"eval_metric expected to be a string, got {type(eval_metric)}" + logger.error(error_msg) + raise TypeError(error_msg) + + if ml_task_type == "Regression": + eval_metric = eval_metric.upper() + else: + eval_metric = eval_metric.lower().capitalize() + + if eval_metric not in all_evaluation_metrics: + error_msg = f"{eval_metric} is not a valid evaluation metric for {ml_task_type}, expected one of the following: {all_evaluation_metrics}" + logger.error(error_msg) + raise ValueError(error_msg) + + return eval_metric \ No newline at end of file diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 31b29af..5c88a8f 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -17,6 +17,7 @@ from flexml.config.supervised_config import ML_MODELS, EVALUATION_METRICS from flexml.logger.logger import get_logger +from flexml.helpers import eval_metric_checker from flexml._model_tuner import ModelTuner @@ -164,38 +165,6 @@ def __train_test_split(self, test_size: float, random_state: int) -> list[np.nda error_msg = f"An error occurred while splitting the data into train and test: {str(e)}" self.__logger.error(error_msg) raise ValueError(error_msg) - - def __eval_metric_checker(self, eval_metric: Optional[str] = None) -> str: - """ - Since eval_metric setting and validation is a common process for both Regression and Classification tasks... - this method is used to set and validate the evaluation metric. - - Parameters - ---------- - eval_metric : str - The evaluation metric to use for model evaluation. 
- - Returns - ------- - str - The evaluation metric to use for model evaluation for the current task (Regression or Classification) - """ - if eval_metric is None: # If the user passed nothing, the default evaluation metric will be used ('r2' for Regression, 'accuracy' for Classification) - return self.__DEFAULT_EVALUATION_METRIC - - if not isinstance(eval_metric, str): - error_msg = f"eval_metric expected to be a string, got {type(eval_metric)}" - self.__logger.error(error_msg) - raise ValueError(error_msg) - - if ((self.__ML_TASK_TYPE == "Classification" and eval_metric.lower().capitalize() not in self.__ALL_EVALUATION_METRICS) or - (self.__ML_TASK_TYPE == "Regression" and eval_metric.upper() not in self.__ALL_EVALUATION_METRICS)): - - error_msg = f"{eval_metric} is not a valid evaluation metric for {self.__ML_TASK_TYPE}, expected one of the following: {self.__ALL_EVALUATION_METRICS}" - self.__logger.error(error_msg) - raise ValueError(error_msg) - - return eval_metric def __top_n_models_checker(self, top_n_models: Optional[int]) -> int: """ @@ -233,9 +202,9 @@ def __evaluate_model_perf(self, y_test, y_pred): dict A dictionary containing the evaluation metric of the current task - * r2, mae, mse, rmse for Regression tasks + * R2, MAE, MSE, RMSE for Regression tasks - * accuracy, precision, recall, f1_score for Classification tasks + * Accuracy, Precision, Recall, F1 Score for Classification tasks """ if self.__ML_TASK_TYPE == "Regression": @@ -244,10 +213,10 @@ def __evaluate_model_perf(self, y_test, y_pred): mse = round(mean_squared_error(y_test, y_pred), 6) rmse = round(np.sqrt(mse), 6) return { - "r2": r2, - "mae": mae, - "mse": mse, - "rmse": rmse + "R2": r2, + "MAE": mae, + "MSE": mse, + "RMSE": rmse } elif self.__ML_TASK_TYPE == "Classification": @@ -256,10 +225,10 @@ def __evaluate_model_perf(self, y_test, y_pred): recall = round(recall_score(y_test, y_pred, average='weighted'), 6) f1 = round(f1_score(y_test, y_pred, average='weighted'), 6) return { - "accuracy": accuracy, - "precision": precision, - "recall": recall, - "f1_score": f1 + "Accuracy": accuracy, + "Precision": precision, + "Recall": recall, + "F1 Score": f1 } else: @@ -289,7 +258,7 @@ def start_experiment(self, test_size : float, (default=0.25) The size of the test data in the train-test split process. - eval_metric : str (default='r2' for Regression, 'accuracy' for Classification) + eval_metric : str (default='R2' for Regression, 'Accuracy' for Classification) The evaluation metric to use for model evaluation. random_state : int, (default=42) @@ -298,7 +267,7 @@ def start_experiment(self, For more info, visit https://scikit-learn.org/stable/glossary.html#term-random_state """ - self.eval_metric = self.__eval_metric_checker(eval_metric) + self.eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) self.experiment_size = experiment_size self.test_size = test_size self.random_state = random_state @@ -374,12 +343,12 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int = ---------- top_n_models : int The number of top models to select based on the evaluation metric. 
- eval_metric : str (default='r2 for Regression, 'accuracy' for Classification) + eval_metric : str (default='R2 for Regression, 'Accuracy' for Classification) The evaluation metric to use for model evaluation: - * r2, mae, mse, rmse for Regression tasks + * R2, MAE, MSE, RMSE for Regression tasks - * accuracy, precision, recall, f1_score for Classification tasks + * Accuracy, Precision, Recall, F1 Score for Classification tasks Returns ------- object or list[object] @@ -393,7 +362,7 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int = top_n_models = self.__top_n_models_checker(top_n_models) if eval_metric is not None: - eval_metric = self.__eval_metric_checker(eval_metric) + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) else: # If the user doesn't pass a eval_metric, get the evaluation metric passed to the start_experiment function eval_metric = self.eval_metric @@ -425,8 +394,8 @@ def __sort_models(self, eval_metric: Optional[str] = None): Parameters ---------- - eval_metric : str (default='r2') - The evaluation metric to use for model evaluation (e.g. 'r2', 'mae', 'mse', 'rmse') + eval_metric : str (default='R2') + The evaluation metric to use for model evaluation (e.g. 'R2', 'MAE', 'MSE', 'RMSE') Returns ------- @@ -438,10 +407,10 @@ def __sort_models(self, eval_metric: Optional[str] = None): self.__logger.error(error_msg) raise ValueError(error_msg) - eval_metric = self.__eval_metric_checker(eval_metric) + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) # Since lower is better for mae, mse and rmse in Regression tasks, they should be sorted in ascending order - if self.__ML_TASK_TYPE == "Regression" and eval_metric in ['mae', 'mse', 'rmse']: + if self.__ML_TASK_TYPE == "Regression" and eval_metric in ['MAE', 'MSE', 'RMSE']: return self.__model_stats_df.sort_values(by=eval_metric, ascending=True).reset_index(drop = True) else: return self.__model_stats_df.sort_values(by=eval_metric, ascending=False).reset_index(drop = True) @@ -452,11 +421,11 @@ def show_model_stats(self, eval_metric: Optional[str] = None): Parameters ---------- - eval_metric : str (default='r2' for regression, 'accuracy' for classification) + eval_metric : str (default='R2' for regression, 'Accuracy' for classification) The evaluation metric to use for model evaluation - * r2, mae, mse, rmse for Regression tasks - * accuracy, precision, recall, f1_score for Classification tasks + * R2, MAE, MSE, RMSE for Regression tasks + * Accuracy, Precision, Recall, F1 Score for Classification tasks """ def highlight_best(s: pd.Series) -> list[str]: """ @@ -478,7 +447,7 @@ def highlight_best(s: pd.Series) -> list[str]: is_best = s == s.max() return ['background-color: green' if v else '' for v in is_best] - eval_metric = self.__eval_metric_checker(eval_metric) + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) sorted_model_stats_df = self.__sort_models(eval_metric) sorted_model_stats_df['Time Taken (sec)'] = sorted_model_stats_df['Time Taken (sec)'].apply(lambda x: round(x, 2)) sorted_model_stats_df.index += 1 @@ -567,12 +536,12 @@ def tune_model(self, * 'optuna' for Optuna (https://optuna.readthedocs.io/en/stable/) - eval_metric : str (default='r2' for regression, 'accuracy' for classification) + eval_metric : str (default='R2' for regression, 'Accuracy' for classification) The evaluation metric to use for model evaluation - * r2, mae, mse, rmse for Regression tasks + * R2, MAE, MSE, RMSE for Regression tasks - * accuracy, precision, 
recall, f1_score for Classification tasks + * Accuracy, Precision, Recall, F1 Score for Classification tasks param_grid : dict (default = defined custom param dict in flexml/config/tune_model_config.py) The parameter set to use for model tuning. @@ -646,7 +615,7 @@ def _show_tuning_report(tuning_report: dict): self.get_best_models() # Update the self.__model_stats_df self.show_model_stats() - eval_metric = self.__eval_metric_checker(eval_metric) + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) # Create the ModelTuner object If It's not created before, avoid creating it everytime tune_model() function is called if not hasattr(self, 'model_tuner'): From c7c23c89ced95c5c18eabe115152850b0ac8d904 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Tue, 10 Dec 2024 21:15:00 +0300 Subject: [PATCH 3/3] Limited Scikit-learn to <=1.5.2 to avoid depcreated __sklearn_tags__ attribute error in XGBoost and LightGBM --- requirements-test.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index ca1c420..1fbabdf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,6 +1,6 @@ numpy>=1.21,<=1.26.4 pandas>=2.0.1 -scikit-learn>=1.5.0 +scikit-learn>=1.5.0,<=1.5.2 xgboost>=2.0.0 lightgbm>=4.0.0 catboost>=1.2.3 diff --git a/requirements.txt b/requirements.txt index 03ff2c3..d3f5591 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy>=1.21,<=1.26.4 pandas>=2.0.1 -scikit-learn>=1.5.0 +scikit-learn>=1.5.0,<=1.5.2 xgboost>=2.0.0 lightgbm>=4.0.0 catboost>=1.2.3
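For reference, the eval_metric_checker helper introduced in PATCH 2/3 (flexml/helpers/validators.py) can be exercised on its own: regression names are upper-cased, classification names are capitalized, None falls back to the task's default from EVALUATION_METRICS, and unsupported names raise a ValueError. A usage sketch, assuming a flexml install with these patches applied:

    # Usage sketch for the validator added in flexml/helpers/validators.py
    # (assumes flexml with these patches is installed).
    from flexml.helpers import eval_metric_checker

    print(eval_metric_checker("Regression", "rmse"))          # 'RMSE'      (upper-cased)
    print(eval_metric_checker("Classification", "ACCURACY"))  # 'Accuracy'  (capitalized)
    print(eval_metric_checker("Regression", None))            # 'R2'        (default from config)

    try:
        eval_metric_checker("Classification", "roc_auc")      # not in the supported metric list
    except ValueError as exc:
        print(exc)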
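The scikit-learn ceiling added in PATCH 3/3 works around the __sklearn_tags__ attribute errors that newer scikit-learn releases trigger in the XGBoost and LightGBM scikit-learn wrappers, as the commit message notes. If an equivalent check were wanted at runtime, a hypothetical guard mirroring the requirements pin could look like the sketch below (not part of flexml; assumes the 'packaging' package is available):

    # Hypothetical runtime guard mirroring the pin scikit-learn>=1.5.0,<=1.5.2; not part of flexml.
    import sklearn
    from packaging.version import Version

    installed = Version(sklearn.__version__)
    if not (Version("1.5.0") <= installed <= Version("1.5.2")):
        raise RuntimeError(
            f"Expected scikit-learn >=1.5.0,<=1.5.2, found {sklearn.__version__}"
        )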