tryolabs · pazcuturi · Jun 27, 2024 · Jun 27, 2024
diff --git a/challenge/model.py b/challenge/model.py
@@ -1,19 +1,70 @@
+import pickle
+
+import numpy as np
 import pandas as pd
+import xgboost as xgb
 
+from datetime import datetime
 from typing import Tuple, Union, List
 
+from sklearn.model_selection import train_test_split
+
+
+THRESHOLD_IN_MINUTES = 15
+MODEL_FILE_NAME = "delay_model.pkl"
+
+
 class DelayModel:
 
     def __init__(
         self
     ):
-        self._model = None # Model should be saved in this attribute.
+        self._features = [
+            "OPERA_Latin American Wings",
+            "MES_7",
+            "MES_10",
+            "OPERA_Grupo LATAM",
+            "MES_12",
+            "TIPOVUELO_I",
+            "MES_4",
+            "MES_11",
+            "OPERA_Sky Airline",
+            "OPERA_Copa Air"
+        ]
+        self._model = self.__load_model(MODEL_FILE_NAME)
+
+    def __load_model(self, file_name):
+        try:
+            with open(file_name, 'rb') as fp:
+                return pickle.load(fp)
+        except FileNotFoundError:
+            return None
+
+    def __save_model(self, filename):
+        with open(filename, 'wb') as fp:
+            pickle.dump(self._model, fp)
+
+    def get_min_diff(self, data):
+        """
+        Calculate the minute difference between two datetime values.
+
+        Args:
+            data (pd.DataFrame): raw data.
+
+        Returns:
+            float: Minute difference between 'Fecha-O' and 'Fecha-I'.
+        """
+
+        fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
+        fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
+        min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
+        return min_diff
 
     def preprocess(
         self,
         data: pd.DataFrame,
         target_column: str = None
-    ) -> Union(Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame):
+    ) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
         """
         Prepare raw data for training or predict.
 
@@ -26,7 +77,28 @@ def preprocess(
             or
             pd.DataFrame: features.
         """
-        return
+
+        features = pd.concat([
+            pd.get_dummies(data['OPERA'], prefix='OPERA'),
+            pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'),
+            pd.get_dummies(data['MES'], prefix='MES')],
+            axis=1
+        )
+
+        for feature in self._features:
+            if feature not in features.columns:
+                features[feature] = 0
+
+        if target_column:
+            data['min_diff'] = data.apply(self.get_min_diff, axis=1)
+
+            data[target_column] = np.where(
+                data['min_diff'] > THRESHOLD_IN_MINUTES, 1, 0
+            )
+
+            return features[self._features], data[[target_column]]
+        else:
+            return features[self._features]
 
     def fit(
         self,
@@ -40,7 +112,21 @@ def fit(
             features (pd.DataFrame): preprocessed data.
             target (pd.DataFrame): target.
         """
-        return
+
+        x_train, _, y_train, _ = train_test_split(
+            features, target, test_size=0.33, random_state=42
+        )
+
+        n_y0 = int((target == 0).sum())
+        n_y1 = int((target == 1).sum())
+        scale = n_y0 / n_y1
+
+        self._model = xgb.XGBClassifier(
+            random_state=1, learning_rate=0.01, scale_pos_weight=scale
+        )
+
+        self._model.fit(x_train, y_train)
+        self.__save_model(MODEL_FILE_NAME)
 
     def predict(
         self,
@@ -51,8 +137,14 @@ def predict(
 
         Args:
             features (pd.DataFrame): preprocessed data.
-        
+
         Returns:
             (List[int]): predicted targets.
         """
-        return
+
+        if self._model is None:
+            self.__load_model(MODEL_FILE_NAME)
+
+        predictions = self._model.predict(features)
+
+        return predictions.tolist()
diff --git a/docs/challenge.md b/docs/challenge.md
@@ -16,4 +16,8 @@ When comparing the different models' performances, I want to focus on the positi
 5. **Logistic Regression with Feature Importante and with Balance**: 0.36
 6. **Logistic Regression with Feature Importante but without Balance**: 0.03
 
-With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features.
+With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets.
+
+
+# Bugs fixed on `test_model.py`
+- Data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"` it worked as expected.
diff --git a/tests/model/test_model.py b/tests/model/test_model.py
@@ -28,7 +28,7 @@ class TestModel(unittest.TestCase):
     def setUp(self) -> None:
         super().setUp()
         self.model = DelayModel()
-        self.data = pd.read_csv(filepath_or_buffer="../data/data.csv")
+        self.data = pd.read_csv(filepath_or_buffer="./data/data.csv")
 
 
     def test_model_preprocess_for_training(