From efa8aecdff239a77fb4060b143381200826c9917 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?=
Date: Thu, 27 Jun 2024 09:58:27 -0300
Subject: [PATCH] Transcribe model from notebook to Python script

---
 challenge/model.py        | 104 +++++++++++++++++++++++++++++++++++---
 docs/challenge.md         |   6 ++-
 tests/model/test_model.py |   2 +-
 3 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/challenge/model.py b/challenge/model.py
index 173ac6c..1143620 100644
--- a/challenge/model.py
+++ b/challenge/model.py
@@ -1,19 +1,70 @@
+import pickle
+
+import numpy as np
 import pandas as pd
+import xgboost as xgb
 
+from datetime import datetime
 from typing import Tuple, Union, List
+from sklearn.model_selection import train_test_split
+
+
+THRESHOLD_IN_MINUTES = 15  # min_diff above this is labelled as a delay
+MODEL_FILE_NAME = "delay_model.pkl"  # pickle file used by load/save
+
+
 
 class DelayModel:
 
     def __init__(
         self
     ):
-        self._model = None # Model should be saved in this attribute.
+        self._features = [
+            "OPERA_Latin American Wings",
+            "MES_7",
+            "MES_10",
+            "OPERA_Grupo LATAM",
+            "MES_12",
+            "TIPOVUELO_I",
+            "MES_4",
+            "MES_11",
+            "OPERA_Sky Airline",
+            "OPERA_Copa Air"
+        ]
+        self._model = self.__load_model(MODEL_FILE_NAME)
+
+    def __load_model(self, file_name):
+        try:
+            with open(file_name, 'rb') as fp:
+                return pickle.load(fp)  # NOTE: trusted files only
+        except FileNotFoundError:
+            return None
+
+    def __save_model(self, filename):
+        with open(filename, 'wb') as fp:
+            pickle.dump(self._model, fp)
+
+    def get_min_diff(self, data):
+        """
+        Calculate the minute difference between two datetime values.
+
+        Args:
+            data (pd.Series): one row of the raw data (see `preprocess`).
+
+        Returns:
+            float: Minute difference between 'Fecha-O' and 'Fecha-I'.
+        """
+
+        fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
+        fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
+        min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
+        return min_diff
 
     def preprocess(
         self,
         data: pd.DataFrame,
         target_column: str = None
-    ) -> Union(Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame):
+    ) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
         """
         Prepare raw data for training or predict.
 
@@ -26,7 +77,28 @@ def preprocess(
             or
             pd.DataFrame: features.
         """
-        return
+
+        features = pd.concat([
+            pd.get_dummies(data['OPERA'], prefix='OPERA'),
+            pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'),
+            pd.get_dummies(data['MES'], prefix='MES')],
+            axis=1
+        )
+
+        for feature in self._features:
+            if feature not in features.columns:
+                features[feature] = 0
+
+        if target_column:
+            data['min_diff'] = data.apply(self.get_min_diff, axis=1)
+            # NOTE: the lines above/below write into the caller's `data`.
+            data[target_column] = np.where(
+                data['min_diff'] > THRESHOLD_IN_MINUTES, 1, 0
+            )
+
+            return features[self._features], data[[target_column]]
+        else:
+            return features[self._features]
 
     def fit(
         self,
@@ -40,7 +112,21 @@ def fit(
             features (pd.DataFrame): preprocessed data.
             target (pd.DataFrame): target.
         """
-        return
+
+        x_train, _, y_train, _ = train_test_split(
+            features, target, test_size=0.33, random_state=42
+        )
+        # Class-balance weight: negatives / positives (1.0 if no positives).
+        n_y0 = int((target == 0).to_numpy().sum())
+        n_y1 = int((target == 1).to_numpy().sum())
+        scale = n_y0 / n_y1 if n_y1 else 1.0
+
+        self._model = xgb.XGBClassifier(
+            random_state=1, learning_rate=0.01, scale_pos_weight=scale
+        )
+
+        self._model.fit(x_train, y_train)
+        self.__save_model(MODEL_FILE_NAME)
 
     def predict(
         self,
@@ -51,8 +137,14 @@ def predict(
 
         Args:
             features (pd.DataFrame): preprocessed data.
-        
+
         Returns:
             (List[int]): predicted targets.
         """
-        return
\ No newline at end of file
+        # Fall back to the persisted model when none is held in memory.
+        if self._model is None:
+            self._model = self.__load_model(MODEL_FILE_NAME)
+
+        predictions = self._model.predict(features)
+
+        return predictions.tolist()
diff --git a/docs/challenge.md b/docs/challenge.md
index ddb64a3..a4b0a45 100644
--- a/docs/challenge.md
+++ b/docs/challenge.md
@@ -17,4 +17,8 @@ When comparing the different models' performances, I want to focus on the positi
 5. **Logistic Regression with Feature Importante and with Balance**: 0.36
 6. **Logistic Regression with Feature Importante but without Balance**: 0.03
 
-With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets.
\ No newline at end of file
+With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets.
+
+
+# Bugs fixed in `test_model.py`
+- Data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"`, it worked as expected.
diff --git a/tests/model/test_model.py b/tests/model/test_model.py index e4afabb..97b28e1 100644 --- a/tests/model/test_model.py +++ b/tests/model/test_model.py @@ -28,7 +28,7 @@ class TestModel(unittest.TestCase): def setUp(self) -> None: super().setUp() self.model = DelayModel() - self.data = pd.read_csv(filepath_or_buffer="../data/data.csv") + self.data = pd.read_csv(filepath_or_buffer="./data/data.csv") def test_model_preprocess_for_training(