Transcribe model from notebook to Python script
María Paz Cuturi authored and María Paz Cuturi committed Jun 27, 2024
1 parent f5b2fdb commit efa8aec
Showing 3 changed files with 104 additions and 8 deletions.
104 changes: 98 additions & 6 deletions challenge/model.py
@@ -1,19 +1,70 @@
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from typing import Tuple, Union, List

from sklearn.model_selection import train_test_split


THRESHOLD_IN_MINUTES = 15
MODEL_FILE_NAME = "delay_model.pkl"


class DelayModel:

def __init__(
self
):
self._model = None # Model should be saved in this attribute.
self._features = [
"OPERA_Latin American Wings",
"MES_7",
"MES_10",
"OPERA_Grupo LATAM",
"MES_12",
"TIPOVUELO_I",
"MES_4",
"MES_11",
"OPERA_Sky Airline",
"OPERA_Copa Air"
]
self._model = self.__load_model(MODEL_FILE_NAME)

def __load_model(self, file_name):
try:
with open(file_name, 'rb') as fp:
return pickle.load(fp)
except FileNotFoundError:
return None

def __save_model(self, filename):
with open(filename, 'wb') as fp:
pickle.dump(self._model, fp)

def get_min_diff(self, data):
"""
Calculate the minute difference between two datetime values.
Args:
            data (pd.Series): row of raw data containing 'Fecha-O' and 'Fecha-I'.
Returns:
float: Minute difference between 'Fecha-O' and 'Fecha-I'.
"""

fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
return min_diff

def preprocess(
self,
data: pd.DataFrame,
target_column: str = None
    ) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
"""
        Prepare raw data for training or prediction.
@@ -26,7 +77,28 @@ def preprocess(
or
pd.DataFrame: features.
"""

features = pd.concat([
pd.get_dummies(data['OPERA'], prefix='OPERA'),
pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'),
pd.get_dummies(data['MES'], prefix='MES')],
axis=1
)

        # Ensure every expected one-hot column exists, even if its
        # category is absent from this batch of data.
        for feature in self._features:
            if feature not in features.columns:
                features[feature] = 0

if target_column:
data['min_diff'] = data.apply(self.get_min_diff, axis=1)

data[target_column] = np.where(
data['min_diff'] > THRESHOLD_IN_MINUTES, 1, 0
)

return features[self._features], data[[target_column]]
else:
return features[self._features]

def fit(
self,
@@ -40,7 +112,21 @@ def fit(
features (pd.DataFrame): preprocessed data.
target (pd.DataFrame): target.
"""

x_train, _, y_train, _ = train_test_split(
features, target, test_size=0.33, random_state=42
)

        # Balance classes: weight the positive (delayed) class by the
        # negative/positive ratio.
        n_y0 = int((target == 0).sum())
        n_y1 = int((target == 1).sum())
        scale = n_y0 / n_y1

self._model = xgb.XGBClassifier(
random_state=1, learning_rate=0.01, scale_pos_weight=scale
)

self._model.fit(x_train, y_train)
self.__save_model(MODEL_FILE_NAME)

def predict(
self,
@@ -51,8 +137,14 @@
Args:
features (pd.DataFrame): preprocessed data.
Returns:
(List[int]): predicted targets.
"""

        if self._model is None:
            self._model = self.__load_model(MODEL_FILE_NAME)

predictions = self._model.predict(features)

return predictions.tolist()
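
For reference, a minimal usage sketch of the transcribed `DelayModel` (a hedged example, not part of the commit: the `"delay"` target column name and the CSV path are assumptions based on the test suite's conventions):

```python
import pandas as pd

from challenge.model import DelayModel

# Assumed path, matching the fixed path in tests/model/test_model.py.
data = pd.read_csv("./data/data.csv")

model = DelayModel()

# Training: with a target_column, preprocess returns (features, target).
features, target = model.preprocess(data, target_column="delay")
model.fit(features, target)  # also persists the model to delay_model.pkl

# Inference: without a target_column, preprocess returns only the features.
features = model.preprocess(data)
predictions = model.predict(features)  # List[int] of 0/1 delay labels
```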
6 changes: 5 additions & 1 deletion docs/challenge.md
@@ -17,4 +17,8 @@ When comparing the different models' performances, I want to focus on the positi
5. **Logistic Regression with Feature Importance and with Balance**: 0.36
6. **Logistic Regression with Feature Importance but without Balance**: 0.03

With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and is better suited to capturing non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large ones.


# Bugs fixed in `test_model.py`
- Data could not be loaded because the relative path was wrong for tests run from the repository root. After changing `"../data/data.csv"` to `"./data/data.csv"`, it worked as expected.
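
As a side note, here is a hedged sketch of how the positive-class figures above could be reproduced (assuming they are recall/F1-style scores on the delayed class from scikit-learn's `classification_report`; the split mirrors the one inside `DelayModel.fit`):

```python
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from challenge.model import DelayModel

model = DelayModel()
data = pd.read_csv("./data/data.csv")
features, target = model.preprocess(data, target_column="delay")

# Same split parameters as DelayModel.fit, so this test set is held out.
_, x_test, _, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42
)

model.fit(features, target)
preds = model.predict(x_test)

# The "1" row reports precision/recall/F1 for the positive (delayed) class.
print(classification_report(y_test, preds))
```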
2 changes: 1 addition & 1 deletion tests/model/test_model.py
@@ -28,7 +28,7 @@ class TestModel(unittest.TestCase):
def setUp(self) -> None:
super().setUp()
self.model = DelayModel()
self.data = pd.read_csv(filepath_or_buffer="../data/data.csv")
self.data = pd.read_csv(filepath_or_buffer="./data/data.csv")


def test_model_preprocess_for_training(
