Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transcribe model from notebook to Python script #3

Merged
merged 1 commit into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 98 additions & 6 deletions challenge/model.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,70 @@
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from typing import Tuple, Union, List

from sklearn.model_selection import train_test_split


# Rows whose computed 'min_diff' exceeds this many minutes are labelled 1 by
# DelayModel.preprocess; every other row is labelled 0.
THRESHOLD_IN_MINUTES = 15
# File the fitted model is pickled to and restored from. It is a bare file
# name, so it resolves against the process working directory.
MODEL_FILE_NAME = "delay_model.pkl"


class DelayModel:

def __init__(self):
    """
    Set up the feature list and try to restore a previously saved model.
    """
    # Placeholder; replaced by the load attempt at the end of this method.
    self._model = None
    # Dummy-encoded columns handed to the model, in exactly this order.
    self._features = [
        "OPERA_Latin American Wings",
        "MES_7",
        "MES_10",
        "OPERA_Grupo LATAM",
        "MES_12",
        "TIPOVUELO_I",
        "MES_4",
        "MES_11",
        "OPERA_Sky Airline",
        "OPERA_Copa Air",
    ]
    # Remains None when no pickle exists at MODEL_FILE_NAME.
    self._model = self.__load_model(MODEL_FILE_NAME)

def __load_model(self, file_name):
    """
    Read a pickled model back from disk.

    Args:
        file_name (str): path of the pickle file to read.

    Returns:
        The unpickled object, or None when the file does not exist.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at
    # files this application wrote itself.
    try:
        with open(file_name, 'rb') as model_file:
            restored = pickle.load(model_file)
    except FileNotFoundError:
        # A missing file is not an error here; the caller receives None.
        restored = None
    return restored

def __save_model(self, filename):
    """
    Pickle the current model to disk.

    Args:
        filename (str): destination path; the file is opened in 'wb'
            mode, so any existing content is replaced.
    """
    with open(filename, 'wb') as model_file:
        pickle.dump(self._model, model_file)

def get_min_diff(self, data):
    """
    Return how many minutes 'Fecha-O' lies after 'Fecha-I' for one record.

    Args:
        data: a single record (preprocess passes one DataFrame row at a
            time) whose 'Fecha-O' and 'Fecha-I' entries are strings in
            'YYYY-MM-DD HH:MM:SS' form.

    Returns:
        float: 'Fecha-O' minus 'Fecha-I' in minutes; negative when
        'Fecha-O' is the earlier of the two.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    # 'Fecha-O' is parsed first, so a malformed value there is reported
    # before one in 'Fecha-I'.
    o_time, i_time = (
        datetime.strptime(data[column], timestamp_format)
        for column in ('Fecha-O', 'Fecha-I')
    )
    elapsed = o_time - i_time
    return elapsed.total_seconds() / 60

def preprocess(
    self,
    data: pd.DataFrame,
    target_column: str = None
) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
    """
    Prepare raw data for training or predict.

    One-hot encodes 'OPERA', 'TIPOVUELO' and 'MES', then keeps exactly the
    columns listed in self._features, in that order. The input frame is
    not modified.

    Args:
        data (pd.DataFrame): raw data. Must contain 'OPERA', 'TIPOVUELO'
            and 'MES'; 'Fecha-O' and 'Fecha-I' are also read when
            target_column is given.
        target_column (str, optional): if set, the target is computed and
            returned in a one-column frame with this name.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: features and target.
        or
        pd.DataFrame: features.
    """
    encoded = pd.concat(
        [
            pd.get_dummies(data['OPERA'], prefix='OPERA'),
            pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'),
            pd.get_dummies(data['MES'], prefix='MES'),
        ],
        axis=1,
    )
    # Categories missing from this batch yield no dummy column; reindex
    # adds them as all-zero columns and fixes the column order in one step.
    features = encoded.reindex(columns=self._features, fill_value=0)

    if not target_column:
        return features

    # Build the target in fresh objects so the caller's frame does not
    # gain 'min_diff' or target columns as a side effect.
    min_diff = data.apply(self.get_min_diff, axis=1)
    target = pd.DataFrame(
        {target_column: np.where(min_diff > THRESHOLD_IN_MINUTES, 1, 0)},
        index=data.index,
    )
    return features, target

def fit(
    self,
    features: pd.DataFrame,
    target: pd.DataFrame
) -> None:
    """
    Fit model with preprocessed data.

    Trains an XGBoost classifier on the 67% training split of the data,
    weighting the positive class by the negative/positive ratio of the
    whole target, then pickles the fitted model to MODEL_FILE_NAME.

    Args:
        features (pd.DataFrame): preprocessed data.
        target (pd.DataFrame): target.
    """
    x_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.33, random_state=42
    )

    # Flatten to a 1-D array so the counts are plain scalars; calling
    # int() on the one-element Series a DataFrame sum returns is deprecated.
    labels = np.asarray(target).ravel()
    n_y0 = int((labels == 0).sum())
    n_y1 = int((labels == 1).sum())
    # The ratio is undefined without positive rows; 1.0 leaves the classes
    # unweighted instead of raising ZeroDivisionError.
    scale = n_y0 / n_y1 if n_y1 else 1.0

    self._model = xgb.XGBClassifier(
        random_state=1, learning_rate=0.01, scale_pos_weight=scale
    )

    self._model.fit(x_train, y_train)
    self.__save_model(MODEL_FILE_NAME)

def predict(
    self,
    features: pd.DataFrame
) -> List[int]:
    """
    Predict targets for preprocessed features.

    If no model is held in memory, one load from MODEL_FILE_NAME is
    attempted before predicting.

    Args:
        features (pd.DataFrame): preprocessed data.

    Returns:
        (List[int]): predicted targets.

    Raises:
        RuntimeError: if no model is in memory and none could be loaded.
    """
    if self._model is None:
        # The loader returns the model rather than storing it, so the
        # result has to be assigned here.
        self._model = self.__load_model(MODEL_FILE_NAME)

    if self._model is None:
        raise RuntimeError(
            "No trained model available: call fit() first or provide "
            "a saved model file."
        )

    predictions = self._model.predict(features)

    return predictions.tolist()
6 changes: 5 additions & 1 deletion docs/challenge.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ When comparing the different models' performances, I want to focus on the positi
5. **Logistic Regression with Feature Importance and with Balance**: 0.36
6. **Logistic Regression with Feature Importance but without Balance**: 0.03

With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features.
With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data, and it is better suited to capturing non-linear patterns and interactions between features. In addition, XGBoost scales well and is effective on large datasets, whereas Logistic Regression may struggle with very large datasets.


# Bugs fixed in `test_model.py`
- The data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"`, the data loaded as expected.
2 changes: 1 addition & 1 deletion tests/model/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class TestModel(unittest.TestCase):
def setUp(self) -> None:
    """
    Create a fresh DelayModel and load the raw dataset before each test.
    """
    # Imported locally so this method does not rely on a module-level import.
    from pathlib import Path

    super().setUp()
    self.model = DelayModel()
    # Anchor the dataset path to this file instead of the working
    # directory, so the tests load it no matter where they are launched.
    # Assumes data/ sits at the repository root, two levels above
    # tests/model/ - TODO confirm.
    data_path = Path(__file__).resolve().parents[2] / "data" / "data.csv"
    self.data = pd.read_csv(filepath_or_buffer=data_path)


def test_model_preprocess_for_training(
Expand Down