Skip to content

Commit

Permalink
Merge pull request #3 from tryolabs/transcribe-model
Browse files Browse the repository at this point in the history
Transcribe model from notebook to Python script
  • Loading branch information
pazcuturi authored Jun 27, 2024
2 parents d3c81de + eb39a25 commit 9dd04cc
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 8 deletions.
104 changes: 98 additions & 6 deletions challenge/model.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,70 @@
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from typing import Tuple, Union, List

from sklearn.model_selection import train_test_split


# Delay cutoff in minutes: preprocess() labels a row 1 when its minute
# difference is strictly greater than this value, and 0 otherwise.
THRESHOLD_IN_MINUTES = 15
# Pickle file the model is loaded from and saved to. The name is relative,
# so it is resolved against the process's current working directory.
MODEL_FILE_NAME = "delay_model.pkl"


class DelayModel:

def __init__(self):
    """
    Define the feature schema and try to restore a previously saved model.
    """
    # The model lives in this attribute; it starts out empty.
    self._model = None

    # Dummy-encoded column names, in the exact order preprocess()
    # selects them.
    selected_columns = (
        "OPERA_Latin American Wings",
        "MES_7",
        "MES_10",
        "OPERA_Grupo LATAM",
        "MES_12",
        "TIPOVUELO_I",
        "MES_4",
        "MES_11",
        "OPERA_Sky Airline",
        "OPERA_Copa Air",
    )
    self._features = list(selected_columns)

    # Stays None when MODEL_FILE_NAME does not exist.
    self._model = self.__load_model(MODEL_FILE_NAME)

def __load_model(self, file_name):
    """
    Read a pickled object from disk.
    Args:
        file_name: path of the pickle file to open.
    Returns:
        The unpickled object, or None when the file does not exist.
    """
    restored = None
    try:
        # NOTE: unpickling can execute arbitrary code, so only trusted
        # files should ever be passed here.
        with open(file_name, 'rb') as stream:
            restored = pickle.load(stream)
    except FileNotFoundError:
        # A missing file is reported as None rather than as an error.
        restored = None
    return restored

def __save_model(self, filename):
    """
    Pickle the current model to disk, overwriting any existing file.
    Args:
        filename: path of the file to write.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(self._model, sink)

def get_min_diff(self, data):
    """
    Compute how many minutes 'Fecha-O' lies after 'Fecha-I' for one record.
    Args:
        data: a single record (e.g. a DataFrame row or a dict) whose
            'Fecha-O' and 'Fecha-I' entries are strings formatted as
            '%Y-%m-%d %H:%M:%S'.
    Returns:
        float: 'Fecha-O' minus 'Fecha-I' in minutes (negative when
        'Fecha-O' is the earlier of the two).
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    moment_o, moment_i = (
        datetime.strptime(data[key], timestamp_format)
        for key in ('Fecha-O', 'Fecha-I')
    )
    elapsed = moment_o - moment_i
    return elapsed.total_seconds() / 60

def preprocess(
    self,
    data: pd.DataFrame,
    target_column: Union[str, None] = None
) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
    """
    Prepare raw data for training or predict.

    The input frame is never modified.

    Args:
        data (pd.DataFrame): raw data. Must contain the 'OPERA',
            'TIPOVUELO' and 'MES' columns; when target_column is given it
            must also contain 'Fecha-O' and 'Fecha-I' as
            '%Y-%m-%d %H:%M:%S' strings.
        target_column (str, optional): name to give the generated target
            column. When omitted (or empty) only features are returned.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: features and target.
        or
        pd.DataFrame: features.
    """
    dummies = pd.concat(
        [
            pd.get_dummies(data[column], prefix=column)
            for column in ("OPERA", "TIPOVUELO", "MES")
        ],
        axis=1,
    )

    # One step does both jobs: keep only the model's columns in their fixed
    # order, and create any column absent from this batch filled with 0.
    features = dummies.reindex(columns=self._features, fill_value=0)

    if not target_column:
        return features

    # Vectorized minute difference between 'Fecha-O' and 'Fecha-I'; the
    # explicit format keeps parsing as strict as a strptime call.
    date_format = "%Y-%m-%d %H:%M:%S"
    min_diff = (
        pd.to_datetime(data["Fecha-O"], format=date_format)
        - pd.to_datetime(data["Fecha-I"], format=date_format)
    ).dt.total_seconds() / 60

    # Build the target as a new frame (aligned on data's index) instead of
    # writing helper columns into the caller's DataFrame.
    target = pd.DataFrame(
        {target_column: np.where(min_diff > THRESHOLD_IN_MINUTES, 1, 0)},
        index=data.index,
    )

    return features, target

def fit(
    self,
    features: pd.DataFrame,
    target: pd.DataFrame
) -> None:
    """
    Fit model with preprocessed data.

    The fitted classifier is stored in self._model and pickled to
    MODEL_FILE_NAME.

    Args:
        features (pd.DataFrame): preprocessed data.
        target (pd.DataFrame): target.
    """
    # Only the training part of the split is used; the held-out third is
    # discarded here.
    x_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.33, random_state=42
    )

    # Count the classes on a flat array: calling int() on the one-element
    # Series that DataFrame.sum() returns is deprecated in recent pandas.
    labels = np.asarray(target).ravel()
    n_y0 = int(np.count_nonzero(labels == 0))
    n_y1 = int(np.count_nonzero(labels == 1))

    # Weight positives by the negative/positive ratio. When a class is
    # absent the ratio is undefined (or zero), so fall back to 1.0, which
    # is XGBoost's default for scale_pos_weight.
    scale = n_y0 / n_y1 if n_y0 and n_y1 else 1.0

    self._model = xgb.XGBClassifier(
        random_state=1, learning_rate=0.01, scale_pos_weight=scale
    )

    self._model.fit(x_train, y_train)
    self.__save_model(MODEL_FILE_NAME)

def predict(
    self,
    features: pd.DataFrame
) -> List[int]:
    """
    Predict targets for preprocessed data.

    Args:
        features (pd.DataFrame): preprocessed data.

    Returns:
        (List[int]): predicted targets.

    Raises:
        NotFittedError: when no model is in memory and none can be loaded
            from MODEL_FILE_NAME. It subclasses both ValueError and
            AttributeError.
    """
    if self._model is None:
        # The loader returns the model rather than setting it, so the
        # result must be assigned for the lazy load to take effect.
        self._model = self.__load_model(MODEL_FILE_NAME)

    if self._model is None:
        # Imported here so the module's top-level imports stay untouched.
        from sklearn.exceptions import NotFittedError

        raise NotFittedError(
            "No trained model available: call fit() first or provide "
            "a saved model file."
        )

    predictions = self._model.predict(features)

    return predictions.tolist()
6 changes: 5 additions & 1 deletion docs/challenge.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,8 @@ When comparing the different models' performances, I want to focus on the positi
5. **Logistic Regression with Feature Importance and with Balance**: 0.36
6. **Logistic Regression with Feature Importance but without Balance**: 0.03

With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features.
With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data, and it is better suited to capturing non-linear patterns and interactions between features. In addition, XGBoost scales well to large datasets, whereas Logistic Regression may struggle with very large datasets.


# Bugs fixed on `test_model.py`
- Data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"` it worked as expected.
2 changes: 1 addition & 1 deletion tests/model/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class TestModel(unittest.TestCase):
def setUp(self) -> None:
    """Create a fresh model and load the dataset before each test."""
    # Local import: this file's import block is not touched by this change.
    from pathlib import Path

    super().setUp()
    self.model = DelayModel()
    # Anchor the path on this file (tests/model/test_model.py) rather than
    # on the current working directory, so the tests find
    # <repo>/data/data.csv from wherever they are launched.
    repo_root = Path(__file__).resolve().parents[2]
    self.data = pd.read_csv(
        filepath_or_buffer=repo_root / "data" / "data.csv"
    )


def test_model_preprocess_for_training(
Expand Down

0 comments on commit 9dd04cc

Please sign in to comment.