From cce585eeb34c8b2cf0a6d5e1ef8e9c11bef37184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Wed, 26 Jun 2024 10:12:16 -0300 Subject: [PATCH 1/9] Review notebook, fix bugs and pick best model --- challenge/exploration.ipynb | 149 ++++++++++++++++-------------------- docs/challenge.md | 20 +++++ requirements.txt | 3 +- 3 files changed, 89 insertions(+), 83 deletions(-) diff --git a/challenge/exploration.ipynb b/challenge/exploration.ipynb index d6d5f87..07a8af4 100644 --- a/challenge/exploration.ipynb +++ b/challenge/exploration.ipynb @@ -56,9 +56,9 @@ "outputs": [], "source": [ "flights_by_airline = data['OPERA'].value_counts()\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(flights_by_airline.index, flights_by_airline.values, alpha=0.9)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flights_by_airline.index, y=flights_by_airline.values, alpha=0.9)\n", "plt.title('Flights by Airline')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Airline', fontsize=12)\n", @@ -73,9 +73,9 @@ "outputs": [], "source": [ "flights_by_day = data['DIA'].value_counts()\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style = \"darkgrid\")\n", - "sns.barplot(flights_by_day.index, flights_by_day.values, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flights_by_day.index, y=flights_by_day.values, color='lightblue', alpha=0.8)\n", "plt.title('Flights by Day')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Day', fontsize=12)\n", @@ -90,9 +90,9 @@ "outputs": [], "source": [ "flights_by_month = data['MES'].value_counts()\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style = \"darkgrid\")\n", - "sns.barplot(flights_by_month.index, flights_by_month.values, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flights_by_month.index, y=flights_by_month.values, color='lightblue', alpha=0.8)\n", "plt.title('Flights by Month')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Month', fontsize=12)\n", @@ -125,9 +125,9 @@ " flights_by_day_in_week.values[6], \n", " flights_by_day_in_week.values[3]\n", "]\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(days, values_by_day, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=days, y=values_by_day, color='lightblue', alpha=0.8)\n", "plt.title('Flights by Day in Week')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Day in Week', fontsize=12)\n", @@ -142,9 +142,9 @@ "outputs": [], "source": [ "flights_by_type = data['TIPOVUELO'].value_counts()\n", - "sns.set(style=\"darkgrid\")\n", - "plt.figure(figsize = (10, 2))\n", - "sns.barplot(flights_by_type.index, flights_by_type.values, alpha=0.9)\n", + "sns.set_theme(style=\"darkgrid\")\n", + "plt.figure(figsize=(10, 2))\n", + "sns.barplot(x=flights_by_type.index, y=flights_by_type.values, alpha=0.9)\n", "plt.title('Flights by Type')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Type', fontsize=12)\n", @@ -158,9 +158,9 @@ "outputs": [], "source": [ "flight_by_destination = data['SIGLADES'].value_counts()\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(flight_by_destination.index, flight_by_destination.values, color = 'lightblue', alpha=0.8)\n", + 
"plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flight_by_destination.index, y=flight_by_destination.values, color='lightblue', alpha=0.8)\n", "plt.title('Flight by Destination')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Destination', fontsize=12)\n", @@ -243,14 +243,14 @@ "def is_high_season(fecha):\n", " fecha_año = int(fecha.split('-')[0])\n", " fecha = datetime.strptime(fecha, '%Y-%m-%d %H:%M:%S')\n", - " range1_min = datetime.strptime('15-Dec', '%d-%b').replace(year = fecha_año)\n", - " range1_max = datetime.strptime('31-Dec', '%d-%b').replace(year = fecha_año)\n", - " range2_min = datetime.strptime('1-Jan', '%d-%b').replace(year = fecha_año)\n", - " range2_max = datetime.strptime('3-Mar', '%d-%b').replace(year = fecha_año)\n", - " range3_min = datetime.strptime('15-Jul', '%d-%b').replace(year = fecha_año)\n", - " range3_max = datetime.strptime('31-Jul', '%d-%b').replace(year = fecha_año)\n", - " range4_min = datetime.strptime('11-Sep', '%d-%b').replace(year = fecha_año)\n", - " range4_max = datetime.strptime('30-Sep', '%d-%b').replace(year = fecha_año)\n", + " range1_min = datetime.strptime('15-Dec 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range1_max = datetime.strptime('31-Dec 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range2_min = datetime.strptime('1-Jan 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range2_max = datetime.strptime('3-Mar 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range3_min = datetime.strptime('15-Jul 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range3_max = datetime.strptime('31-Jul 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range4_min = datetime.strptime('11-Sep 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range4_max = datetime.strptime('30-Sep 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", " \n", " if ((fecha >= range1_min and fecha <= range1_max) or \n", " (fecha >= range2_min and fecha <= range2_max) or \n", @@ -297,7 +297,7 @@ "metadata": {}, "outputs": [], "source": [ - "data['min_diff'] = data.apply(get_min_diff, axis = 1)" + "data['min_diff'] = data.apply(get_min_diff, axis=1)" ] }, { @@ -362,11 +362,11 @@ " rates = {}\n", " for name, total in total.items():\n", " if name in delays:\n", - " rates[name] = round(total / delays[name], 2)\n", + " rates[name] = round(100 * delays[name] / total, 2)\n", " else:\n", " rates[name] = 0\n", " \n", - " return pd.DataFrame.from_dict(data = rates, orient = 'index', columns = ['Tasa (%)'])" + " return pd.DataFrame.from_dict(data=rates, orient='index', columns=['Tasa (%)'])" ] }, { @@ -377,9 +377,9 @@ "source": [ "destination_rate = get_rate_from_column(data, 'SIGLADES')\n", "destination_rate_values = data['SIGLADES'].value_counts().index\n", - "plt.figure(figsize = (20,5))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(destination_rate_values, destination_rate['Tasa (%)'], alpha = 0.75)\n", + "plt.figure(figsize=(20,5))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=destination_rate_values, y=destination_rate['Tasa (%)'], alpha=0.75)\n", "plt.title('Delay Rate by Destination')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('Destination', fontsize=12)\n", @@ -395,9 +395,9 @@ "source": [ "airlines_rate = get_rate_from_column(data, 'OPERA')\n", "airlines_rate_values = data['OPERA'].value_counts().index\n", - "plt.figure(figsize = (20,5))\n", - "sns.set(style=\"darkgrid\")\n", - 
"sns.barplot(airlines_rate_values, airlines_rate['Tasa (%)'], alpha = 0.75)\n", + "plt.figure(figsize=(20,5))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=airlines_rate_values, y=airlines_rate['Tasa (%)'], alpha=0.75)\n", "plt.title('Delay Rate by Airline')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('Airline', fontsize=12)\n", @@ -413,14 +413,13 @@ "source": [ "month_rate = get_rate_from_column(data, 'MES')\n", "month_rate_value = data['MES'].value_counts().index\n", - "plt.figure(figsize = (20,5))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(month_rate_value, month_rate['Tasa (%)'], color = 'blue', alpha = 0.75)\n", + "plt.figure(figsize=(20,5))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=month_rate_value, y=month_rate['Tasa (%)'], color='blue', alpha=0.75)\n", "plt.title('Delay Rate by Month')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('Month', fontsize=12)\n", "plt.xticks(rotation=90)\n", - "plt.ylim(0,10)\n", "plt.show()" ] }, @@ -433,14 +432,13 @@ "days_rate = get_rate_from_column(data, 'DIANOM')\n", "days_rate_value = data['DIANOM'].value_counts().index\n", "\n", - "sns.set(style=\"darkgrid\")\n", - "plt.figure(figsize = (20, 5))\n", - "sns.barplot(days_rate_value, days_rate['Tasa (%)'], color = 'blue', alpha = 0.75)\n", + "sns.set_theme(style=\"darkgrid\")\n", + "plt.figure(figsize=(20, 5))\n", + "sns.barplot(x=days_rate_value, y=days_rate['Tasa (%)'], color='blue', alpha=0.75)\n", "plt.title('Delay Rate by Day')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('Days', fontsize=12)\n", "plt.xticks(rotation=90)\n", - "plt.ylim(0,7)\n", "plt.show()" ] }, @@ -453,14 +451,13 @@ "high_season_rate = get_rate_from_column(data, 'high_season')\n", "high_season_rate_values = data['high_season'].value_counts().index\n", "\n", - "plt.figure(figsize = (5, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot([\"no\", \"yes\"], high_season_rate['Tasa (%)'])\n", + "plt.figure(figsize=(5, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=[\"no\", \"yes\"], y=high_season_rate['Tasa (%)'])\n", "plt.title('Delay Rate by Season')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('High Season', fontsize=12)\n", "plt.xticks(rotation=90)\n", - "plt.ylim(0,6)\n", "plt.show()" ] }, @@ -472,13 +469,12 @@ "source": [ "flight_type_rate = get_rate_from_column(data, 'TIPOVUELO')\n", "flight_type_rate_values = data['TIPOVUELO'].value_counts().index\n", - "plt.figure(figsize = (5, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(flight_type_rate_values, flight_type_rate['Tasa (%)'])\n", + "plt.figure(figsize=(5, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flight_type_rate_values, y=flight_type_rate['Tasa (%)'])\n", "plt.title('Delay Rate by Flight Type')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('Flight Type', fontsize=12)\n", - "plt.ylim(0,7)\n", "plt.show()" ] }, @@ -490,13 +486,12 @@ "source": [ "period_day_rate = get_rate_from_column(data, 'period_day')\n", "period_day_rate_values = data['period_day'].value_counts().index\n", - "plt.figure(figsize = (5, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(period_day_rate_values, period_day_rate['Tasa (%)'])\n", + "plt.figure(figsize=(5, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=period_day_rate_values, y=period_day_rate['Tasa (%)'])\n", "plt.title('Delay Rate by Period of Day')\n", "plt.ylabel('Delay Rate [%]', fontsize=12)\n", "plt.xlabel('Period', 
fontsize=12)\n", - "plt.ylim(3,7)\n", "plt.show()" ] }, @@ -523,19 +518,9 @@ "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", - "from sklearn.utils import shuffle\n", "from sklearn.metrics import confusion_matrix, classification_report" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data = shuffle(data[['OPERA', 'MES', 'TIPOVUELO', 'SIGLADES', 'DIANOM', 'delay']], random_state = 111)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -543,10 +528,10 @@ "outputs": [], "source": [ "features = pd.concat([\n", - " pd.get_dummies(data['OPERA'], prefix = 'OPERA'),\n", - " pd.get_dummies(data['TIPOVUELO'], prefix = 'TIPOVUELO'), \n", - " pd.get_dummies(data['MES'], prefix = 'MES')], \n", - " axis = 1\n", + " pd.get_dummies(data['OPERA'], prefix='OPERA'),\n", + " pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'), \n", + " pd.get_dummies(data['MES'], prefix='MES')], \n", + " axis=1\n", ")\n", "target = data['delay']" ] @@ -557,7 +542,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.33, random_state = 42)" + "x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)" ] }, { @@ -575,7 +560,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_train.value_counts('%')*100" + "y_train.value_counts('%') * 100" ] }, { @@ -584,7 +569,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_test.value_counts('%')*100" + "y_test.value_counts('%') * 100" ] }, { @@ -726,7 +711,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize = (10,5))\n", + "plt.figure(figsize=(10,5))\n", "plot_importance(xgb_model)" ] }, @@ -738,15 +723,15 @@ "source": [ "top_10_features = [\n", " \"OPERA_Latin American Wings\", \n", - " \"MES_7\",\n", " \"MES_10\",\n", + " \"MES_7\",\n", " \"OPERA_Grupo LATAM\",\n", - " \"MES_12\",\n", - " \"TIPOVUELO_I\",\n", + " \"MES_6\",\n", " \"MES_4\",\n", - " \"MES_11\",\n", + " \"MES_8\",\n", + " \"MES_12\",\n", " \"OPERA_Sky Airline\",\n", - " \"OPERA_Copa Air\"\n", + " \"TIPOVUELO_I\",\n", "]" ] }, @@ -766,7 +751,7 @@ "source": [ "n_y0 = len(y_train[y_train == 0])\n", "n_y1 = len(y_train[y_train == 1])\n", - "scale = n_y0/n_y1\n", + "scale = n_y0 / n_y1\n", "print(scale)" ] }, @@ -792,7 +777,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_train2, x_test2, y_train2, y_test2 = train_test_split(features[top_10_features], target, test_size = 0.33, random_state = 42)" + "x_train2, x_test2, y_train2, y_test2 = train_test_split(features[top_10_features], target, test_size=0.33, random_state=42)" ] }, { @@ -817,7 +802,7 @@ "metadata": {}, "outputs": [], "source": [ - "xgb_model_2 = xgb.XGBClassifier(random_state=1, learning_rate=0.01, scale_pos_weight = scale)\n", + "xgb_model_2 = xgb.XGBClassifier(random_state=1, learning_rate=0.01, scale_pos_weight=scale)\n", "xgb_model_2.fit(x_train2, y_train2)" ] }, @@ -1027,7 +1012,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.12.4" }, "orig_nbformat": 4 }, diff --git a/docs/challenge.md b/docs/challenge.md index e69de29..ddb64a3 100644 --- a/docs/challenge.md +++ b/docs/challenge.md @@ -0,0 +1,20 @@ +# Bugs fixed on `exploration.ipynb` +- The function `is_high_season` had an issue since it didn't consider the time. For example, `is_high_season("2017-12-31 14:55:00")` returned `0` when in reality it should return `1`. 
+- All calls to `sns.barplot` were missing the `x` and `y` definition.
+- To improve visualizations and correctly show the delay rate, the method `get_rate_from_column` was updated. Instead of calculating `rates[name] = round(total / delays[name], 2)`, I think it's best to do `rates[name] = round(100 * delays[name] / total, 2)` to get the ratio of delayed flights to the total number of flights (for a specific column value). This value is now between 0 and 100, where `0` indicates that no flights with that specific column value were delayed and `100` indicates that all flights with that specific column value were delayed. After implementing this change, visualization code had to be updated as well to avoid limiting the y-axis.
+- `training_data` was defined but never used. This cell was deleted.
+- `top_10_features` included more than 10 features and they weren't the top ones.
+- Minor style changes were applied, such as using spaces around operators and removing spaces when defining the value of certain method arguments.
+- `xgboost` was not included under installed dependencies.
+
+
+# Model pick
+When comparing the different models' performances, I want to focus on the positive (minority) class since that's the class that represents delays and the model is intended to predict the probability of **delay**. For this, I'll focus on the F1-score for the positive class, since it combines precision and recall into a single metric, providing a balance between the two and accounting for false positives and false negatives. It offers a consolidated evaluation of the model's performance in predicting the positive class while factoring in the class imbalance. Let's review the results for each model:
+1. **XGBoost**: 0.00
+2. **Logistic Regression**: 0.06
+3. **XGBoost with Feature Importance and with Balance**: 0.36
+4. **XGBoost with Feature Importance but without Balance**: 0.01
+5. **Logistic Regression with Feature Importance and with Balance**: 0.36
+6. **Logistic Regression with Feature Importance but without Balance**: 0.03
+
+With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets.
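For reference, the scores above are the positive-class F1 values from `scikit-learn`'s classification report, which the notebook already imports. A minimal sketch of how one such score can be read off, assuming the notebook's fitted model and `x_test`/`y_test` split (`positive_class_f1` is an illustrative helper, not code from the notebook):

```python
from sklearn.metrics import classification_report

def positive_class_f1(model, x_test, y_test) -> float:
    # F1-score of the positive class (label 1, i.e. delayed flights),
    # read from sklearn's per-class report dictionary.
    preds = model.predict(x_test)
    report = classification_report(y_test, preds, output_dict=True)
    return round(report["1"]["f1-score"], 2)
```

Comparing this single number across the six candidates is what the pick above is based on.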
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 64fde77..99ea409 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 fastapi~=0.86.0
 pydantic~=1.10.2
 uvicorn~=0.15.0
-numpy~=1.22.4
+numpy~=1.26.4
 pandas~=1.3.5
 scikit-learn~=1.3.0
+xgboost~=2.1.0

From ed343495e5dd4277d40c2464acc77b2ece0708a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?=
Date: Wed, 26 Jun 2024 10:40:59 -0300
Subject: [PATCH 2/9] Keep original TOP_10_FEATURES

---
 challenge/exploration.ipynb | 10 +++++-----
 docs/challenge.md           |  5 ++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/challenge/exploration.ipynb b/challenge/exploration.ipynb
index 07a8af4..639827a 100644
--- a/challenge/exploration.ipynb
+++ b/challenge/exploration.ipynb
@@ -723,15 +723,15 @@
    "source": [
     "top_10_features = [\n",
     "    \"OPERA_Latin American Wings\", \n",
-    "    \"MES_10\",\n",
     "    \"MES_7\",\n",
+    "    \"MES_10\",\n",
     "    \"OPERA_Grupo LATAM\",\n",
-    "    \"MES_6\",\n",
-    "    \"MES_4\",\n",
-    "    \"MES_8\",\n",
     "    \"MES_12\",\n",
-    "    \"OPERA_Sky Airline\",\n",
     "    \"TIPOVUELO_I\",\n",
+    "    \"MES_4\",\n",
+    "    \"MES_11\",\n",
+    "    \"OPERA_Sky Airline\",\n",
+    "    \"OPERA_Copa Air\"\n",
     "]"
    ]
  },
diff --git a/docs/challenge.md b/docs/challenge.md
index ddb64a3..4754963 100644
--- a/docs/challenge.md
+++ b/docs/challenge.md
@@ -3,7 +3,6 @@
 - All calls to `sns.barplot` were missing the `x` and `y` definition.
 - To improve visualizations and correctly show the delay rate, the method `get_rate_from_column` was updated. Instead of calculating `rates[name] = round(total / delays[name], 2)`, I think it's best to do `rates[name] = round(100 * delays[name] / total, 2)` to get the ratio of delayed flights to the total number of flights (for a specific column value). This value is now between 0 and 100, where `0` indicates that no flights with that specific column value were delayed and `100` indicates that all flights with that specific column value were delayed. After implementing this change, visualization code had to be updated as well to avoid limiting the y-axis.
 - `training_data` was defined but never used. This cell was deleted.
-- `top_10_features` included more than 10 features and they weren't the top ones.
 - Minor style changes were applied, such as using spaces around operators and removing spaces when defining the value of certain method arguments.
 - `xgboost` was not included under installed dependencies.
@@ -12,9 +11,9 @@ When comparing the different models' performances, I want to focus on the positive (minority) class since that's the class that represents delays and the model is intended to predict the probability of **delay**. For this, I'll focus on the F1-score for the positive class, since it combines precision and recall into a single metric, providing a balance between the two and accounting for false positives and false negatives. It offers a consolidated evaluation of the model's performance in predicting the positive class while factoring in the class imbalance. Let's review the results for each model:
 1. **XGBoost**: 0.00
 2. **Logistic Regression**: 0.06
-3. **XGBoost with Feature Importance and with Balance**: 0.36
+3. **XGBoost with Feature Importance and with Balance**: 0.37
 4. **XGBoost with Feature Importance but without Balance**: 0.01
 5. **Logistic Regression with Feature Importance and with Balance**: 0.36
 6. 
**Logistic Regression with Feature Importance but without Balance**: 0.03
 
-With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets.
\ No newline at end of file
+With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features.
\ No newline at end of file

From eb39a2551668204a0d5d68f2da2680ad1b409aa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?=
Date: Thu, 27 Jun 2024 09:58:27 -0300
Subject: [PATCH 3/9] Transcribe model from notebook to Python script

---
 challenge/model.py        | 104 +++++++++++++++++++++++++++++++++++---
 docs/challenge.md         |   6 ++-
 tests/model/test_model.py |   2 +-
 3 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/challenge/model.py b/challenge/model.py
index 173ac6c..1143620 100644
--- a/challenge/model.py
+++ b/challenge/model.py
@@ -1,19 +1,70 @@
+import pickle
+
+import numpy as np
 import pandas as pd
+import xgboost as xgb
+
+from datetime import datetime
 from typing import Tuple, Union, List
 
+from sklearn.model_selection import train_test_split
+
+
+THRESHOLD_IN_MINUTES = 15
+MODEL_FILE_NAME = "delay_model.pkl"
+
+
 class DelayModel:
 
     def __init__(
         self
     ):
-        self._model = None # Model should be saved in this attribute.
+        self._features = [
+            "OPERA_Latin American Wings",
+            "MES_7",
+            "MES_10",
+            "OPERA_Grupo LATAM",
+            "MES_12",
+            "TIPOVUELO_I",
+            "MES_4",
+            "MES_11",
+            "OPERA_Sky Airline",
+            "OPERA_Copa Air"
+        ]
+        self._model = self.__load_model(MODEL_FILE_NAME)
+
+    def __load_model(self, file_name):
+        try:
+            with open(file_name, 'rb') as fp:
+                return pickle.load(fp)
+        except FileNotFoundError:
+            return None
+
+    def __save_model(self, filename):
+        with open(filename, 'wb') as fp:
+            pickle.dump(self._model, fp)
+
+    def get_min_diff(self, data):
+        """
+        Calculate the minute difference between two datetime values.
+
+        Args:
+            data (pd.DataFrame): raw data.
+
+        Returns:
+            float: Minute difference between 'Fecha-O' and 'Fecha-I'.
+        """
+
+        fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
+        fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
+        min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
+        return min_diff
 
     def preprocess(
         self,
         data: pd.DataFrame,
         target_column: str = None
-    ) -> Union(Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame):
+    ) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
         """
         Prepare raw data for training or predict.
@@ -26,7 +77,28 @@ def preprocess(
             or
             pd.DataFrame: features.
""" - return + + features = pd.concat([ + pd.get_dummies(data['OPERA'], prefix='OPERA'), + pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'), + pd.get_dummies(data['MES'], prefix='MES')], + axis=1 + ) + + for feature in self._features: + if feature not in features.columns: + features[feature] = 0 + + if target_column: + data['min_diff'] = data.apply(self.get_min_diff, axis=1) + + data[target_column] = np.where( + data['min_diff'] > THRESHOLD_IN_MINUTES, 1, 0 + ) + + return features[self._features], data[[target_column]] + else: + return features[self._features] def fit( self, @@ -40,7 +112,21 @@ def fit( features (pd.DataFrame): preprocessed data. target (pd.DataFrame): target. """ - return + + x_train, _, y_train, _ = train_test_split( + features, target, test_size=0.33, random_state=42 + ) + + n_y0 = int((target == 0).sum()) + n_y1 = int((target == 1).sum()) + scale = n_y0 / n_y1 + + self._model = xgb.XGBClassifier( + random_state=1, learning_rate=0.01, scale_pos_weight=scale + ) + + self._model.fit(x_train, y_train) + self.__save_model(MODEL_FILE_NAME) def predict( self, @@ -51,8 +137,14 @@ def predict( Args: features (pd.DataFrame): preprocessed data. - + Returns: (List[int]): predicted targets. """ - return \ No newline at end of file + + if self._model is None: + self.__load_model(MODEL_FILE_NAME) + + predictions = self._model.predict(features) + + return predictions.tolist() diff --git a/docs/challenge.md b/docs/challenge.md index 4754963..7a09c1d 100644 --- a/docs/challenge.md +++ b/docs/challenge.md @@ -16,4 +16,8 @@ When comparing the different models' performances, I want to focus on the positi 5. **Logistic Regression with Feature Importante and with Balance**: 0.36 6. **Logistic Regression with Feature Importante but without Balance**: 0.03 -With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. \ No newline at end of file +With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets. + + +# Bugs fixed on `test_model.py` +- Data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"` it worked as expected. 
diff --git a/tests/model/test_model.py b/tests/model/test_model.py index e4afabb..97b28e1 100644 --- a/tests/model/test_model.py +++ b/tests/model/test_model.py @@ -28,7 +28,7 @@ class TestModel(unittest.TestCase): def setUp(self) -> None: super().setUp() self.model = DelayModel() - self.data = pd.read_csv(filepath_or_buffer="../data/data.csv") + self.data = pd.read_csv(filepath_or_buffer="./data/data.csv") def test_model_preprocess_for_training( From 47b14fb57f8bf2484697450f58cdd0e90802f7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Thu, 27 Jun 2024 10:20:28 -0300 Subject: [PATCH 4/9] Deploy the model in an API with FastAPI --- .gitignore | 6 ++++ challenge/api.py | 72 ++++++++++++++++++++++++++++++++++++++++++------ requirements.txt | 6 ++-- 3 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ff3cb08 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ + +# Unit test / coverage reports +.coverage +reports/ diff --git a/challenge/api.py b/challenge/api.py index 1a0f76f..90d4ccc 100644 --- a/challenge/api.py +++ b/challenge/api.py @@ -1,13 +1,69 @@ -import fastapi +from fastapi import FastAPI, HTTPException +import pandas as pd + +from challenge.model import DelayModel + +from pydantic import BaseModel, validator +from typing import List + +app = FastAPI() +delay_model = DelayModel() + + +class Flight(BaseModel): + OPERA: str + MES: int + TIPOVUELO: str + + @validator('MES') + def validate_month(cls, v): + if v < 1 or v > 12: + raise HTTPException( + status_code=400, + detail='MES must be between 1 and 12' + ) + return v + + @validator('TIPOVUELO') + def validate_flight_type(cls, v): + if v not in ['I', 'N']: + raise HTTPException( + status_code=400, + detail='TIPOVUELO must be either "I" or "N"' + ) + return v + + +class PredictionInfo(BaseModel): + flights: List[Flight] -app = fastapi.FastAPI() @app.get("/health", status_code=200) async def get_health() -> dict: - return { - "status": "OK" - } + return {"status": "OK"} + + +@app.post("/predict", response_model=dict, status_code=200) +async def post_predict(input: PredictionInfo) -> dict: + try: + data = [ + { + "OPERA": flight.OPERA, + "MES": flight.MES, + "TIPOVUELO": flight.TIPOVUELO + } for flight in input.flights + ] + df = pd.DataFrame(data) + + preprocessed_data = delay_model.preprocess(df) + predictions = delay_model.predict(preprocessed_data) + + return {"predict": predictions} -@app.post("/predict", status_code=200) -async def post_predict() -> dict: - return \ No newline at end of file + except ValueError as ve: + raise HTTPException(status_code=400, detail=str(ve)) + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"An error occurred while processing the prediction: {str(e)}" + ) diff --git a/requirements.txt b/requirements.txt index 99ea409..0cb3b96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -fastapi~=0.86.0 -pydantic~=1.10.2 -uvicorn~=0.15.0 +fastapi~=0.111.0 +pydantic~=1.10.17 +uvicorn~=0.30.1 numpy~=1.26.4 pandas~=1.3.5 scikit-learn~=1.3.0 From a2eda82418c0ae11d6f02124f62be5babde815a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Thu, 27 Jun 2024 12:03:26 -0300 Subject: [PATCH 5/9] Dockerfile with deployed image --- Dockerfile | 24 +++++++++++++++++++++--- Makefile | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 
ef0b367..3ce356c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,21 @@ -# syntax=docker/dockerfile:1.2 -FROM python:latest -# put you docker configuration here \ No newline at end of file +FROM python:3.9-slim + +# Set the working directory in the container +WORKDIR /app + +# Copy the requirements files into the container +COPY requirements.txt requirements.txt +COPY requirements-dev.txt requirements-dev.txt + +# Install the required Python packages +RUN pip install -r requirements.txt +RUN pip install -r requirements-dev.txt + +# Copy all files from the current directory to the working directory in the container +COPY . . + +# Expose port 8080 of the container to external network +EXPOSE 8080 + +# Command to run the FastAPI application with Uvicorn +CMD ["uvicorn", "challenge.api:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/Makefile b/Makefile index 3218c8d..981286f 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ install: ## Install dependencies pip install -r requirements-test.txt pip install -r requirements.txt -STRESS_URL = http://127.0.0.1:8000 +STRESS_URL = https://paz-challenge-tryolabs-latam-6fru3wsz3q-uc.a.run.app .PHONY: stress-test stress-test: # change stress url to your deployed app From 4939ca6533977f85bd213526e763cfe1f8277281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Thu, 27 Jun 2024 15:18:04 -0300 Subject: [PATCH 6/9] Add Continuous Integration GA --- .github/workflows/ci.yml | 45 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..2e994a4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,45 @@ +name: 'Continuous Integration' + +on: + push: + branches: + - main + - develop + pull_request: + branches: + - main + - develop + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Cache dependencies + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-dev.txt + pip install -r requirements-test.txt + + - name: Run model tests + run: make model-test + + - name: Run API tests + run: make api-test From bbf88f34a88688a11eda5257d299f7720751c92f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Thu, 27 Jun 2024 16:30:42 -0300 Subject: [PATCH 7/9] Add Continuous Deployment GA --- .github/workflows/cd.yml | 67 ++++++++++++++++++++++++++++++++++++++++ requirements-test.txt | 2 +- 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/cd.yml diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..e4dbfbe --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,67 @@ +name: 'Continuous Delivery' + +on: + push: + branches: + - main + - release/* + pull_request: + branches: + - main + - release/* + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Cache dependencies + uses: actions/cache@v2 + with: + 
path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-dev.txt + pip install -r requirements-test.txt + + - name: Set up Google Cloud SDK + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_CREDENTIALS }} + + - name: Configure Docker + run: gcloud auth configure-docker + + - name: Setup gcloud CLI + uses: google-github-actions/setup-gcloud@v1 + with: + version: '390.0.0' + service_account_key: ${{ secrets.GCP_CREDENTIALS }} + + - name: Download Model from GCS + run: gsutil cp gs://delay-models/source/${{ secrets.MODEL_VERSION }}.pkl delay_model.pkl + + - name: Submit Build + run: gcloud builds submit --region ${{ secrets.GCP_REGION }} --tag ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/challenge/${{ secrets.GCP_IMAGE_NAME }}:latest + + - name: Deploy to Cloud Run + id: deploy + uses: google-github-actions/deploy-cloudrun@v0 + run: gcloud run deploy ${{ secrets.GCP_IMAGE_NAME }} --image ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/challenge/${{ secrets.GCP_IMAGE_NAME }}:latest --allow-unauthenticated --region ${{ secrets.GCP_REGION }} + + - name: Run Stress Test + run: make stress-test STRESS_URL=${{ steps.deploy.outputs.url }} diff --git a/requirements-test.txt b/requirements-test.txt index 2753f60..91f2e79 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ -locust~=1.6 +locust~=2.29.1 coverage~=5.5 pytest~=6.2.5 pytest-cov~=2.12.1 From e3a78e2029316786e1cb5aa50a6147904db289a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Thu, 27 Jun 2024 16:30:42 -0300 Subject: [PATCH 8/9] Add Continuous Deployment GA --- .github/workflows/cd.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index e4dbfbe..77f494d 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -59,9 +59,7 @@ jobs: run: gcloud builds submit --region ${{ secrets.GCP_REGION }} --tag ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/challenge/${{ secrets.GCP_IMAGE_NAME }}:latest - name: Deploy to Cloud Run - id: deploy - uses: google-github-actions/deploy-cloudrun@v0 run: gcloud run deploy ${{ secrets.GCP_IMAGE_NAME }} --image ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/challenge/${{ secrets.GCP_IMAGE_NAME }}:latest --allow-unauthenticated --region ${{ secrets.GCP_REGION }} - name: Run Stress Test - run: make stress-test STRESS_URL=${{ steps.deploy.outputs.url }} + run: make stress-test From 563259dbccad890d65082c631b150c9a0fed1d40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Paz=20Cuturi?= Date: Fri, 28 Jun 2024 14:40:56 -0300 Subject: [PATCH 9/9] Add relevant justifications to challenge.md --- docs/challenge.md | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/docs/challenge.md b/docs/challenge.md index 7a09c1d..46a11ef 100644 --- a/docs/challenge.md +++ b/docs/challenge.md @@ -1,4 +1,5 @@ -# Bugs fixed on `exploration.ipynb` +# Part I +## Bugs fixed on `exploration.ipynb` - The function `is_high_season` had an issue since it didn't consider the time. For example, `is_high_season("2017-12-31 14:55:00")` returned `0` when in reality it should return `1`. 
 - All calls to `sns.barplot` were missing the `x` and `y` definition.
 - To improve visualizations and correctly show the delay rate, the method `get_rate_from_column` was updated. Instead of calculating `rates[name] = round(total / delays[name], 2)`, I think it's best to do `rates[name] = round(100 * delays[name] / total, 2)` to get the ratio of delayed flights to the total number of flights (for a specific column value). This value is now between 0 and 100, where `0` indicates that no flights with that specific column value were delayed and `100` indicates that all flights with that specific column value were delayed. After implementing this change, visualization code had to be updated as well to avoid limiting the y-axis.
 - `training_data` was defined but never used. This cell was deleted.
 - Minor style changes were applied, such as using spaces around operators and removing spaces when defining the value of certain method arguments.
 - `xgboost` was not included under installed dependencies.
-
-# Model pick
+## Model pick
 When comparing the different models' performances, I want to focus on the positive (minority) class since that's the class that represents delays and the model is intended to predict the probability of **delay**. For this, I'll focus on the F1-score for the positive class, since it combines precision and recall into a single metric, providing a balance between the two and accounting for false positives and false negatives. It offers a consolidated evaluation of the model's performance in predicting the positive class while factoring in the class imbalance. Let's review the results for each model:
 1. **XGBoost**: 0.00
 2. **Logistic Regression**: 0.06
@@ -18,6 +18,24 @@ When comparing the different models' performances, I want to focus on the positi
 With this in mind, the best model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and it's more suitable to capture non-linear patterns and interactions between features. Plus, XGBoost is highly scalable and effective for large datasets, while LogisticRegression may struggle with very large datasets.
+
+
+# Part II
+The model was kept as unchanged as possible. My intention was to avoid hardcoding the top 10 features, but the test `test_model.py` asserted that only those columns were present.
+Some improvements that could be done here are:
+- Have `THRESHOLD_IN_MINUTES` and `MODEL_FILE_NAME` as environment variables. That way, if we want to update either of those values, we can do that without the need to update the code.
+- Upload the trained model directly to Google Cloud Storage (GCS) or to another service that can store the different versions of the trained model. For now, we're saving it locally and manually uploading it to GCS, which is where the Continuous Deployment pipeline gets it from.
 
-# Bugs fixed on `test_model.py`
+## Bugs fixed on `test_model.py`
 - Data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"` it worked as expected.
+
+# Part III
+The API was deployed by building a Docker image and pushing it to Artifact Registry (GCP). The container was then deployed to Cloud Run.
+
+# Part IV
+Continuous Integration workflow runs whenever there is a push or pull request to the main/develop branches.
+Continuous Deployment workflow runs whenever there is a push or pull request to the main/release branches.
+The following secrets were set directly as Repository Secrets on GitHub:
+- GCP_CREDENTIALS: content of GCP's `credentials.json`.
+- GCP_IMAGE_NAME: name of the image pushed to Artifact Registry on GCP.
+- GCP_PROJECT_ID: name of the project used on GCP (e.g. 'project-id').
+- GCP_REGION: region used on GCP (e.g. 'us-central1').
+- MODEL_VERSION: name of the GCS model that will be downloaded and served (e.g. 'v0').
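As a final sanity check of the service from Part III, a sketch of how the `/predict` endpoint can be exercised locally with FastAPI's test client; the payload values are illustrative and the printed outputs are indicative of a successful call, not recorded results:

```python
from fastapi.testclient import TestClient

from challenge.api import app

# The payload shape follows the Flight/PredictionInfo models in challenge/api.py.
client = TestClient(app)
response = client.post("/predict", json={
    "flights": [
        {"OPERA": "Grupo LATAM", "TIPOVUELO": "N", "MES": 3}
    ]
})

print(response.status_code)  # 200
print(response.json())       # e.g. {"predict": [0]}
```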