diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
new file mode 100644
index 0000000..77f494d
--- /dev/null
+++ b/.github/workflows/cd.yml
@@ -0,0 +1,65 @@
+name: 'Continuous Delivery'
+
+on:
+  push:
+    branches:
+      - main
+      - release/*
+  pull_request:
+    branches:
+      - main
+      - release/*
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Cache dependencies
+        uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+          pip install -r requirements-test.txt
+
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/auth@v1
+        with:
+          credentials_json: ${{ secrets.GCP_CREDENTIALS }}
+
+      - name: Configure Docker
+        run: gcloud auth configure-docker
+
+      # Authentication is provided by the google-github-actions/auth step above.
+      - name: Setup gcloud CLI
+        uses: google-github-actions/setup-gcloud@v1
+        with:
+          version: '390.0.0'
+
+      - name: Download Model from GCS
+        run: gsutil cp gs://delay-models/source/${{ secrets.MODEL_VERSION }}.pkl delay_model.pkl
+
+      - name: Submit Build
+        run: gcloud builds submit --region ${{ secrets.GCP_REGION }} --tag ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/challenge/${{ secrets.GCP_IMAGE_NAME }}:latest
+
+      - name: Deploy to Cloud Run
+        run: gcloud run deploy ${{ secrets.GCP_IMAGE_NAME }} --image ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/challenge/${{ secrets.GCP_IMAGE_NAME }}:latest --allow-unauthenticated --region ${{ secrets.GCP_REGION }}
+
+      - name: Run Stress Test
+        run: make stress-test
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2e994a4
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,45 @@
+name: 'Continuous Integration'
+
+on:
+  push:
+    branches:
+      - main
+      - develop
+  pull_request:
+    branches:
+      - main
+      - develop
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Cache dependencies
+        uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+          pip install -r requirements-test.txt
+
+      - name: Run model tests
+        run: make model-test
+
+      - name: Run API tests
+        run: make api-test
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ff3cb08
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+
+# Unit test / coverage reports
+.coverage
+reports/
diff --git a/Dockerfile b/Dockerfile
index ef0b367..3ce356c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,3 +1,21 @@
-# syntax=docker/dockerfile:1.2
-FROM python:latest
-# put you docker configuration here
\ No newline at end of file
+FROM python:3.9-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements files into the container
+COPY requirements.txt requirements.txt
+COPY requirements-dev.txt requirements-dev.txt
+
+# Install the required Python packages
+RUN pip install -r requirements.txt
+RUN pip install -r requirements-dev.txt
+
+# Copy all files from the current directory to the working directory in the container
+COPY . .
+
+# Document that the application listens on port 8080 (EXPOSE does not publish the port by itself)
+EXPOSE 8080
+
+# Command to run the FastAPI application with Uvicorn
+CMD ["uvicorn", "challenge.api:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/Makefile b/Makefile
index 3218c8d..981286f 100644
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,7 @@ install: ## Install dependencies
 	pip install -r requirements-test.txt
 	pip install -r requirements.txt
 
-STRESS_URL = http://127.0.0.1:8000
+STRESS_URL = https://paz-challenge-tryolabs-latam-6fru3wsz3q-uc.a.run.app
 .PHONY: stress-test
 stress-test: # change stress url to your deployed app
diff --git a/challenge/api.py b/challenge/api.py
index 1a0f76f..90d4ccc 100644
--- a/challenge/api.py
+++ b/challenge/api.py
@@ -1,13 +1,72 @@
-import fastapi
+from fastapi import FastAPI, HTTPException
+import pandas as pd
+
+from challenge.model import DelayModel
+
+from pydantic import BaseModel, validator
+from typing import List
+
+app = FastAPI()
+delay_model = DelayModel()
+
+
+class Flight(BaseModel):
+    OPERA: str
+    MES: int
+    TIPOVUELO: str
+
+    # Raising HTTPException (rather than ValueError) inside these validators
+    # propagates to FastAPI's exception handlers, so invalid values return
+    # 400 instead of pydantic's default 422.
+    @validator('MES')
+    def validate_month(cls, v):
+        if v < 1 or v > 12:
+            raise HTTPException(
+                status_code=400,
+                detail='MES must be between 1 and 12'
+            )
+        return v
+
+    @validator('TIPOVUELO')
+    def validate_flight_type(cls, v):
+        if v not in ['I', 'N']:
+            raise HTTPException(
+                status_code=400,
+                detail='TIPOVUELO must be either "I" or "N"'
+            )
+        return v
+
+
+class PredictionInfo(BaseModel):
+    flights: List[Flight]
 
-app = fastapi.FastAPI()
 
 @app.get("/health", status_code=200)
 async def get_health() -> dict:
-    return {
-        "status": "OK"
-    }
+    return {"status": "OK"}
+
+
+@app.post("/predict", response_model=dict, status_code=200)
+async def post_predict(input: PredictionInfo) -> dict:
+    try:
+        data = [
+            {
+                "OPERA": flight.OPERA,
+                "MES": flight.MES,
+                "TIPOVUELO": flight.TIPOVUELO
+            } for flight in input.flights
+        ]
+        df = pd.DataFrame(data)
+
+        preprocessed_data = delay_model.preprocess(df)
+        predictions = delay_model.predict(preprocessed_data)
+
+        return {"predict": predictions}
 
-@app.post("/predict", status_code=200)
-async def post_predict() -> dict:
-    return
\ No newline at end of file
+    except ValueError as ve:
+        raise HTTPException(status_code=400, detail=str(ve))
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"An error occurred while processing the prediction: {str(e)}"
+        )
diff --git a/challenge/exploration.ipynb b/challenge/exploration.ipynb
index d6d5f87..639827a 100644
--- a/challenge/exploration.ipynb
+++ b/challenge/exploration.ipynb
@@ -56,9 +56,9 @@
 "outputs": [],
 "source": [
 "flights_by_airline = data['OPERA'].value_counts()\n",
- "plt.figure(figsize = (10, 2))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot(flights_by_airline.index, flights_by_airline.values, alpha=0.9)\n",
+ "plt.figure(figsize=(10, 2))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=flights_by_airline.index, y=flights_by_airline.values, alpha=0.9)\n",
 "plt.title('Flights by Airline')\n",
 "plt.ylabel('Flights', fontsize=12)\n",
 "plt.xlabel('Airline', fontsize=12)\n",
@@ -73,9 +73,9 @@
 "outputs": [],
 "source": [
 "flights_by_day = data['DIA'].value_counts()\n",
- "plt.figure(figsize = (10, 2))\n",
- "sns.set(style = \"darkgrid\")\n",
"sns.barplot(flights_by_day.index, flights_by_day.values, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flights_by_day.index, y=flights_by_day.values, color='lightblue', alpha=0.8)\n", "plt.title('Flights by Day')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Day', fontsize=12)\n", @@ -90,9 +90,9 @@ "outputs": [], "source": [ "flights_by_month = data['MES'].value_counts()\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style = \"darkgrid\")\n", - "sns.barplot(flights_by_month.index, flights_by_month.values, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flights_by_month.index, y=flights_by_month.values, color='lightblue', alpha=0.8)\n", "plt.title('Flights by Month')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Month', fontsize=12)\n", @@ -125,9 +125,9 @@ " flights_by_day_in_week.values[6], \n", " flights_by_day_in_week.values[3]\n", "]\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(days, values_by_day, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=days, y=values_by_day, color='lightblue', alpha=0.8)\n", "plt.title('Flights by Day in Week')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Day in Week', fontsize=12)\n", @@ -142,9 +142,9 @@ "outputs": [], "source": [ "flights_by_type = data['TIPOVUELO'].value_counts()\n", - "sns.set(style=\"darkgrid\")\n", - "plt.figure(figsize = (10, 2))\n", - "sns.barplot(flights_by_type.index, flights_by_type.values, alpha=0.9)\n", + "sns.set_theme(style=\"darkgrid\")\n", + "plt.figure(figsize=(10, 2))\n", + "sns.barplot(x=flights_by_type.index, y=flights_by_type.values, alpha=0.9)\n", "plt.title('Flights by Type')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Type', fontsize=12)\n", @@ -158,9 +158,9 @@ "outputs": [], "source": [ "flight_by_destination = data['SIGLADES'].value_counts()\n", - "plt.figure(figsize = (10, 2))\n", - "sns.set(style=\"darkgrid\")\n", - "sns.barplot(flight_by_destination.index, flight_by_destination.values, color = 'lightblue', alpha=0.8)\n", + "plt.figure(figsize=(10, 2))\n", + "sns.set_theme(style=\"darkgrid\")\n", + "sns.barplot(x=flight_by_destination.index, y=flight_by_destination.values, color='lightblue', alpha=0.8)\n", "plt.title('Flight by Destination')\n", "plt.ylabel('Flights', fontsize=12)\n", "plt.xlabel('Destination', fontsize=12)\n", @@ -243,14 +243,14 @@ "def is_high_season(fecha):\n", " fecha_año = int(fecha.split('-')[0])\n", " fecha = datetime.strptime(fecha, '%Y-%m-%d %H:%M:%S')\n", - " range1_min = datetime.strptime('15-Dec', '%d-%b').replace(year = fecha_año)\n", - " range1_max = datetime.strptime('31-Dec', '%d-%b').replace(year = fecha_año)\n", - " range2_min = datetime.strptime('1-Jan', '%d-%b').replace(year = fecha_año)\n", - " range2_max = datetime.strptime('3-Mar', '%d-%b').replace(year = fecha_año)\n", - " range3_min = datetime.strptime('15-Jul', '%d-%b').replace(year = fecha_año)\n", - " range3_max = datetime.strptime('31-Jul', '%d-%b').replace(year = fecha_año)\n", - " range4_min = datetime.strptime('11-Sep', '%d-%b').replace(year = fecha_año)\n", - " range4_max = datetime.strptime('30-Sep', '%d-%b').replace(year = fecha_año)\n", + " range1_min = datetime.strptime('15-Dec 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n", + " range1_max = 
+ " range1_max = datetime.strptime('31-Dec 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
+ " range2_min = datetime.strptime('1-Jan 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
+ " range2_max = datetime.strptime('3-Mar 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
+ " range3_min = datetime.strptime('15-Jul 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
+ " range3_max = datetime.strptime('31-Jul 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
+ " range4_min = datetime.strptime('11-Sep 00:00:00', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
+ " range4_max = datetime.strptime('30-Sep 23:59:59', '%d-%b %H:%M:%S').replace(year=fecha_año)\n",
 " \n",
 " if ((fecha >= range1_min and fecha <= range1_max) or \n",
 " (fecha >= range2_min and fecha <= range2_max) or \n",
@@ -297,7 +297,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "data['min_diff'] = data.apply(get_min_diff, axis = 1)"
+ "data['min_diff'] = data.apply(get_min_diff, axis=1)"
 ]
 },
 {
@@ -362,11 +362,11 @@
 " rates = {}\n",
 " for name, total in total.items():\n",
 " if name in delays:\n",
- " rates[name] = round(total / delays[name], 2)\n",
+ " rates[name] = round(100 * delays[name] / total, 2)\n",
 " else:\n",
 " rates[name] = 0\n",
 " \n",
- " return pd.DataFrame.from_dict(data = rates, orient = 'index', columns = ['Tasa (%)'])"
+ " return pd.DataFrame.from_dict(data=rates, orient='index', columns=['Tasa (%)'])"
 ]
 },
 {
@@ -377,9 +377,9 @@
 "source": [
 "destination_rate = get_rate_from_column(data, 'SIGLADES')\n",
 "destination_rate_values = data['SIGLADES'].value_counts().index\n",
- "plt.figure(figsize = (20,5))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot(destination_rate_values, destination_rate['Tasa (%)'], alpha = 0.75)\n",
+ "plt.figure(figsize=(20,5))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=destination_rate_values, y=destination_rate['Tasa (%)'], alpha=0.75)\n",
 "plt.title('Delay Rate by Destination')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('Destination', fontsize=12)\n",
@@ -395,9 +395,9 @@
 "source": [
 "airlines_rate = get_rate_from_column(data, 'OPERA')\n",
 "airlines_rate_values = data['OPERA'].value_counts().index\n",
- "plt.figure(figsize = (20,5))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot(airlines_rate_values, airlines_rate['Tasa (%)'], alpha = 0.75)\n",
+ "plt.figure(figsize=(20,5))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=airlines_rate_values, y=airlines_rate['Tasa (%)'], alpha=0.75)\n",
 "plt.title('Delay Rate by Airline')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('Airline', fontsize=12)\n",
@@ -413,14 +413,13 @@
 "source": [
 "month_rate = get_rate_from_column(data, 'MES')\n",
 "month_rate_value = data['MES'].value_counts().index\n",
- "plt.figure(figsize = (20,5))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot(month_rate_value, month_rate['Tasa (%)'], color = 'blue', alpha = 0.75)\n",
+ "plt.figure(figsize=(20,5))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=month_rate_value, y=month_rate['Tasa (%)'], color='blue', alpha=0.75)\n",
 "plt.title('Delay Rate by Month')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('Month', fontsize=12)\n",
 "plt.xticks(rotation=90)\n",
- "plt.ylim(0,10)\n",
 "plt.show()"
 ]
 },
 {
@@ -433,14 +432,13 @@
 "days_rate = get_rate_from_column(data, 'DIANOM')\n",
 "days_rate_value = data['DIANOM'].value_counts().index\n",
 "\n",
- "sns.set(style=\"darkgrid\")\n",
- "plt.figure(figsize = (20, 5))\n",
- "sns.barplot(days_rate_value, days_rate['Tasa (%)'], color = 'blue', alpha = 0.75)\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "plt.figure(figsize=(20, 5))\n",
+ "sns.barplot(x=days_rate_value, y=days_rate['Tasa (%)'], color='blue', alpha=0.75)\n",
 "plt.title('Delay Rate by Day')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('Days', fontsize=12)\n",
 "plt.xticks(rotation=90)\n",
- "plt.ylim(0,7)\n",
 "plt.show()"
 ]
 },
 {
@@ -453,14 +451,13 @@
 "high_season_rate = get_rate_from_column(data, 'high_season')\n",
 "high_season_rate_values = data['high_season'].value_counts().index\n",
 "\n",
- "plt.figure(figsize = (5, 2))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot([\"no\", \"yes\"], high_season_rate['Tasa (%)'])\n",
+ "plt.figure(figsize=(5, 2))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=[\"no\", \"yes\"], y=high_season_rate['Tasa (%)'])\n",
 "plt.title('Delay Rate by Season')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('High Season', fontsize=12)\n",
 "plt.xticks(rotation=90)\n",
- "plt.ylim(0,6)\n",
 "plt.show()"
 ]
 },
 {
@@ -472,13 +469,12 @@
 "flight_type_rate = get_rate_from_column(data, 'TIPOVUELO')\n",
 "flight_type_rate_values = data['TIPOVUELO'].value_counts().index\n",
- "plt.figure(figsize = (5, 2))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot(flight_type_rate_values, flight_type_rate['Tasa (%)'])\n",
+ "plt.figure(figsize=(5, 2))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=flight_type_rate_values, y=flight_type_rate['Tasa (%)'])\n",
 "plt.title('Delay Rate by Flight Type')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('Flight Type', fontsize=12)\n",
- "plt.ylim(0,7)\n",
 "plt.show()"
 ]
 },
 {
@@ -490,13 +486,12 @@
 "period_day_rate = get_rate_from_column(data, 'period_day')\n",
 "period_day_rate_values = data['period_day'].value_counts().index\n",
- "plt.figure(figsize = (5, 2))\n",
- "sns.set(style=\"darkgrid\")\n",
- "sns.barplot(period_day_rate_values, period_day_rate['Tasa (%)'])\n",
+ "plt.figure(figsize=(5, 2))\n",
+ "sns.set_theme(style=\"darkgrid\")\n",
+ "sns.barplot(x=period_day_rate_values, y=period_day_rate['Tasa (%)'])\n",
 "plt.title('Delay Rate by Period of Day')\n",
 "plt.ylabel('Delay Rate [%]', fontsize=12)\n",
 "plt.xlabel('Period', fontsize=12)\n",
- "plt.ylim(3,7)\n",
 "plt.show()"
 ]
 },
 {
@@ -523,19 +518,9 @@
 "outputs": [],
 "source": [
 "from sklearn.model_selection import train_test_split\n",
- "from sklearn.utils import shuffle\n",
 "from sklearn.metrics import confusion_matrix, classification_report"
 ]
 },
-{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "training_data = shuffle(data[['OPERA', 'MES', 'TIPOVUELO', 'SIGLADES', 'DIANOM', 'delay']], random_state = 111)"
- ]
-},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -543,10 +528,10 @@
 "outputs": [],
 "source": [
 "features = pd.concat([\n",
- " pd.get_dummies(data['OPERA'], prefix = 'OPERA'),\n",
- " pd.get_dummies(data['TIPOVUELO'], prefix = 'TIPOVUELO'), \n",
- " pd.get_dummies(data['MES'], prefix = 'MES')], \n",
- " axis = 1\n",
+ " pd.get_dummies(data['OPERA'], prefix='OPERA'),\n",
+ " pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'), \n",
+ " pd.get_dummies(data['MES'], prefix='MES')], \n",
+ " axis=1\n",
 ")\n",
 "target = data['delay']"
 ]
 },
 {
@@ -557,7 +542,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.33, random_state = 42)"
+ "x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)"
 ]
 },
 {
@@ -575,7 +560,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "y_train.value_counts('%')*100"
+ "y_train.value_counts(normalize=True) * 100"
 ]
 },
 {
@@ -584,7 +569,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "y_test.value_counts('%')*100"
+ "y_test.value_counts(normalize=True) * 100"
 ]
 },
 {
@@ -726,7 +711,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "plt.figure(figsize = (10,5))\n",
+ "plt.figure(figsize=(10,5))\n",
 "plot_importance(xgb_model)"
 ]
 },
 {
@@ -766,7 +751,7 @@
 "source": [
 "n_y0 = len(y_train[y_train == 0])\n",
 "n_y1 = len(y_train[y_train == 1])\n",
- "scale = n_y0/n_y1\n",
+ "scale = n_y0 / n_y1\n",
 "print(scale)"
 ]
 },
 {
@@ -792,7 +777,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "x_train2, x_test2, y_train2, y_test2 = train_test_split(features[top_10_features], target, test_size = 0.33, random_state = 42)"
+ "x_train2, x_test2, y_train2, y_test2 = train_test_split(features[top_10_features], target, test_size=0.33, random_state=42)"
 ]
 },
 {
@@ -817,7 +802,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "xgb_model_2 = xgb.XGBClassifier(random_state=1, learning_rate=0.01, scale_pos_weight = scale)\n",
+ "xgb_model_2 = xgb.XGBClassifier(random_state=1, learning_rate=0.01, scale_pos_weight=scale)\n",
 "xgb_model_2.fit(x_train2, y_train2)"
 ]
 },
 {
@@ -1027,7 +1012,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "version": "3.12.4"
 },
 "orig_nbformat": 4
 },
diff --git a/challenge/model.py b/challenge/model.py
index 173ac6c..1143620 100644
--- a/challenge/model.py
+++ b/challenge/model.py
@@ -1,19 +1,70 @@
+import pickle
+
+import numpy as np
 import pandas as pd
+import xgboost as xgb
 
+from datetime import datetime
 from typing import Tuple, Union, List
 
+from sklearn.model_selection import train_test_split
+
+
+THRESHOLD_IN_MINUTES = 15
+MODEL_FILE_NAME = "delay_model.pkl"
+
+
 class DelayModel:
 
     def __init__(
         self
    ):
-        self._model = None # Model should be saved in this attribute.
+        self._features = [
+            "OPERA_Latin American Wings",
+            "MES_7",
+            "MES_10",
+            "OPERA_Grupo LATAM",
+            "MES_12",
+            "TIPOVUELO_I",
+            "MES_4",
+            "MES_11",
+            "OPERA_Sky Airline",
+            "OPERA_Copa Air"
+        ]
+        self._model = self.__load_model(MODEL_FILE_NAME)
+
+    def __load_model(self, file_name):
+        try:
+            with open(file_name, 'rb') as fp:
+                return pickle.load(fp)
+        except FileNotFoundError:
+            return None
+
+    def __save_model(self, filename):
+        with open(filename, 'wb') as fp:
+            pickle.dump(self._model, fp)
+
+    def get_min_diff(self, data):
+        """
+        Calculate the minute difference between two datetime values.
+
+        Args:
+            data (pd.Series): row of the raw data with 'Fecha-O' and 'Fecha-I'.
+
+        Returns:
+            float: minute difference between 'Fecha-O' and 'Fecha-I'.
+        """
+
+        fecha_o = datetime.strptime(data['Fecha-O'], '%Y-%m-%d %H:%M:%S')
+        fecha_i = datetime.strptime(data['Fecha-I'], '%Y-%m-%d %H:%M:%S')
+        min_diff = ((fecha_o - fecha_i).total_seconds()) / 60
+        return min_diff
 
     def preprocess(
         self,
         data: pd.DataFrame,
         target_column: str = None
-    ) -> Union(Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame):
+    ) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
         """
         Prepare raw data for training or predict.
@@ -26,7 +77,28 @@ def preprocess(
             or
             pd.DataFrame: features.
         """
-        return
+
+        features = pd.concat([
+            pd.get_dummies(data['OPERA'], prefix='OPERA'),
+            pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'),
+            pd.get_dummies(data['MES'], prefix='MES')],
+            axis=1
+        )
+
+        for feature in self._features:
+            if feature not in features.columns:
+                features[feature] = 0
+
+        if target_column:
+            data['min_diff'] = data.apply(self.get_min_diff, axis=1)
+
+            data[target_column] = np.where(
+                data['min_diff'] > THRESHOLD_IN_MINUTES, 1, 0
+            )
+
+            return features[self._features], data[[target_column]]
+        else:
+            return features[self._features]
 
     def fit(
         self,
@@ -40,7 +112,21 @@ def fit(
             features (pd.DataFrame): preprocessed data.
             target (pd.DataFrame): target.
         """
-        return
+
+        x_train, _, y_train, _ = train_test_split(
+            features, target, test_size=0.33, random_state=42
+        )
+
+        n_y0 = int((target == 0).sum())
+        n_y1 = int((target == 1).sum())
+        scale = n_y0 / n_y1
+
+        self._model = xgb.XGBClassifier(
+            random_state=1, learning_rate=0.01, scale_pos_weight=scale
+        )
+
+        self._model.fit(x_train, y_train)
+        self.__save_model(MODEL_FILE_NAME)
 
     def predict(
         self,
@@ -51,8 +137,14 @@ def predict(
         Args:
             features (pd.DataFrame): preprocessed data.
-
+
         Returns:
             (List[int]): predicted targets.
         """
-        return
\ No newline at end of file
+
+        if self._model is None:
+            self._model = self.__load_model(MODEL_FILE_NAME)
+
+        predictions = self._model.predict(features)
+
+        return predictions.tolist()
diff --git a/docs/challenge.md b/docs/challenge.md
index e69de29..46a11ef 100644
--- a/docs/challenge.md
+++ b/docs/challenge.md
@@ -0,0 +1,66 @@
+# Part I
+## Bugs fixed on `exploration.ipynb`
+- The function `is_high_season` had an issue: it did not consider the time of day. For example, `is_high_season("2017-12-31 14:55:00")` returned `0` when it should return `1`.
+- All calls to `sns.barplot` passed the data positionally instead of through the `x` and `y` keyword arguments that recent seaborn versions require.
+- To correctly show the delay rate, the method `get_rate_from_column` was updated. Instead of calculating `rates[name] = round(total / delays[name], 2)`, I think it's best to do `rates[name] = round(100 * delays[name] / total, 2)`: the ratio of delayed flights to the total number of flights for a specific column value. This value is now between 0 and 100, where `0` indicates that no flights with that column value were delayed and `100` indicates that all of them were. After this change, the visualization code was updated as well to stop clipping the y-axis.
+- `training_data` was defined but never used. This cell was deleted.
+- Minor style changes were applied, such as normalizing the spaces around operators and in keyword arguments.
+- `xgboost` was not included under the installed dependencies.
+
+## Model pick
+When comparing the models' performance, I focus on the positive (minority) class, since it is the class that represents delays and the model is intended to predict the probability of **delay**. The metric is the F1-score for the positive class: it combines precision and recall into a single number, balancing false positives against false negatives while accounting for the class imbalance. The results for each model:
+1. **XGBoost**: 0.00
+2. **Logistic Regression**: 0.06
+3. **XGBoost with Feature Importance and with Balance**: 0.37
+4. **XGBoost with Feature Importance but without Balance**: 0.01
+5. **Logistic Regression with Feature Importance and with Balance**: 0.36
+6. **Logistic Regression with Feature Importance but without Balance**: 0.03
+
+With this in mind, the model picked was the third one: **XGBoost with Feature Importance and with Balance**. XGBoost is an ensemble method that can capture complex relationships in the data and is better suited to non-linear patterns and interactions between features. It is also highly scalable and effective on large datasets, where Logistic Regression may struggle.
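+
+As a reference for how these numbers were produced, here is a minimal sketch of training and scoring the selected configuration. It assumes `x_train2`/`x_test2`/`y_train2`/`y_test2` and `scale` from the notebook's top-10-features split; the per-class F1-scores quoted above are read from the classification report.
+
+```python
+import xgboost as xgb
+from sklearn.metrics import classification_report
+
+# Compensate for the class imbalance: scale_pos_weight is the ratio of
+# on-time (0) to delayed (1) flights in the training target.
+model = xgb.XGBClassifier(random_state=1, learning_rate=0.01, scale_pos_weight=scale)
+model.fit(x_train2, y_train2)
+
+# The report includes precision, recall and F1-score for both classes.
+print(classification_report(y_test2, model.predict(x_test2)))
+```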
""" - return + + features = pd.concat([ + pd.get_dummies(data['OPERA'], prefix='OPERA'), + pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'), + pd.get_dummies(data['MES'], prefix='MES')], + axis=1 + ) + + for feature in self._features: + if feature not in features.columns: + features[feature] = 0 + + if target_column: + data['min_diff'] = data.apply(self.get_min_diff, axis=1) + + data[target_column] = np.where( + data['min_diff'] > THRESHOLD_IN_MINUTES, 1, 0 + ) + + return features[self._features], data[[target_column]] + else: + return features[self._features] def fit( self, @@ -40,7 +112,21 @@ def fit( features (pd.DataFrame): preprocessed data. target (pd.DataFrame): target. """ - return + + x_train, _, y_train, _ = train_test_split( + features, target, test_size=0.33, random_state=42 + ) + + n_y0 = int((target == 0).sum()) + n_y1 = int((target == 1).sum()) + scale = n_y0 / n_y1 + + self._model = xgb.XGBClassifier( + random_state=1, learning_rate=0.01, scale_pos_weight=scale + ) + + self._model.fit(x_train, y_train) + self.__save_model(MODEL_FILE_NAME) def predict( self, @@ -51,8 +137,14 @@ def predict( Args: features (pd.DataFrame): preprocessed data. - + Returns: (List[int]): predicted targets. """ - return \ No newline at end of file + + if self._model is None: + self.__load_model(MODEL_FILE_NAME) + + predictions = self._model.predict(features) + + return predictions.tolist() diff --git a/docs/challenge.md b/docs/challenge.md index e69de29..46a11ef 100644 --- a/docs/challenge.md +++ b/docs/challenge.md @@ -0,0 +1,41 @@ +# Part I +## Bugs fixed on `exploration.ipynb` +- The function `is_high_season` had an issue since it didn't consider the time. For example, `is_high_season("2017-12-31 14:55:00")` returned `0` when in reality it should return `1`. +- All calls to `sns.barplot` were missing the `x` and `y` definition. +- To improve visualizations and correctly show the delay rate, the method `get_rate_from_column` was updated. Instead of calculating `rates[name] = round(total / delays[name], 2)`, I think it's best to do `rates[name] = round(100 * delays[name] / total, 2)` to get the ratio of delayed flights to the total number of flights (for a specific column value). This value is now between 0 and 100, where `0` indicates that no flights with that specific column value were delayed and `100` indicates that all flights with that specific column value were delayed. After implementing this change, visualization code had to be updated as well to avoid limiting the y-axis. +- `training_data` was defined but never used. This cell was deleted. +- Minor style changes were applied, such as using spaces around operators and removing spaces when defining the value of certain method arguments. +- `xgboost` was not included under installed dependencies. + +## Model pick +When comparing the different models' performances, I want to focus on the positive (minority) class since that's the class that represents delays and the model is intended to predict the probability of **delay**. For this, I'll focus on the F1-score for the positive class, since it combines precision and recall into a single metric, providing a balance between the two and accounting for false positives and false negatives. It offers a consolidated evaluation of the model's performance in predicting the positive class while factoring in the class imbalance. Let's review the results for each model: +1. **XGBoost**: 0.00 +2. **Logistic Regression**: 0.06 +3. **XGBoost with Feature Importance and with Balance**: 0.37 +4. 
+
+## Bugs fixed on `test_model.py`
+- Data could not be loaded because the path was incorrect. After changing `"../data/data.csv"` to `"./data/data.csv"` it worked as expected.
+
+# Part III
+The API was deployed by building a Docker image and pushing it to Artifact Registry (GCP). The container was then deployed to Cloud Run.
+
+# Part IV
+The Continuous Integration workflow runs whenever there is a push or pull request to the main/develop branches.
+The Continuous Deployment workflow runs whenever there is a push or pull request to the main/release branches.
+The following secrets were set directly as Repository Secrets on GitHub:
+- GCP_CREDENTIALS: content of GCP's `credentials.json`.
+- GCP_IMAGE_NAME: name of the image pushed to Artifact Registry on GCP.
+- GCP_PROJECT_ID: name of the project used on GCP (e.g. 'project-id').
+- GCP_REGION: region used on GCP (e.g. 'us-central1').
+- MODEL_VERSION: name of the GCS model that will be downloaded and served (e.g. 'v0').
diff --git a/requirements-test.txt b/requirements-test.txt
index 2753f60..91f2e79 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,4 +1,4 @@
-locust~=1.6
+locust~=2.29.1
 coverage~=5.5
 pytest~=6.2.5
 pytest-cov~=2.12.1
diff --git a/requirements.txt b/requirements.txt
index 64fde77..0cb3b96 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
-fastapi~=0.86.0
-pydantic~=1.10.2
-uvicorn~=0.15.0
-numpy~=1.22.4
+fastapi~=0.111.0
+pydantic~=1.10.17
+uvicorn~=0.30.1
+numpy~=1.26.4
 pandas~=1.3.5
 scikit-learn~=1.3.0
+xgboost~=2.1.0
diff --git a/tests/model/test_model.py b/tests/model/test_model.py
index e4afabb..97b28e1 100644
--- a/tests/model/test_model.py
+++ b/tests/model/test_model.py
@@ -28,7 +28,7 @@ class TestModel(unittest.TestCase):
     def setUp(self) -> None:
         super().setUp()
         self.model = DelayModel()
-        self.data = pd.read_csv(filepath_or_buffer="../data/data.csv")
+        self.data = pd.read_csv(filepath_or_buffer="./data/data.csv")
 
     def test_model_preprocess_for_training(