Merge pull request #5 from tryolabs/feature-cicd-pipelines
Create and configure CI/CD pipelines
rgallardone authored Jun 28, 2024
2 parents fcd919d + 31de9ec commit 53ad125
Showing 14 changed files with 233 additions and 72 deletions.
12 changes: 12 additions & 0 deletions .flake8
@@ -0,0 +1,12 @@
[flake8]
max-line-length = 100
exclude = .venv
ignore =
    # disable, whitespace before ':' https://github.com/TCLResearchEurope/ctw-pre-commit-hooks/issues/3
    E203,
    # disable "too many leading '#' for block comment"
    E266,
    # disable "line break before binary operator" which clashes with black
    W503,
    # disable "invalid escape sequence"
    W605,
86 changes: 86 additions & 0 deletions .github/workflows/cd.yml
@@ -0,0 +1,86 @@
name: 'Continuous Deployment'

on:
  push:
    branches:
      - main
      - develop
      - release/*

jobs:
  deployment:
    runs-on: ubuntu-latest
    environment: dev
    env:
      branch: main

    steps:
      - uses: actions/checkout@v4

      - name: Get the branch name
        id: get_branch_name
        run: |
          echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Authenticate to GCP
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: '${{ secrets.CD_SA_KEYS }}'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt -r requirements-dev.txt

      - name: Run training script
        run: |
          python train.py

      - name: Authenticate Docker to GAR
        uses: docker/login-action@v3
        with:
          registry: '${{ vars.GCP_REGION }}-docker.pkg.dev'
          username: _json_key
          password: ${{ secrets.CD_SA_KEYS }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          push: true
          tags: '${{ vars.GAR_REPOSITORY }}/${{ vars.GAR_IMAGE_NAME }}-${{ steps.get_branch_name.outputs.branch }}'

      - name: Deploy the service to Cloud Run
        id: 'deploy'
        uses: 'google-github-actions/deploy-cloudrun@v2'
        with:
          service: '${{ vars.GCR_SERVICE_NAME }}-${{ steps.get_branch_name.outputs.branch }}'
          image: '${{ vars.GAR_REPOSITORY }}/${{ vars.GAR_IMAGE_NAME }}-${{ steps.get_branch_name.outputs.branch }}'
          region: '${{ vars.GCP_REGION }}'
          flags: '--allow-unauthenticated'

    outputs:
      service_url: ${{ steps.deploy.outputs.url }}

  stress_test:
    runs-on: ubuntu-latest
    needs: deployment

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          pip install -r requirements-test.txt

      - name: Run stress test
        run: |
          make stress-test API_URL=${{ needs.deployment.outputs.service_url }}
34 changes: 34 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,34 @@
name: 'Continuous Integration'

on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt -r requirements-dev.txt -r requirements-test.txt

      - name: Check format with black
        run: black --check --extend-exclude tests .

      - name: Check style with flake8
        run: flake8 --extend-exclude tests .

      - name: Check import sorting with isort
        run: isort --check --extend-skip tests .

      - name: Run model tests
        run: make model-test

      - name: Run API tests
        run: make api-test
3 changes: 1 addition & 2 deletions .gitignore
@@ -161,5 +161,4 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-reports
-*.pkl
+reports
\ No newline at end of file
2 changes: 1 addition & 1 deletion Makefile
@@ -23,7 +23,7 @@ install: ## Install dependencies
 	pip install -r requirements-test.txt
 	pip install -r requirements.txt
 
-STRESS_URL = https://delay-model-dpmrk4cwxq-uw.a.run.app
+STRESS_URL = $(API_URL)
 .PHONY: stress-test
 stress-test:
 	# change stress url to your deployed app
2 changes: 1 addition & 1 deletion challenge/__init__.py
@@ -1,3 +1,3 @@
 from challenge.api import app
 
-application = app
\ No newline at end of file
+application = app
8 changes: 4 additions & 4 deletions challenge/model.py
@@ -1,9 +1,9 @@
-import pandas as pd
-
-from typing import Tuple, Union, List
-from sklearn.linear_model import LogisticRegression
 from datetime import datetime
 from pickle import dump, load
+from typing import List, Tuple, Union
+
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
 
 
 class DataError(Exception):
Binary file added challenge/tmp/model_checkpoint.pkl
Binary file not shown.
40 changes: 40 additions & 0 deletions docs/challenge.md
@@ -122,3 +122,43 @@ The API is deployed as a Cloud Run Service that exposes a public endpoint. In or…
After the deployment is completed, the API is available at https://delay-model-dpmrk4cwxq-uw.a.run.app, and the prediction endpoint is available at https://delay-model-dpmrk4cwxq-uw.a.run.app/predict. We can test the service using Postman or run the provided stress test.
The results of the stress test are an error rate of 0%, an average response time of 343ms, a maximum response time of 743ms, and a throughput of 87.69 requests per second.

## CI/CD Pipeline

In this final step, the goal is to set up a proper CI/CD pipeline.

The Continuous Integration (CI) workflow focuses on running the tests and assessing the quality of the code each time there is a push to the repository, with the goal of detecting bugs earlier, correcting code faster and ensuring good code quality practices.

The Continuous Deployment (CD) workflow focuses on training the model, deploying the API and running the stress test against it. This workflow only runs when there is a push to the `main`, `develop` or `release` branches.

Let's describe each workflow in more detail.

### Continuous Integration

The goals of this workflow are checking the code quality and testing it. For the first goal, the code is checked with `black`, `flake8` and `isort` to ensure that the style and format are correct and fit the repository standards. For the second goal, the provided test suites (`model-test` and `api-test`) are run to ensure that changes to the code don't break the functionality of the `DelayModel` class or the API. The same checks can be reproduced locally, as shown in the sketch below.
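
A minimal sketch of running these checks locally, assuming the dependencies from `requirements-dev.txt` and `requirements-test.txt` are installed; the commands simply mirror the steps in `ci.yml`:

```sh
# Style and format checks, as run by the CI workflow
black --check --extend-exclude tests .
flake8 --extend-exclude tests .
isort --check --extend-skip tests .

# Test suites, via the Makefile targets used in CI
make model-test
make api-test
```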

Important decisions made on this step:

* The test suites require a trained model to be available for testing purposes. However, these test suites run on GitHub-hosted workers and don't have access to local models. To circumvent this, the model checkpoint is tracked with Git and uploaded to the remote. This is not desirable, since models can grow rapidly in size and managing them inside the repository can become a problem. The ideal solution would be to maintain a proper Model Registry, with remote storage and sound version management, so that trained models can be uploaded to it and downloaded for testing or deployment. Due to time restrictions, and since the model checkpoint is lightweight in this case, we decided to track the model in the repository.
* The `model-test` suite had to be modified due to an error. The path to the data file in the suite was `../data/data.csv`, which assumed that the tests were run from inside the `tests/` directory, but tests should actually be run from the project root folder, where the `Makefile` is. To fix this, we changed the path to `data/data.csv`. With this change, the tests run correctly and can be triggered from the GitHub Actions workflow.

### Continuous Deployment

The goal of this workflow is to train the model, build the Docker image with it, and deploy it to a Cloud Run service. This workflow only runs when there is a push to the `main`, `develop` or `release` branches, and it deploys a different API for each of them (for example, a push to `develop` deploys the `delay-model-develop` service). The reasoning is that separate deployments for the different stages of feature and release development make it possible to test how changes affect the deployment, while keeping the `main` API intact and serving only released code.

Here are the most important steps taken to develop this workflow:

* A small and simple training script (`train.py`) was created so that the GitHub Action trains the model before deploying it. This training script uses all the available data, preprocesses it, trains the model and writes it to the location from which the Dockerfile copies the model into the Docker image. This is a simplification of a real scenario. Ideally, the data would be stored remotely and we would have separate remote jobs for preprocessing the data, training the model and uploading it to a Model Registry. These remote jobs could be triggered by the same events that trigger this workflow, but none of the preprocessing or training would run inside the GitHub Action synchronously.
* A GCP Service Account `cd-pipeline-sa` was created to grant the GitHub Actions runner permissions to push the Docker image to the Artifact Registry repository and to deploy the Cloud Run service (see the `gcloud` sketch after this list). The roles given to this SA are:
    - `Artifact Registry Writer`: enables the SA to push Docker images to the Artifact Registry repositories
    - `Cloud Run Admin`: gives the SA full control over the deployed Cloud Run services
    - `Service Account User`: gives the SA the necessary permissions to act as the default Cloud Run service account. This permission is needed for deploying from the GitHub Action.

  We created a single SA for simplicity, since we only use it in a single workflow. Ideally, we would have multiple SAs, each with more granular and reduced permissions; for example, we could have a "Cloud Run SA" which only has control over the services and nothing else, and a separate "Artifact Registry SA" which only has access to the repository.
* A `dev` environment was created on the GitHub repository, containing various configuration variables (mostly names used throughout the GCP deployment) and secrets (the key to access the SA `cd-pipeline-sa`); see the `gh` CLI sketch after this list. The created configuration variables are:
    - `GAR_IMAGE_NAME=delay-model-api`
    - `GAR_REPOSITORY=us-west1-docker.pkg.dev/rodrigo-tryolabs-latam/delay-model-service`
    - `GCP_PROJECT_ID=rodrigo-tryolabs-latam`
    - `GCP_REGION=us-west1`
    - `GCR_SERVICE_NAME=delay-model`
* After the deployment of the service, the stress tests run against the deployed API. As mentioned, different APIs are deployed depending on the branch. To point the stress test script to the correct API, a small modification to the `Makefile` was needed, so that the URL of the API is passed as an argument to the `make stress-test` command. The final command is `make stress-test API_URL=<api-url>`.
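
A minimal sketch of how `cd-pipeline-sa` could be provisioned with the `gcloud` CLI. These commands are not part of this PR and the key file name is an assumption; the role IDs are the standard identifiers for the role names listed above:

```sh
# Create the service account used by the CD pipeline
gcloud iam service-accounts create cd-pipeline-sa --project=rodrigo-tryolabs-latam

# Grant the three roles described above
for role in roles/artifactregistry.writer roles/run.admin roles/iam.serviceAccountUser; do
    gcloud projects add-iam-policy-binding rodrigo-tryolabs-latam \
        --member="serviceAccount:cd-pipeline-sa@rodrigo-tryolabs-latam.iam.gserviceaccount.com" \
        --role="$role"
done

# Export a JSON key to store in the CD_SA_KEYS secret (file name is hypothetical)
gcloud iam service-accounts keys create cd-sa-key.json \
    --iam-account=cd-pipeline-sa@rodrigo-tryolabs-latam.iam.gserviceaccount.com
```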
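
Similarly, a sketch of how the `dev` environment could be populated with the GitHub CLI, assuming the environment itself already exists on the repository and reusing the hypothetical key file from the sketch above:

```sh
# Configuration variables on the `dev` environment
gh variable set GAR_IMAGE_NAME --env dev --body "delay-model-api"
gh variable set GAR_REPOSITORY --env dev --body "us-west1-docker.pkg.dev/rodrigo-tryolabs-latam/delay-model-service"
gh variable set GCP_PROJECT_ID --env dev --body "rodrigo-tryolabs-latam"
gh variable set GCP_REGION --env dev --body "us-west1"
gh variable set GCR_SERVICE_NAME --env dev --body "delay-model"

# The service account key, stored as an environment secret
gh secret set CD_SA_KEYS --env dev < cd-sa-key.json
```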
4 changes: 3 additions & 1 deletion requirements-test.txt
@@ -3,4 +3,6 @@ coverage~=5.5
 pytest~=6.2.5
 pytest-cov~=2.12.1
 mockito~=1.2.2
-flask>=2.2.2
+jinja2==3.0.3
+itsdangerous==2.0.1
+werkzeug==2.0.3
73 changes: 24 additions & 49 deletions tests/model/test_model.py
@@ -5,10 +5,11 @@
 from sklearn.model_selection import train_test_split
 from challenge.model import DelayModel
 
+
 class TestModel(unittest.TestCase):
 
     FEATURES_COLS = [
-        "OPERA_Latin American Wings",
+        "OPERA_Latin American Wings",
         "MES_7",
         "MES_10",
         "OPERA_Grupo LATAM",
@@ -17,27 +18,18 @@ class TestModel(unittest.TestCase):
         "MES_4",
         "MES_11",
         "OPERA_Sky Airline",
-        "OPERA_Copa Air"
-    ]
-
-    TARGET_COL = [
-        "delay"
-    ]
+        "OPERA_Copa Air",
+    ]
+
+    TARGET_COL = ["delay"]
 
     def setUp(self) -> None:
         super().setUp()
         self.model = DelayModel()
-        self.data = pd.read_csv(filepath_or_buffer="../data/data.csv")
-
-
-    def test_model_preprocess_for_training(
-        self
-    ):
-        features, target = self.model.preprocess(
-            data=self.data,
-            target_column="delay"
-        )
+        self.data = pd.read_csv(filepath_or_buffer="data/data.csv")
+
+    def test_model_preprocess_for_training(self):
+        features, target = self.model.preprocess(data=self.data, target_column="delay")
 
         assert isinstance(features, pd.DataFrame)
         assert features.shape[1] == len(self.FEATURES_COLS)
@@ -47,57 +39,40 @@ def test_model_preprocess_for_training(
         assert target.shape[1] == len(self.TARGET_COL)
         assert set(target.columns) == set(self.TARGET_COL)
 
-
-    def test_model_preprocess_for_serving(
-        self
-    ):
-        features = self.model.preprocess(
-            data=self.data
-        )
+    def test_model_preprocess_for_serving(self):
+        features = self.model.preprocess(data=self.data)
 
         assert isinstance(features, pd.DataFrame)
         assert features.shape[1] == len(self.FEATURES_COLS)
         assert set(features.columns) == set(self.FEATURES_COLS)
 
+    def test_model_fit(self):
+        features, target = self.model.preprocess(data=self.data, target_column="delay")
 
-    def test_model_fit(
-        self
-    ):
-        features, target = self.model.preprocess(
-            data=self.data,
-            target_column="delay"
+        _, features_validation, _, target_validation = train_test_split(
+            features, target, test_size=0.33, random_state=42
        )
 
-        _, features_validation, _, target_validation = train_test_split(features, target, test_size = 0.33, random_state = 42)
+        self.model.fit(features=features, target=target)
 
-        self.model.fit(
-            features=features,
-            target=target
-        )
+        predicted_target = self.model._model.predict(features_validation)
 
-        predicted_target = self.model._model.predict(
-            features_validation
+        report = classification_report(
+            target_validation, predicted_target, output_dict=True
         )
 
-        report = classification_report(target_validation, predicted_target, output_dict=True)
 
         assert report["0"]["recall"] < 0.60
         assert report["0"]["f1-score"] < 0.70
         assert report["1"]["recall"] > 0.60
         assert report["1"]["f1-score"] > 0.30
 
+    def test_model_predict(self):
+        features = self.model.preprocess(data=self.data)
 
-    def test_model_predict(
-        self
-    ):
-        features = self.model.preprocess(
-            data=self.data
-        )
-
-        predicted_targets = self.model.predict(
-            features=features
-        )
+        predicted_targets = self.model.predict(features=features)
 
         assert isinstance(predicted_targets, list)
         assert len(predicted_targets) == features.shape[0]
-        assert all(isinstance(predicted_target, int) for predicted_target in predicted_targets)
+        assert all(
+            isinstance(predicted_target, int) for predicted_target in predicted_targets
+        )
27 changes: 27 additions & 0 deletions train.py
@@ -0,0 +1,27 @@
import pandas as pd

from challenge.model import DelayModel

print("Loading data...")
# Read the data
df = pd.read_csv("data/data.csv")
print("-> Data loaded")

# Create the model
model = DelayModel()

print("Preprocessing data...")
# Preprocess the data
X_train, y_train = model.preprocess(df, "delay")
print("-> Preprocessed data")


print("Training model...")
# Train the model
model.fit(X_train, y_train)
print("-> Model trained")

print("Saving model...")
# Store the model
model.save("challenge/tmp/model_checkpoint.pkl")
print("-> Model saved")
7 changes: 0 additions & 7 deletions workflows/cd.yml

This file was deleted.

7 changes: 0 additions & 7 deletions workflows/ci.yml

This file was deleted.
