diff --git a/.gitignore b/.gitignore
index 2bc18e4..9094443 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,4 +62,5 @@ tutorials/openstack/secure.yaml
# Others
old/
istio*
-temp/
\ No newline at end of file
+temp/
+.platform/
\ No newline at end of file
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 1b7e320..caa8276 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -2,7 +2,7 @@
This file lists all the individuals who have contributed to this project. Thanks to each and every one of you for your valuable contributions!
-### Silo AI:
+### [Silo AI](https://www.silo.ai/):
Original project leads and main developers:
@@ -40,12 +40,14 @@ Other developers and testers of the platform:
- Kristian Sikiric
- Kaustav Tamuly
- Jonathan Burdge
+- Ammar Aldhahyani
-### IML4E project and other contributors:
+### [IML4E](https://itea4.org/project/iml4e.html) project and other contributors:
-Univerwity of Helsinki:
+[University of Helsinki](https://www.helsinki.fi/en/researchgroups/empirical-software-engineering):
- Niila Siilasjoki
+- Dennis Muiruri
Fraunhofer Institute:
diff --git a/config.env b/config.env
index c0a7bdc..90f77b7 100644
--- a/config.env
+++ b/config.env
@@ -1,4 +1,2 @@
HOST_IP="127.0.0.1"
-CLUSTER_NAME="kind-ep"
-INSTALL_LOCAL_REGISTRY="true"
-INSTALL_RAY="false"
\ No newline at end of file
+CLUSTER_NAME="mlops-platform"
\ No newline at end of file
diff --git a/deployment/README.md b/deployment/README.md
index fb32127..49a2fff 100644
--- a/deployment/README.md
+++ b/deployment/README.md
@@ -1,7 +1,19 @@
## Deploy the stack
+Choose the deployment option that best fits your needs:
+1. `kubeflow-monitoring`: Full Kubeflow deployment with all components, including monitoring (prometheus, grafana).
+2. `kubeflow`: Full Kubeflow deployment without monitoring components (prometheus, grafana).
+3. `standalone-kfp-monitoring`: Standalone KFP deployment with monitoring components (prometheus, grafana).
+4. `standalone-kfp`: Standalone KFP deployment without monitoring components (prometheus, grafana).
+5. `standalone-kfp-kserve-monitoring`: Standalone KFP and Kserve deployment with monitoring components (prometheus, grafana).
+6. `standalone-kfp-kserve`: Standalone KFP and Kserve deployment without monitoring components (prometheus, grafana).
+
+Set your choice as an environment variable:
+```bash
+export DEPLOYMENT_OPTION=kubeflow-monitoring
+```
+
Deploy to your kubernetes cluster with the following command:
```bash
-while ! kustomize build deployment | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
+while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
```
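+
+While the resources are being applied, errors are expected until all dependent components become ready. A quick way to monitor progress from another terminal (assuming `kubectl` points at the new cluster) is:
+
+```bash
+# all pods should eventually be Running (or ContainerCreating while starting up)
+kubectl get pods --all-namespaces
+
+# inspect a specific pod if errors persist
+kubectl describe pod -n [NAMESPACE] [POD_NAME]
+```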
\ No newline at end of file
diff --git a/deployment/kubeflow-custom/kserve-sa.yaml b/deployment/custom/kserve-custom/base/kserve-sa.yaml
similarity index 83%
rename from deployment/kubeflow-custom/kserve-sa.yaml
rename to deployment/custom/kserve-custom/base/kserve-sa.yaml
index dd2d9cb..a300f81 100644
--- a/deployment/kubeflow-custom/kserve-sa.yaml
+++ b/deployment/custom/kserve-custom/base/kserve-sa.yaml
@@ -2,7 +2,6 @@ apiVersion: v1
kind: Secret
metadata:
name: mysecret
- namespace: kubeflow-user-example-com
annotations:
serving.kserve.io/s3-endpoint: mlflow-minio-service.mlflow.svc.cluster.local:9000
serving.kserve.io/s3-usehttps: "0"
@@ -15,6 +14,5 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: kserve-sa
- namespace: kubeflow-user-example-com
secrets:
- name: mysecret
diff --git a/deployment/kubeflow-custom/kustomization.yaml b/deployment/custom/kserve-custom/base/kustomization.yaml
similarity index 83%
rename from deployment/kubeflow-custom/kustomization.yaml
rename to deployment/custom/kserve-custom/base/kustomization.yaml
index 2bf776f..12b6241 100644
--- a/deployment/kubeflow-custom/kustomization.yaml
+++ b/deployment/custom/kserve-custom/base/kustomization.yaml
@@ -2,5 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
-- aws-secret.yaml
- kserve-sa.yaml
\ No newline at end of file
diff --git a/deployment/custom/kserve-custom/env/kubeflow/kustomization.yaml b/deployment/custom/kserve-custom/env/kubeflow/kustomization.yaml
new file mode 100644
index 0000000..0ad2c65
--- /dev/null
+++ b/deployment/custom/kserve-custom/env/kubeflow/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: kubeflow-user-example-com
+
+resources:
+- ../../base
diff --git a/deployment/custom/kserve-custom/env/standalone-kfp/kserve-inference-namespace.yaml b/deployment/custom/kserve-custom/env/standalone-kfp/kserve-inference-namespace.yaml
new file mode 100644
index 0000000..950c92b
--- /dev/null
+++ b/deployment/custom/kserve-custom/env/standalone-kfp/kserve-inference-namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: kserve-inference
\ No newline at end of file
diff --git a/deployment/custom/kserve-custom/env/standalone-kfp/kustomization.yaml b/deployment/custom/kserve-custom/env/standalone-kfp/kustomization.yaml
new file mode 100644
index 0000000..762483e
--- /dev/null
+++ b/deployment/custom/kserve-custom/env/standalone-kfp/kustomization.yaml
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: kserve-inference
+
+resources:
+ - ../../base
+ - kserve-inference-namespace.yaml
diff --git a/deployment/kubeflow-custom/aws-secret.yaml b/deployment/custom/kubeflow-custom/base/aws-secret.yaml
similarity index 86%
rename from deployment/kubeflow-custom/aws-secret.yaml
rename to deployment/custom/kubeflow-custom/base/aws-secret.yaml
index b63fb1e..7203f52 100644
--- a/deployment/kubeflow-custom/aws-secret.yaml
+++ b/deployment/custom/kubeflow-custom/base/aws-secret.yaml
@@ -2,7 +2,6 @@ apiVersion: v1
kind: Secret
metadata:
name: aws-secret
- namespace: kubeflow-user-example-com
type: Opaque
data:
# your BASE64 encoded AWS_ACCESS_KEY_ID
diff --git a/deployment/custom/kubeflow-custom/base/kustomization.yaml b/deployment/custom/kubeflow-custom/base/kustomization.yaml
new file mode 100644
index 0000000..e6ae779
--- /dev/null
+++ b/deployment/custom/kubeflow-custom/base/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- aws-secret.yaml
\ No newline at end of file
diff --git a/deployment/custom/kubeflow-custom/env/kubeflow/kustomization.yaml b/deployment/custom/kubeflow-custom/env/kubeflow/kustomization.yaml
new file mode 100644
index 0000000..0ad2c65
--- /dev/null
+++ b/deployment/custom/kubeflow-custom/env/kubeflow/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: kubeflow-user-example-com
+
+resources:
+- ../../base
diff --git a/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kserve-deployer.yaml b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kserve-deployer.yaml
new file mode 100644
index 0000000..0ac0071
--- /dev/null
+++ b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kserve-deployer.yaml
@@ -0,0 +1,110 @@
+# Required for the deploy-model step to have the necessary permissions to create inference services
+
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: kserve-deployer
+rules:
+ - verbs:
+ - '*'
+ apiGroups:
+ - ''
+ resources:
+ - secrets
+ - serviceaccounts
+ - verbs:
+ - get
+ - watch
+ - list
+ apiGroups:
+ - ''
+ resources:
+ - configmaps
+ - verbs:
+ - '*'
+ apiGroups:
+ - ''
+ resources:
+ - persistentvolumes
+ - persistentvolumeclaims
+ - verbs:
+ - create
+ - delete
+ - get
+ apiGroups:
+ - snapshot.storage.k8s.io
+ resources:
+ - volumesnapshots
+ - verbs:
+ - get
+ - list
+ - watch
+ - update
+ - patch
+ apiGroups:
+ - argoproj.io
+ resources:
+ - workflows
+ - verbs:
+ - '*'
+ apiGroups:
+ - ''
+ resources:
+ - pods
+ - pods/exec
+ - pods/log
+ - services
+ - verbs:
+ - '*'
+ apiGroups:
+ - ''
+ - apps
+ - extensions
+ resources:
+ - deployments
+ - replicasets
+ - verbs:
+ - '*'
+ apiGroups:
+ - kubeflow.org
+ resources:
+ - '*'
+ - verbs:
+ - '*'
+ apiGroups:
+ - batch
+ resources:
+ - jobs
+ - verbs:
+ - '*'
+ apiGroups:
+ - machinelearning.seldon.io
+ resources:
+ - seldondeployments
+ - verbs:
+ - '*'
+ apiGroups:
+ - serving.kserve.io
+ resources:
+ - '*'
+ - verbs:
+ - '*'
+ apiGroups:
+ - networking.istio.io
+ resources:
+ - '*'
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: pipeline-runner-binding-cluster
+ labels:
+ application-crd-id: kubeflow-pipelines
+subjects:
+ - kind: ServiceAccount
+ name: pipeline-runner
+ namespace: kubeflow
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: kserve-deployer
\ No newline at end of file
diff --git a/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kustomization.yaml b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kustomization.yaml
new file mode 100644
index 0000000..1eec75e
--- /dev/null
+++ b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kustomization.yaml
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: kubeflow
+
+resources:
+ - ../../base
+ - kserve-deployer.yaml
diff --git a/deployment/custom/kubeflow-custom/env/standalone-kfp/kustomization.yaml b/deployment/custom/kubeflow-custom/env/standalone-kfp/kustomization.yaml
new file mode 100644
index 0000000..1241abe
--- /dev/null
+++ b/deployment/custom/kubeflow-custom/env/standalone-kfp/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: kubeflow
+
+resources:
+ - ../../base
diff --git a/deployment/envs/kubeflow-monitoring/kustomization.yaml b/deployment/envs/kubeflow-monitoring/kustomization.yaml
new file mode 100644
index 0000000..3e7c6ce
--- /dev/null
+++ b/deployment/envs/kubeflow-monitoring/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../kubeflow/manifests/in-cluster-setup/kubeflow
+- ../../custom/kubeflow-custom/env/kubeflow
+- ../../custom/kserve-custom/env/kubeflow
+- ../../mlflow/env/local
+- ../../monitoring
\ No newline at end of file
diff --git a/deployment/envs/kubeflow/kustomization.yaml b/deployment/envs/kubeflow/kustomization.yaml
new file mode 100644
index 0000000..c4c1ca5
--- /dev/null
+++ b/deployment/envs/kubeflow/kustomization.yaml
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../kubeflow/manifests/in-cluster-setup/kubeflow
+- ../../custom/kubeflow-custom/env/kubeflow
+- ../../custom/kserve-custom/env/kubeflow
+- ../../mlflow/env/local
\ No newline at end of file
diff --git a/deployment/envs/standalone-kfp-kserve-monitoring/kustomization.yaml b/deployment/envs/standalone-kfp-kserve-monitoring/kustomization.yaml
new file mode 100644
index 0000000..5d27568
--- /dev/null
+++ b/deployment/envs/standalone-kfp-kserve-monitoring/kustomization.yaml
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve
+- ../../custom/kubeflow-custom/env/standalone-kfp-kserve
+- ../../custom/kserve-custom/env/standalone-kfp
+- ../../mlflow/env/local
+- ../../monitoring
\ No newline at end of file
diff --git a/deployment/envs/standalone-kfp-kserve/kustomization.yaml b/deployment/envs/standalone-kfp-kserve/kustomization.yaml
new file mode 100644
index 0000000..e757443
--- /dev/null
+++ b/deployment/envs/standalone-kfp-kserve/kustomization.yaml
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve
+- ../../custom/kubeflow-custom/env/standalone-kfp-kserve
+- ../../custom/kserve-custom/env/standalone-kfp
+- ../../mlflow/env/local
\ No newline at end of file
diff --git a/deployment/envs/standalone-kfp-monitoring/kustomization.yaml b/deployment/envs/standalone-kfp-monitoring/kustomization.yaml
new file mode 100644
index 0000000..696c68b
--- /dev/null
+++ b/deployment/envs/standalone-kfp-monitoring/kustomization.yaml
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp
+- ../../custom/kubeflow-custom/env/standalone-kfp
+- ../../mlflow/env/local
+- ../../monitoring
\ No newline at end of file
diff --git a/deployment/envs/standalone-kfp/kustomization.yaml b/deployment/envs/standalone-kfp/kustomization.yaml
new file mode 100644
index 0000000..16cc703
--- /dev/null
+++ b/deployment/envs/standalone-kfp/kustomization.yaml
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp
+- ../../custom/kubeflow-custom/env/standalone-kfp
+- ../../mlflow/env/local
\ No newline at end of file
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/README.md b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/README.md
new file mode 100644
index 0000000..8dc1847
--- /dev/null
+++ b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/README.md
@@ -0,0 +1,10 @@
+# Kubeflow
+
+Components:
+- Multiuser isolation
+- Central Dashboard
+- Jupyter Notebooks
+- Kubeflow Pipelines (KFP)
+- Kserve
+- Katib
+- TensorBoard
\ No newline at end of file
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/kustomization.yaml
new file mode 100644
index 0000000..d3c11cf
--- /dev/null
+++ b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/kustomization.yaml
@@ -0,0 +1,87 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+sortOptions:
+ order: legacy
+ legacySortOptions:
+ orderFirst:
+ - Namespace
+ - ResourceQuota
+ - StorageClass
+ - CustomResourceDefinition
+ - MutatingWebhookConfiguration
+ - ServiceAccount
+ - PodSecurityPolicy
+ - Role
+ - ClusterRole
+ - RoleBinding
+ - ClusterRoleBinding
+ - ConfigMap
+ - Secret
+ - Endpoints
+ - Service
+ - LimitRange
+ - PriorityClass
+ - PersistentVolume
+ - PersistentVolumeClaim
+ - Deployment
+ - StatefulSet
+ - CronJob
+ - PodDisruptionBudget
+ orderLast:
+ - ValidatingWebhookConfiguration
+
+resources:
+# Cert-Manager
+- ../../common/cert-manager/cert-manager/base
+- ../../common/cert-manager/kubeflow-issuer/base
+# Istio
+- ../../common/istio-1-17/istio-crds/base
+- ../../common/istio-1-17/istio-namespace/base
+- ../../common/istio-1-17/istio-install/base
+# OIDC Authservice
+- ../../common/oidc-client/oidc-authservice/base
+# Dex
+- ../../common/dex/overlays/istio
+# KNative
+- ../../common/knative/knative-serving/overlays/gateways
+- ../../common/knative/knative-eventing/base
+- ../../common/istio-1-17/cluster-local-gateway/base
+# Kubeflow namespace
+- ../../common/kubeflow-namespace/base
+# Kubeflow Roles
+- ../../common/kubeflow-roles/base
+# Kubeflow Istio Resources
+- ../../common/istio-1-17/kubeflow-istio-resources/base
+
+
+# Kubeflow Pipelines
+- ../../apps/pipeline/upstream/env/cert-manager/platform-agnostic-multi-user
+# Katib
+- ../../apps/katib/upstream/installs/katib-with-kubeflow
+# Central Dashboard
+- ../../apps/centraldashboard/upstream/overlays/kserve
+# Admission Webhook
+- ../../apps/admission-webhook/upstream/overlays/cert-manager
+# Jupyter Web App
+- ../../apps/jupyter/jupyter-web-app/upstream/overlays/istio
+# Notebook Controller
+- ../../apps/jupyter/notebook-controller/upstream/overlays/kubeflow
+# Profiles + KFAM
+- ../../apps/profiles/upstream/overlays/kubeflow
+# PVC Viewer
+- ../../apps/pvcviewer-controller/upstream/base/
+# Volumes Web App
+- ../../apps/volumes-web-app/upstream/overlays/istio
+# Tensorboards Controller
+- ../../apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow
+# Tensorboard Web App
+- ../../apps/tensorboard/tensorboards-web-app/upstream/overlays/istio
+# Training Operator
+- ../../apps/training-operator/upstream/overlays/kubeflow
+# User namespace
+- ../../common/user-namespace/base
+
+# KServe
+- ../../contrib/kserve/kserve
+- ../../contrib/kserve/models-web-app/overlays/kubeflow
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/kustomization.yaml
deleted file mode 100644
index c1a8578..0000000
--- a/deployment/kubeflow/manifests/in-cluster-setup/kustomization.yaml
+++ /dev/null
@@ -1,87 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-sortOptions:
- order: legacy
- legacySortOptions:
- orderFirst:
- - Namespace
- - ResourceQuota
- - StorageClass
- - CustomResourceDefinition
- - MutatingWebhookConfiguration
- - ServiceAccount
- - PodSecurityPolicy
- - Role
- - ClusterRole
- - RoleBinding
- - ClusterRoleBinding
- - ConfigMap
- - Secret
- - Endpoints
- - Service
- - LimitRange
- - PriorityClass
- - PersistentVolume
- - PersistentVolumeClaim
- - Deployment
- - StatefulSet
- - CronJob
- - PodDisruptionBudget
- orderLast:
- - ValidatingWebhookConfiguration
-
-resources:
-# Cert-Manager
-- ../common/cert-manager/cert-manager/base
-- ../common/cert-manager/kubeflow-issuer/base
-# Istio
-- ../common/istio-1-17/istio-crds/base
-- ../common/istio-1-17/istio-namespace/base
-- ../common/istio-1-17/istio-install/base
-# OIDC Authservice
-- ../common/oidc-client/oidc-authservice/base
-# Dex
-- ../common/dex/overlays/istio
-# KNative
-- ../common/knative/knative-serving/overlays/gateways
-- ../common/knative/knative-eventing/base
-- ../common/istio-1-17/cluster-local-gateway/base
-# Kubeflow namespace
-- ../common/kubeflow-namespace/base
-# Kubeflow Roles
-- ../common/kubeflow-roles/base
-# Kubeflow Istio Resources
-- ../common/istio-1-17/kubeflow-istio-resources/base
-
-
-# Kubeflow Pipelines
-- ../apps/pipeline/upstream/env/cert-manager/platform-agnostic-multi-user
-# Katib
-- ../apps/katib/upstream/installs/katib-with-kubeflow
-# Central Dashboard
-- ../apps/centraldashboard/upstream/overlays/kserve
-# Admission Webhook
-- ../apps/admission-webhook/upstream/overlays/cert-manager
-# Jupyter Web App
-- ../apps/jupyter/jupyter-web-app/upstream/overlays/istio
-# Notebook Controller
-- ../apps/jupyter/notebook-controller/upstream/overlays/kubeflow
-# Profiles + KFAM
-- ../apps/profiles/upstream/overlays/kubeflow
-# PVC Viewer
-- ../apps/pvcviewer-controller/upstream/base/
-# Volumes Web App
-- ../apps/volumes-web-app/upstream/overlays/istio
-# Tensorboards Controller
-- ../apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow
-# Tensorboard Web App
-- ../apps/tensorboard/tensorboards-web-app/upstream/overlays/istio
-# Training Operator
-- ../apps/training-operator/upstream/overlays/kubeflow
-# User namespace
-- ../common/user-namespace/base
-
-# KServe
-- ../contrib/kserve/kserve
-- ../contrib/kserve/models-web-app/overlays/kubeflow
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/README.md b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/README.md
new file mode 100644
index 0000000..f954728
--- /dev/null
+++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/README.md
@@ -0,0 +1,5 @@
+# Standalone KFP + Kserve
+
+Components:
+- Kubeflow Pipelines (KFP)
+- Kserve
\ No newline at end of file
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/kustomization.yaml
new file mode 100644
index 0000000..d4e9892
--- /dev/null
+++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/kustomization.yaml
@@ -0,0 +1,57 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+sortOptions:
+ order: legacy
+ legacySortOptions:
+ orderFirst:
+ - Namespace
+ - ResourceQuota
+ - StorageClass
+ - CustomResourceDefinition
+ - MutatingWebhookConfiguration
+ - ServiceAccount
+ - PodSecurityPolicy
+ - Role
+ - ClusterRole
+ - RoleBinding
+ - ClusterRoleBinding
+ - ConfigMap
+ - Secret
+ - Endpoints
+ - Service
+ - LimitRange
+ - PriorityClass
+ - PersistentVolume
+ - PersistentVolumeClaim
+ - Deployment
+ - StatefulSet
+ - CronJob
+ - PodDisruptionBudget
+ orderLast:
+ - ValidatingWebhookConfiguration
+
+resources:
+# Cert-Manager
+- ../../common/cert-manager/cert-manager/base
+- ../../common/cert-manager/kubeflow-issuer/base
+
+# Istio
+- ../../common/istio-1-17/istio-crds/base
+- ../../common/istio-1-17/istio-namespace/base
+- ../../common/istio-1-17/istio-install/base
+
+# KNative
+- ../../common/knative/knative-serving/overlays/gateways
+- ../../common/knative/knative-eventing/base
+- ../../common/istio-1-17/cluster-local-gateway/base
+
+# Kubeflow Istio Resources
+- ../../common/istio-1-17/kubeflow-istio-resources/base
+
+# Kubeflow Pipelines
+- ../../apps/pipeline/upstream/cluster-scoped-resources
+- ../../apps/pipeline/upstream/env/platform-agnostic-emissary
+
+# KServe
+- ../../contrib/kserve/kserve
\ No newline at end of file
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/README.md b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/README.md
new file mode 100644
index 0000000..a2245f6
--- /dev/null
+++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/README.md
@@ -0,0 +1,4 @@
+# Standalone KFP
+
+Components:
+- Kubeflow Pipelines (KFP)
\ No newline at end of file
diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/kustomization.yaml
new file mode 100644
index 0000000..a92789a
--- /dev/null
+++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/kustomization.yaml
@@ -0,0 +1,54 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+sortOptions:
+ order: legacy
+ legacySortOptions:
+ orderFirst:
+ - Namespace
+ - ResourceQuota
+ - StorageClass
+ - CustomResourceDefinition
+ - MutatingWebhookConfiguration
+ - ServiceAccount
+ - PodSecurityPolicy
+ - Role
+ - ClusterRole
+ - RoleBinding
+ - ClusterRoleBinding
+ - ConfigMap
+ - Secret
+ - Endpoints
+ - Service
+ - LimitRange
+ - PriorityClass
+ - PersistentVolume
+ - PersistentVolumeClaim
+ - Deployment
+ - StatefulSet
+ - CronJob
+ - PodDisruptionBudget
+ orderLast:
+ - ValidatingWebhookConfiguration
+
+resources:
+# Cert-Manager
+- ../../common/cert-manager/cert-manager/base
+- ../../common/cert-manager/kubeflow-issuer/base
+
+# Istio
+- ../../common/istio-1-17/istio-crds/base
+- ../../common/istio-1-17/istio-namespace/base
+- ../../common/istio-1-17/istio-install/base
+
+# KNative
+- ../../common/knative/knative-serving/overlays/gateways
+- ../../common/knative/knative-eventing/base
+- ../../common/istio-1-17/cluster-local-gateway/base
+
+# Kubeflow Istio Resources
+- ../../common/istio-1-17/kubeflow-istio-resources/base
+
+# Kubeflow Pipelines
+- ../../apps/pipeline/upstream/cluster-scoped-resources
+- ../../apps/pipeline/upstream/env/platform-agnostic-emissary
\ No newline at end of file
diff --git a/deployment/kustomization.yaml b/deployment/kustomization.yaml
deleted file mode 100644
index c5d654e..0000000
--- a/deployment/kustomization.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-resources:
-- ./kubeflow/manifests/in-cluster-setup
-- ./kubeflow-custom
-- ./mlflow/env/local
-- ./monitoring
\ No newline at end of file
diff --git a/deployment/mlflow/base/config.env b/deployment/mlflow/base/config.env
index 5d6dfc3..e24035e 100644
--- a/deployment/mlflow/base/config.env
+++ b/deployment/mlflow/base/config.env
@@ -4,4 +4,4 @@ DB_HOST=postgres
DB_PORT=5432
DB_NAME=mlflow
-DEFAULT_ARTIFACT_ROOT=gs://mlflow-platformv2
+DEFAULT_ARTIFACT_ROOT=gs://mlflow-mlops-platform
diff --git a/deployment/mlflow/env/gcp-cloudsql/params.env b/deployment/mlflow/env/gcp-cloudsql/params.env
index 69446ff..f83af45 100644
--- a/deployment/mlflow/env/gcp-cloudsql/params.env
+++ b/deployment/mlflow/env/gcp-cloudsql/params.env
@@ -1 +1 @@
-GCP_CLOUDSQL_INSTANCE_NAME=mlops-platform-v2:europe-west1:mlops-platformv2
+GCP_CLOUDSQL_INSTANCE_NAME=mlops-platform-v2:europe-west1:mlops-platform
diff --git a/deployment/monitoring/alert-manager/deployment.yaml b/deployment/monitoring/alert-manager/deployment.yaml
index 2e44389..96028d2 100644
--- a/deployment/monitoring/alert-manager/deployment.yaml
+++ b/deployment/monitoring/alert-manager/deployment.yaml
@@ -25,7 +25,7 @@ spec:
containerPort: 9093
resources:
requests:
- cpu: 500m
+ cpu: 250m
memory: 500M
limits:
cpu: 1
diff --git a/deployment/monitoring/grafana/grafana-deployment.yaml b/deployment/monitoring/grafana/grafana-deployment.yaml
index 71032e1..8e3144e 100644
--- a/deployment/monitoring/grafana/grafana-deployment.yaml
+++ b/deployment/monitoring/grafana/grafana-deployment.yaml
@@ -28,7 +28,7 @@ spec:
cpu: "1000m"
requests:
memory: 500M
- cpu: "500m"
+ cpu: "250m"
volumeMounts:
- mountPath: /var/lib/grafana
name: grafana-storage
diff --git a/scripts/create_cluster.sh b/scripts/create_cluster.sh
index bbe9697..ae01799 100755
--- a/scripts/create_cluster.sh
+++ b/scripts/create_cluster.sh
@@ -1,11 +1,11 @@
#!/bin/bash
-set -xeoa pipefail
+set -eoa pipefail
#######################################################################################
# Create and configure a cluster with Kind
#
-# Usage: $ export HOST_IP=127.0.0.1; export CLUSTER_NAME="kind-ep"; ./create_cluster.sh
+# Usage: $ export HOST_IP=127.0.0.1; export CLUSTER_NAME="mlops-platform"; ./create_cluster.sh
#######################################################################################
@@ -67,7 +67,7 @@ fi
# see https://github.com/kubernetes-sigs/kind/issues/2586
-CONTAINER_ID=$(docker ps -aqf "name=kind-ep-control-plane")
+CONTAINER_ID=$(docker ps -aqf "name=$CLUSTER_NAME-control-plane")
docker exec -t ${CONTAINER_ID} bash -c "echo 'fs.inotify.max_user_watches=1048576' >> /etc/sysctl.conf"
docker exec -t ${CONTAINER_ID} bash -c "echo 'fs.inotify.max_user_instances=512' >> /etc/sysctl.conf"
docker exec -i ${CONTAINER_ID} bash -c "sysctl -p /etc/sysctl.conf"
diff --git a/scripts/install_helm.sh b/scripts/install_helm.sh
index 4a15055..ef44c43 100644
--- a/scripts/install_helm.sh
+++ b/scripts/install_helm.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -xeo pipefail
+set -eo pipefail
function add_local_bin_to_path {
# make sure ~/.local/bin is in $PATH
diff --git a/scripts/install_local_registry.sh b/scripts/install_local_registry.sh
index 4665d4b..3f1844b 100755
--- a/scripts/install_local_registry.sh
+++ b/scripts/install_local_registry.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -xeoa pipefail
+set -eoa pipefail
#######################################################################################
# The following shell script will create a local docker registry and connect the
diff --git a/scripts/install_ray.sh b/scripts/install_ray.sh
index b53d539..1e1a864 100644
--- a/scripts/install_ray.sh
+++ b/scripts/install_ray.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -xeo pipefail
+set -eo pipefail
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update
diff --git a/scripts/install_tools.sh b/scripts/install_tools.sh
index d4dc587..105415f 100755
--- a/scripts/install_tools.sh
+++ b/scripts/install_tools.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -xeoa pipefail
+set -eoa pipefail
#######################################################################################
# CHECK PRE-REQUISITES
diff --git a/scripts/install_tools_mac.sh b/scripts/install_tools_mac.sh
index ad87a1b..a7c30c9 100644
--- a/scripts/install_tools_mac.sh
+++ b/scripts/install_tools_mac.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -xeoa pipefail
+set -eoa pipefail
#######################################################################################
# CHECK PRE-REQUISITES
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index ea1cf7d..0df3505 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-set -xeoa pipefail
+set -eoa pipefail
#######################################################################################
# RUN TESTS
diff --git a/setup.md b/setup.md
index bf89b2e..c424959 100644
--- a/setup.md
+++ b/setup.md
@@ -15,6 +15,19 @@ Install the experimentation platform with:
> **WARNING:** Using the `--test` flag will install the `requirements-tests.txt` in your current python environment.
+## Deployment options
+
+1. **Kubeflow:** Full Kubeflow deployment with all components, including monitoring (prometheus, grafana).
+2. **Kubeflow (without monitoring):** Full Kubeflow deployment without monitoring components (prometheus, grafana).
+3. **Standalone KFP:** Standalone KFP deployment with monitoring components (prometheus, grafana).
+4. **Standalone KFP (without monitoring):** Standalone KFP deployment without monitoring components (prometheus, grafana).
+5. **Standalone KFP and Kserve:** Standalone KFP and Kserve deployment with monitoring components (prometheus, grafana).
+6. **Standalone KFP and Kserve (without monitoring):** Standalone KFP and Kserve deployment without monitoring components (prometheus, grafana).
+
+> The minimum recommended machine requirements are:
+> - **Kubeflow** options: 12 CPU cores, 25GB free disk space.
+> - **Standalone KFP** options: 8 CPU cores, 18GB free disk space.
+
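+Each option corresponds to a kustomize overlay under `deployment/envs/`. If you prefer to apply the manifests yourself rather than through the interactive installer, a minimal sketch (assuming the kind cluster already exists and `kustomize`/`kubectl` are installed) is:
+
+```bash
+# pick one of: kubeflow-monitoring, kubeflow, standalone-kfp-monitoring,
+# standalone-kfp, standalone-kfp-kserve-monitoring, standalone-kfp-kserve
+export DEPLOYMENT_OPTION=kubeflow-monitoring
+
+# build and apply the chosen overlay; retry until dependent components are ready
+while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
+```
+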
## Test the deployment (manually)
If you just deployed the platform, it will take a while to become ready. You can use
@@ -40,15 +53,27 @@ pytest tests/ [-vrP] [--log-cli-level=INFO]
*These are the same tests that are run automatically if you use the `--test` flag on installation.*
-## Deleting the deployment
+## Uninstall
+
+Uninstall the MLOps Platform with:
-Delete the cluster:
```bash
-# e.g. $ kind delete cluster --name kind-ep
+./uninstall.sh
+```
+
+### Manual deletion
+
+The `uninstall.sh` script should delete everything, but if you need to manually remove the platform, you can do it with:
+
+```bash
+# list kind clusters
+kind get clusters
+
+# delete the kind cluster
kind delete cluster --name [CLUSTER_NAME]
```
-If you also installed the local docker registry (`config.env` > `INSTALL_LOCAL_REGISTRY="true"`):
+If you also installed the local docker registry:
```bash
# check if it is running (kind-registry)
@@ -68,20 +93,7 @@ docker rm -f $(docker ps -aqf "name=kind-registry")
### Error: namespace "kubeflow-user-example-com" not found
This is not an error, and it is expected. Some of the things being deployed depend on other components, which need to be deployed and become ready first.
-For example, the namespace `kubeflow-user-example-com` is created by a `kubeflow` component. That's why we deploy in a loop until everything is applied successfully:
-
-```bash
-while true; do
- if kubectl apply -f "$tmpfile"; then
- echo "Resources successfully applied."
- rm "$tmpfile"
- break
- else
- echo "Retrying to apply resources. Be patient, this might take a while..."
- sleep 10
- fi
-done
-```
+For example, the namespace `kubeflow-user-example-com` is created by a `kubeflow` component. That's why we deploy in a loop until everything is applied successfully.
Once the main `kubeflow` deployment is ready, the `kubeflow-user-example-com` namespace will be created, and the command should finish successfully.
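+
+For example, you can watch for the user namespace to appear with a standard kubectl check (a sketch; the namespace is created automatically once the responsible Kubeflow component is ready):
+
+```bash
+# succeeds once the user namespace has been created
+kubectl get namespace kubeflow-user-example-com
+```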
diff --git a/setup.sh b/setup.sh
index 2304afb..797b6db 100755
--- a/setup.sh
+++ b/setup.sh
@@ -1,8 +1,15 @@
#!/bin/bash
-set -xeoa pipefail
+set -eoa pipefail
-source config.env
+# Internal directory for storing platform settings
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+PLATFORM_DIR="$SCRIPT_DIR/.platform"
+mkdir -p "$PLATFORM_DIR"
+PLATFORM_CONFIG="$PLATFORM_DIR/.config"
+cp "$SCRIPT_DIR/config.env" "$PLATFORM_CONFIG"
+
+source "$PLATFORM_CONFIG"
RUN_TESTS=false
LOG_LEVEL_TESTS="WARNING"
@@ -23,13 +30,66 @@ echo Cluster name set to: "$CLUSTER_NAME"
echo Host IP set to: "$HOST_IP"
echo Run tests after installation set to: "$RUN_TESTS"
+DEFAULT_DEPLOYMENT_OPTION="kubeflow-monitoring"
+echo
+echo "Please choose the deployment option:"
+echo "[1] Kubeflow (all components)"
+echo "[2] Kubeflow (without monitoring)"
+echo "[3] Standalone KFP"
+echo "[4] Standalone KFP (without monitoring)"
+echo "[5] Standalone KFP and Kserve"
+echo "[6] Standalone KFP and Kserve (without monitoring)"
+read -p "Enter the number of your choice [1-6] (default is [1]): " choice
+case "$choice" in
+ 1 ) DEPLOYMENT_OPTION="kubeflow-monitoring" ;;
+ 2 ) DEPLOYMENT_OPTION="kubeflow" ;;
+ 3 ) DEPLOYMENT_OPTION="standalone-kfp-monitoring" ;;
+ 4 ) DEPLOYMENT_OPTION="standalone-kfp" ;;
+ 5 ) DEPLOYMENT_OPTION="standalone-kfp-kserve-monitoring" ;;
+ 6 ) DEPLOYMENT_OPTION="standalone-kfp-kserve" ;;
+ * ) DEPLOYMENT_OPTION="$DEFAULT_DEPLOYMENT_OPTION" ;;
+esac
+
+INSTALL_LOCAL_REGISTRY=true
+echo
+read -p "Install local Docker registry? (y/n) (default is [y]): " choice
+case "$choice" in
+ n|N ) INSTALL_LOCAL_REGISTRY=false ;;
+ * ) INSTALL_LOCAL_REGISTRY=true ;;
+esac
+
+INSTALL_RAY=false
+echo
+read -p "Install Ray? (It requires ~4 additional CPUs) (y/n) (default is [n]): " choice
+case "$choice" in
+ y|Y ) INSTALL_RAY=true ;;
+ * ) INSTALL_RAY=false ;;
+esac
+
+# Save selections to settings file
+echo -e "\nDEPLOYMENT_OPTION=$DEPLOYMENT_OPTION" >> $PLATFORM_CONFIG
+echo -e "\nINSTALL_LOCAL_REGISTRY=$INSTALL_LOCAL_REGISTRY" >> $PLATFORM_CONFIG
+echo -e "\nINSTALL_RAY=$INSTALL_RAY" >> $PLATFORM_CONFIG
+
# CHECK DISK SPACE
-RECOMMENDED_DISK_SPACE=26214400
-RECOMMENDED_DISK_SPACE_GB=$(($RECOMMENDED_DISK_SPACE / 1024 / 1024))
+RECOMMENDED_DISK_SPACE_KUBEFLOW=26214400
+RECOMMENDED_DISK_SPACE_KUBEFLOW_GB=$(($RECOMMENDED_DISK_SPACE_KUBEFLOW / 1024 / 1024))
+RECOMMENDED_DISK_SPACE_KFP=18874368
+RECOMMENDED_DISK_SPACE_KFP_GB=$(($RECOMMENDED_DISK_SPACE_KFP / 1024 / 1024))
+
+if [[ $DEPLOYMENT_OPTION == *"kfp"* ]]; then
+ RECOMMENDED_DISK_SPACE=$RECOMMENDED_DISK_SPACE_KFP
+ RECOMMENDED_DISK_SPACE_GB=$RECOMMENDED_DISK_SPACE_KFP_GB
+else
+ RECOMMENDED_DISK_SPACE=$RECOMMENDED_DISK_SPACE_KUBEFLOW
+ RECOMMENDED_DISK_SPACE_GB=$RECOMMENDED_DISK_SPACE_KUBEFLOW_GB
+fi
DISK_SPACE=$(df -k . | awk -F ' ' '{print $4}' | sed -n '2 p')
DISK_SPACE_GB=$(($DISK_SPACE / 1024 / 1024))
+# TODO: also account for optional components (e.g. Ray) in the required disk space
+
if [[ DISK_SPACE < $RECOMMENDED_DISK_SPACE ]]; then
echo "WARNING: Not enough disk space detected!"
echo "The recommended is > ${RECOMMENDED_DISK_SPACE_GB} GB of disk space. You have ${DISK_SPACE_GB} GB."
@@ -44,7 +104,19 @@ if [[ DISK_SPACE < $RECOMMENDED_DISK_SPACE ]]; then
fi
# CHECK CPU COUNT
-RECOMMENDED_CPUS=16
+RECOMMENDED_CPUS_KUBEFLOW=12
+RECOMMENDED_CPUS_KFP=8
+EXTRA_RAY_CPUS=4
+
+if [[ $DEPLOYMENT_OPTION == *"kfp"* ]]; then
+ RECOMMENDED_CPUS=$RECOMMENDED_CPUS_KFP
+else
+ RECOMMENDED_CPUS=$RECOMMENDED_CPUS_KUBEFLOW
+fi
+
+if [ "$INSTALL_RAY" = true ]; then
+ RECOMMENDED_CPUS=$(($RECOMMENDED_CPUS + $EXTRA_RAY_CPUS))
+fi
# Detect the OS
OS=$(uname)
@@ -59,12 +131,13 @@ fi
if [[ $CPU_COUNT -lt $RECOMMENDED_CPUS ]]; then
echo "WARNING: Not enough CPU cores detected!"
- echo "The recommended is >= ${RECOMMENDED_CPUS} CPU cores. You have ${CPU_COUNT} cores."
+ echo "The recommended is >= ${RECOMMENDED_CPUS} CPU cores for this deployment configuration. You have ${CPU_COUNT} cores."
while true; do
read -p "Do you want to continue with the installation? (y/n): " yn
case $yn in
[Yy]* ) break;;
[Nn]* ) exit 1;;
+ "" ) echo "Please enter a response.";;
* ) echo "Please answer yes or no.";;
esac
done
@@ -72,9 +145,9 @@ fi
# INSTALL TOOLS
if [[ "$(uname)" == "Darwin" ]]; then
- bash scripts/install_tools_mac.sh # Using default bash because /bin/bash is an old version (3)
+ bash "$SCRIPT_DIR/scripts/install_tools_mac.sh" # Using default bash because /bin/bash is an old version (3)
else
- /bin/bash scripts/install_tools.sh
+ /bin/bash "$SCRIPT_DIR/scripts/install_tools.sh"
fi
# CREATE CLUSTER
@@ -83,30 +156,61 @@ function fail {
exit "${2-1}" ## Return a code specified by $2, or 1 by default.
}
-/bin/bash scripts/create_cluster.sh || fail
+# Check if the kind cluster already exists
+if kind get clusters | grep -q "^$CLUSTER_NAME$"; then
+ echo
+ echo "Kind cluster with name \"$CLUSTER_NAME\" already exists. It can be deleted with the following command: kind delete cluster --name $CLUSTER_NAME"
+ while true; do
+ read -p "Do you want to continue the installation on the existing cluster? (y/n): " choice
+ case "$choice" in
+ y|Y ) echo "Using existing kind cluster..."; break;;
+ n|N ) exit 0 ;;
+            "" ) echo "Please enter a response." ;;
+            * ) echo "Invalid response. Please enter y or n." ;;
+ esac
+ done
+else
+ echo "Creating kind cluster..."
+ /bin/bash "$SCRIPT_DIR/scripts/create_cluster.sh"
+fi
kubectl cluster-info --context kind-$CLUSTER_NAME
# DEPLOY LOCAL DOCKER REGISTRY
if [ "$INSTALL_LOCAL_REGISTRY" = true ]; then
- /bin/bash scripts/install_local_registry.sh
+ /bin/bash "$SCRIPT_DIR/scripts/install_local_registry.sh"
fi
# DEPLOY STACK
kubectl config use-context kind-$CLUSTER_NAME
-# Create a temporary file
-tmpfile=$(mktemp)
# Build the kustomization and store the output in the temporary file
-kustomize build deployment > "$tmpfile"
-
+tmp_file=$(mktemp)
+DEPLOYMENT_ROOT="$SCRIPT_DIR/deployment/envs/$DEPLOYMENT_OPTION"
+echo "Deployment root set to: $DEPLOYMENT_ROOT"
+echo
+echo "Building manifests..."
+kustomize build "$DEPLOYMENT_ROOT" > "$tmp_file"
+echo "Manifests built successfully."
+echo
+echo "Applying resources..."
while true; do
- if kubectl apply -f "$tmpfile"; then
+ if kubectl apply -f "$tmp_file"; then
echo "Resources successfully applied."
- rm "$tmpfile"
+ rm "$tmp_file"
break
else
- echo "Retrying to apply resources. Be patient, this might take a while..."
+ echo
+ echo "Retrying to apply resources."
+ echo "Be patient, this might take a while... (Errors are expected until all resources are available!)"
+ echo
+ echo "Help:"
+      echo "  If the errors persist, please check the pod status with: kubectl get pods --all-namespaces"
+ echo " All pods should be either in Running state, or ContainerCreating if they are still starting up."
+ echo " Check specific pod errors with: kubectl describe pod -n [NAMESPACE] [POD_NAME]"
+ echo " For further help, see the Troubleshooting section in setup.md"
+ echo
+
sleep 10
fi
done
@@ -114,17 +218,17 @@ done
# DEPLOY RAY
if [ "$INSTALL_RAY" = true ]; then
echo "Installing Ray"
- /bin/bash scripts/install_helm.sh
- /bin/bash scripts/install_ray.sh
+ /bin/bash "$SCRIPT_DIR/scripts/install_helm.sh"
+ /bin/bash "$SCRIPT_DIR/scripts/install_ray.sh"
fi
echo
-echo Installation completed!
+echo "Installation completed!"
echo
# TESTS
if [ "$RUN_TESTS" = "true" ]; then
- /bin/bash scripts/run_tests.sh
+ /bin/bash "$SCRIPT_DIR/scripts/run_tests.sh"
fi
exit 0
diff --git a/tests/conftest.py b/tests/conftest.py
index 1df8804..38728fa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,7 +3,10 @@
from dotenv import load_dotenv
import os
-ENV_FILE = pathlib.Path(__file__).parent.parent / "config.env"
+from .utils import parse_bool
+
+ENV_FILE = pathlib.Path(__file__).parent.parent / ".platform/.config"
+assert ENV_FILE.exists(), f"File not found: {ENV_FILE} (autogenerated by the platform on installation)" # noqa
load_dotenv(dotenv_path=ENV_FILE)
CLUSTER_NAME = os.getenv("CLUSTER_NAME")
@@ -14,8 +17,8 @@
assert HOST_IP is not None
# MLFLOW
-MLFLOW_ENV_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "config.env"
-MLFLOW_SECRETS_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "secret.env"
+MLFLOW_ENV_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "config.env" # noqa
+MLFLOW_SECRETS_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "secret.env" # noqa
load_dotenv(dotenv_path=MLFLOW_ENV_FILE, override=True)
AWS_ACCESS_KEY_ID = os.getenv("MINIO_ACCESS_KEY")
@@ -25,6 +28,9 @@
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
assert AWS_SECRET_ACCESS_KEY is not None
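+# Deployment flags written by setup.sh into .platform/.config during installation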
+IS_STANDALONE_KFP = "kfp" in os.environ.get('DEPLOYMENT_OPTION')
+SKIP_LOCAL_REGISTRY = not parse_bool(os.environ.get('INSTALL_LOCAL_REGISTRY'))
+
def pytest_sessionstart(session):
"""
diff --git a/tests/resources/kfp/build_image.sh b/tests/resources/kfp/build_image.sh
index 6ced22e..779bec1 100755
--- a/tests/resources/kfp/build_image.sh
+++ b/tests/resources/kfp/build_image.sh
@@ -12,7 +12,7 @@ cd "$(dirname "$0")"
docker build -t "$FULL_IMAGE_NAME" .
-# load the image into the local "kind" cluster with name "kind-ep"
+# load the image into the local "kind" cluster
kind load docker-image "$FULL_IMAGE_NAME" --name $CLUSTER_NAME
# to push the image to a remote repository instead
diff --git a/tests/resources/registry/build_push_image.sh b/tests/resources/registry/build_push_image.sh
index 513de3c..14bf947 100755
--- a/tests/resources/registry/build_push_image.sh
+++ b/tests/resources/registry/build_push_image.sh
@@ -14,8 +14,5 @@ cd "$(dirname "$0")"
docker build -t "$FULL_IMAGE_NAME" .
-# load the image into the local "kind" cluster with name "kind-ep"
-#kind load docker-image "$FULL_IMAGE_NAME" --name kind-ep
-
# to push the image to a remote repository instead
docker push "$FULL_IMAGE_NAME"
\ No newline at end of file
diff --git a/tests/test_kfp.py b/tests/test_kfp.py
index a4e5dea..2c292e5 100644
--- a/tests/test_kfp.py
+++ b/tests/test_kfp.py
@@ -1,3 +1,4 @@
+import os
import subprocess
import logging
import pathlib
@@ -9,7 +10,7 @@
import requests
from urllib.parse import urlsplit
-from .conftest import CLUSTER_NAME
+from .conftest import CLUSTER_NAME, IS_STANDALONE_KFP
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -22,7 +23,7 @@
KUBEFLOW_ENDPOINT = "http://localhost:8080"
KUBEFLOW_USERNAME = "user@example.com"
KUBEFLOW_PASSWORD = "12341234"
-NAMESPACE = "kubeflow-user-example-com"
+KUBEFLOW_USER_NAMESPACE = "kubeflow-user-example-com"
def get_istio_auth_session(url: str, username: str, password: str) -> dict:
@@ -44,10 +45,8 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict:
"is_secured": None, # True if KF endpoint is secured
"session_cookie": None # Resulting session cookies in the form "key1=value1; key2=value2"
}
-
# use a persistent session (for cookies)
with requests.Session() as s:
-
################
# Determine if Endpoint is Secured
################
@@ -56,7 +55,6 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict:
raise RuntimeError(
f"HTTP status code '{resp.status_code}' for GET against: {url}"
)
-
auth_session["redirect_url"] = resp.url
# if we were NOT redirected, then the endpoint is UNSECURED
@@ -101,7 +99,6 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict:
f"HTTP status code '{resp.status_code}' "
f"for GET against: {redirect_url_obj.geturl()}"
)
-
# set the login url
auth_session["dex_login_url"] = resp.url
@@ -118,7 +115,6 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict:
f"Login credentials were probably invalid - "
f"No redirect after POST to: {auth_session['dex_login_url']}"
)
-
# store the session cookies in a "key1=value1; key2=value2" string
auth_session["session_cookie"] = "; ".join(
[f"{c.name}={c.value}" for c in s.cookies]
@@ -128,43 +124,37 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict:
def run_pipeline(pipeline_file: str, experiment_name: str):
-
- with subprocess.Popen(["kubectl", "-n", "istio-system", "port-forward", "svc/istio-ingressgateway", "8080:80"], stdout=True) as proc:
+ """Run a pipeline on a Kubeflow cluster."""
+ with subprocess.Popen(["kubectl", "-n", "istio-system", "port-forward", "svc/istio-ingressgateway", "8080:80"], stdout=True) as proc: # noqa: E501
try:
time.sleep(2) # give some time to the port-forward connection
-
auth_session = get_istio_auth_session(
url=KUBEFLOW_ENDPOINT,
username=KUBEFLOW_USERNAME,
password=KUBEFLOW_PASSWORD
)
-
client = kfp.Client(
host=f"{KUBEFLOW_ENDPOINT}/pipeline",
cookies=auth_session["session_cookie"],
- namespace=NAMESPACE,
+ namespace=KUBEFLOW_USER_NAMESPACE,
)
-
created_run = client.create_run_from_pipeline_package(
pipeline_file=pipeline_file,
enable_caching=False,
arguments={},
run_name="kfp_test_run",
experiment_name=experiment_name,
- namespace=NAMESPACE
+ namespace=KUBEFLOW_USER_NAMESPACE
)
-
run_id = created_run.run_id
-
logger.info(f"Submitted run with ID: {run_id}")
-
logger.info(f"Waiting for run {run_id} to complete....")
run_detail = created_run.wait_for_run_completion()
_handle_job_end(run_detail)
# clean up
experiment = client.get_experiment(
- experiment_name=experiment_name, namespace=NAMESPACE
+ experiment_name=experiment_name, namespace=KUBEFLOW_USER_NAMESPACE
)
client.delete_experiment(experiment.id)
logger.info("Done")
@@ -176,16 +166,46 @@ def run_pipeline(pipeline_file: str, experiment_name: str):
proc.terminate()
+def run_pipeline_standalone_kfp(pipeline_file: str, experiment_name: str):
+ """Run a pipeline on a standalone Kubeflow Pipelines cluster."""
+ with subprocess.Popen(["kubectl", "-n", "kubeflow", "port-forward", "svc/ml-pipeline-ui", "8080:80"], stdout=True) as proc: # noqa: E501
+ try:
+ time.sleep(2) # give some time to the port-forward connection
+
+ client = kfp.Client(
+ host=f"{KUBEFLOW_ENDPOINT}/pipeline",
+ )
+ created_run = client.create_run_from_pipeline_package(
+ pipeline_file=pipeline_file,
+ enable_caching=False,
+ arguments={},
+ run_name="kfp_test_run",
+ experiment_name=experiment_name,
+ )
+ run_id = created_run.run_id
+ logger.info(f"Submitted run with ID: {run_id}")
+ logger.info(f"Waiting for run {run_id} to complete....")
+ run_detail = created_run.wait_for_run_completion()
+ _handle_job_end(run_detail)
+
+ # clean up
+ experiment = client.get_experiment(experiment_name=experiment_name)
+ client.delete_experiment(experiment.id)
+ logger.info("Done")
+
+ except Exception as e:
+ logger.error(f"ERROR: {e}")
+ raise e
+ finally:
+ proc.terminate()
+
+
def _handle_job_end(run_detail):
finished_run = run_detail.to_dict()["run"]
-
created_at = finished_run["created_at"]
finished_at = finished_run["finished_at"]
-
duration_secs = (finished_at - created_at).total_seconds()
-
status = finished_run["status"]
-
logger.info(f"Run finished in {round(duration_secs)} seconds with status: {status}")
if status != "Succeeded":
@@ -196,7 +216,6 @@ def build_load_image():
output = subprocess.check_output(
["docker", "exec", f"{CLUSTER_NAME}-control-plane", "crictl", "images"]
)
-
if IMAGE_NAME in output.decode():
logging.info(f"Image already in cluster.")
else:
@@ -206,14 +225,25 @@ def build_load_image():
@pytest.mark.order(6)
@pytest.mark.timeout(240)
+@pytest.mark.skipif(IS_STANDALONE_KFP, reason="It is not Kubeflow")
def test_run_pipeline():
-
# build the base docker image and load it into the cluster
build_load_image()
-
# submit and run pipeline
run_pipeline(pipeline_file=str(PIPELINE_FILE), experiment_name=EXPERIMENT_NAME)
+@pytest.mark.order(6)
+@pytest.mark.timeout(240)
+@pytest.mark.skipif(not IS_STANDALONE_KFP, reason="It is not standalone KFP")
+def test_run_pipeline_standalone_kfp():
+ # build the base docker image and load it into the cluster
+ build_load_image()
+ # submit and run pipeline
+ run_pipeline_standalone_kfp(
+ pipeline_file=str(PIPELINE_FILE), experiment_name=EXPERIMENT_NAME
+ )
+
+
if __name__ == "__main__":
test_run_pipeline()
diff --git a/tests/test_registry.py b/tests/test_registry.py
index e13cf83..dce083c 100644
--- a/tests/test_registry.py
+++ b/tests/test_registry.py
@@ -2,17 +2,16 @@
import logging
import pathlib
import pytest
-import os
from envsubst import envsubst
-from .conftest import HOST_IP
-from .test_kfp import run_pipeline
+from .conftest import HOST_IP, IS_STANDALONE_KFP, SKIP_LOCAL_REGISTRY
+from .test_kfp import run_pipeline, run_pipeline_standalone_kfp
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
-BUILD_FILE = pathlib.Path(__file__).parent / "resources" / "registry" / "build_push_image.sh"
-PIPELINE_TEMPLATE = pathlib.Path(__file__).parent / "resources" / "registry" / "pipeline.yaml.template"
+BUILD_FILE = pathlib.Path(__file__).parent / "resources" / "registry" / "build_push_image.sh" # noqa
+PIPELINE_TEMPLATE = pathlib.Path(__file__).parent / "resources" / "registry" / "pipeline.yaml.template" # noqa
IMAGE_NAME = "kfp-registry-test-image"
EXPERIMENT_NAME = "Test Experiment (Registry)"
@@ -32,10 +31,7 @@ def render_pipeline_yaml(output: str):
@pytest.mark.order(7)
-@pytest.mark.skipif(
- os.environ.get('INSTALL_LOCAL_REGISTRY') == 'false',
- reason="No local image registry was installed."
-)
+@pytest.mark.skipif(SKIP_LOCAL_REGISTRY, reason="No local image registry was installed")
def test_push_image():
# build the base docker image and load it into the cluster
build_push_image()
@@ -43,18 +39,29 @@ def test_push_image():
@pytest.mark.order(8)
@pytest.mark.timeout(120)
-@pytest.mark.skipif(
- os.environ.get('INSTALL_LOCAL_REGISTRY') == 'false',
- reason="No local image registry was installed."
-)
+@pytest.mark.skipif(SKIP_LOCAL_REGISTRY, reason="No local image registry was installed")
+@pytest.mark.skipif(IS_STANDALONE_KFP, reason="It is not Kubeflow")
def test_run_pipeline_using_registry(tmp_path):
-
# build the base docker image and load it into the cluster
build_push_image()
-
# create pipeline.yaml with the right registry IP address
pipeline_file = tmp_path / "pipeline.yaml"
render_pipeline_yaml(output=str(pipeline_file))
-
# submit and run pipeline
run_pipeline(pipeline_file=str(pipeline_file), experiment_name=EXPERIMENT_NAME)
+
+
+@pytest.mark.order(8)
+@pytest.mark.timeout(120)
+@pytest.mark.skipif(SKIP_LOCAL_REGISTRY, reason="No local image registry was installed")
+@pytest.mark.skipif(not IS_STANDALONE_KFP, reason="It is not standalone KFP")
+def test_run_pipeline_standalone_kfp_using_registry(tmp_path):
+ # build the base docker image and load it into the cluster
+ build_push_image()
+ # create pipeline.yaml with the right registry IP address
+ pipeline_file = tmp_path / "pipeline.yaml"
+ render_pipeline_yaml(output=str(pipeline_file))
+ # submit and run pipeline
+ run_pipeline_standalone_kfp(
+ pipeline_file=str(pipeline_file), experiment_name=EXPERIMENT_NAME
+ )
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..6bd5ea6
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,16 @@
+from typing import Union
+
+
+def parse_bool(val: Union[str, bool]) -> bool:
+ """Convert a string representation of truth to True or False.
+ True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
+ are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if
+ 'val' is anything else.
+ """
+    val = str(val).lower()
+    if val in ('y', 'yes', 't', 'true', 'on', '1'):
+        return True
+    elif val in ('n', 'no', 'f', 'false', 'off', '0'):
+        return False
+    else:
+        raise ValueError(f"Invalid truth value {val}")
diff --git a/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb b/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb
index 4f9967e..bb94984 100644
--- a/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb
+++ b/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb
@@ -778,10 +778,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
"id": "ef37a9bc",
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-04-13T09:16:14.457318Z",
+ "start_time": "2024-04-13T09:16:14.094491Z"
+ }
+ },
"source": [
"@component(\n",
" base_image=\"python:3.9\",\n",
@@ -800,6 +803,7 @@
" from kserve import V1beta1InferenceServiceSpec\n",
" from kserve import V1beta1PredictorSpec\n",
" from kserve import V1beta1SKLearnSpec\n",
+ " from kubernetes.client import V1ResourceRequirements\n",
" import logging\n",
" \n",
" deploy_model_component_landmark = 'KFP_component'\n",
@@ -815,25 +819,45 @@
" api_version = constants.KSERVE_GROUP + '/' + kserve_version\n",
"\n",
" isvc = V1beta1InferenceService(\n",
- " api_version=api_version,\n",
- " kind=constants.KSERVE_KIND,\n",
- " metadata=client.V1ObjectMeta(\n",
- " name=model_name,\n",
- " namespace=namespace,\n",
- " annotations={'sidecar.istio.io/inject':'false'}\n",
+ " api_version = api_version,\n",
+ " kind = constants.KSERVE_KIND,\n",
+ " metadata = client.V1ObjectMeta(\n",
+ " name = model_name,\n",
+ " namespace = namespace,\n",
+ " annotations = {'sidecar.istio.io/inject':'false'}\n",
" ),\n",
- " spec=V1beta1InferenceServiceSpec(\n",
+ " spec = V1beta1InferenceServiceSpec(\n",
" predictor=V1beta1PredictorSpec(\n",
" service_account_name=\"kserve-sa\",\n",
+ " min_replicas=1,\n",
+ " max_replicas = 1,\n",
" sklearn=V1beta1SKLearnSpec(\n",
- " storage_uri=model_uri\n",
- " )\n",
+ " storage_uri=model_uri,\n",
+ " resources=V1ResourceRequirements(\n",
+ " requests={\"cpu\": \"100m\", \"memory\": \"512Mi\"},\n",
+ " limits={\"cpu\": \"300m\", \"memory\": \"512Mi\"}\n",
+ " )\n",
+ " ),\n",
" )\n",
" )\n",
" )\n",
" KServe = KServeClient()\n",
" KServe.create(isvc)"
- ]
+ ],
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'component' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;129m@component\u001B[39m(\n\u001B[1;32m 2\u001B[0m base_image\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpython:3.9\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 3\u001B[0m packages_to_install\u001B[38;5;241m=\u001B[39m[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mkserve=0.12.0\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 4\u001B[0m output_component_file\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcomponents/deploy_model_component.yaml\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[1;32m 5\u001B[0m )\n\u001B[1;32m 6\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdeploy_model\u001B[39m(model_name: \u001B[38;5;28mstr\u001B[39m, storage_uri: \u001B[38;5;28mstr\u001B[39m):\n\u001B[1;32m 7\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;124;03m Deploy the model as a inference service with Kserve.\u001B[39;00m\n\u001B[1;32m 9\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m 10\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mkubernetes\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m client\n",
+ "\u001B[0;31mNameError\u001B[0m: name 'component' is not defined"
+ ]
+ }
+ ],
+ "execution_count": 1
},
{
"cell_type": "markdown",
@@ -853,10 +877,13 @@
},
{
"cell_type": "code",
- "execution_count": null,
"id": "b90d1839",
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-04-13T09:16:14.719526Z",
+ "start_time": "2024-04-13T09:16:14.679920Z"
+ }
+ },
"source": [
"@component(\n",
" base_image=\"python:3.9\", # kserve on python 3.10 comes with a dependency that fails to get installed\n",
@@ -1025,7 +1052,21 @@
" if response.status_code != 200:\n",
" raise RuntimeError(f\"HTTP status code '{response.status_code}': {response.json()}\")\n",
" logger.info(f\"\\nPrediction response:\\n{response.text}\\n\")"
- ]
+ ],
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'component' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[2], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;129m@component\u001B[39m(\n\u001B[1;32m 2\u001B[0m base_image\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpython:3.9\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;66;03m# kserve on python 3.10 comes with a dependency that fails to get installed\u001B[39;00m\n\u001B[1;32m 3\u001B[0m packages_to_install\u001B[38;5;241m=\u001B[39m[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mkserve\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mscikit-learn~=1.0.2\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 4\u001B[0m output_component_file\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcomponents/inference_component.yaml\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[1;32m 5\u001B[0m )\n\u001B[1;32m 6\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21minference\u001B[39m(\n\u001B[1;32m 7\u001B[0m model_name: \u001B[38;5;28mstr\u001B[39m\n\u001B[1;32m 8\u001B[0m ):\n\u001B[1;32m 9\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 10\u001B[0m \u001B[38;5;124;03m Test inference.\u001B[39;00m\n\u001B[1;32m 11\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m 12\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mkserve\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m KServeClient\n",
+ "\u001B[0;31mNameError\u001B[0m: name 'component' is not defined"
+ ]
+ }
+ ],
+ "execution_count": 2
},
{
"cell_type": "markdown",
@@ -1311,7 +1352,7 @@
"If not, removing the cluster and reinstalling everything is usually easier. A more surgical approach is to use kubectl to check the logs of pods in the given namespace and configure used YAMLs to fix the issue. When the modified YAMLs have been saved, just apply them and then rollout restart the pods. It is recommended to stop any dashboards before doing this. The required commands cluster removal, and kubectl fixing are:\n",
"\n",
"Cluster removal:\n",
- "- Cluster deletion = kind delete cluster --name kind-ep\n",
+ "- Cluster deletion = kind delete cluster --name mlops-platform\n",
"- Registry deletion = docker rm -f $(docker ps -aqf \"name=kind-registry\")\n",
"\n",
"Optional docker clean up:\n",
diff --git a/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb b/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb
index d338ba5..f700667 100644
--- a/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb
+++ b/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb
@@ -21,16 +21,16 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"%%bash\n",
"\n",
"pip install kfp~=1.8.14"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -43,11 +43,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
@@ -64,7 +62,9 @@
" Artifact,\n",
" Model\n",
")"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -83,8 +83,6 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "outputs": [],
"source": [
"import re\n",
"import requests\n",
@@ -189,12 +187,12 @@
],
"metadata": {
"collapsed": false
- }
+ },
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
- "execution_count": null,
- "outputs": [],
"source": [
"import kfp\n",
"\n",
@@ -213,7 +211,9 @@
],
"metadata": {
"collapsed": false
- }
+ },
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -239,11 +239,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"@component(\n",
" base_image=\"python:3.10\",\n",
@@ -258,7 +256,9 @@
"\n",
" df = pd.read_csv(url, sep=\";\")\n",
" df.to_csv(data.path, index=None)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -271,11 +271,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"@component(\n",
" base_image=\"python:3.10\",\n",
@@ -312,7 +310,9 @@
"\n",
" train.to_csv(train_set.path, index=None)\n",
" test.to_csv(test_set.path, index=None)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -325,11 +325,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"from typing import NamedTuple\n",
"\n",
@@ -443,7 +441,9 @@
"\n",
" # return str(mlflow.get_artifact_uri())\n",
" return output(mlflow.get_artifact_uri(), run_id)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -456,11 +456,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"@component(\n",
" base_image=\"python:3.10\",\n",
@@ -500,7 +498,9 @@
" logger.error(f\"Metric {key} failed. Evaluation not passed!\")\n",
" return False\n",
" return True"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -513,11 +513,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"@component(\n",
" base_image=\"python:3.9\",\n",
@@ -526,7 +524,7 @@
")\n",
"def deploy_model(model_name: str, storage_uri: str):\n",
" \"\"\"\n",
- " Deploy the model as a inference service with Kserve.\n",
+ " Deploy the model as an inference service with Kserve.\n",
" \"\"\"\n",
" import logging\n",
" from kubernetes import client\n",
@@ -537,6 +535,7 @@
" from kserve import V1beta1InferenceServiceSpec\n",
" from kserve import V1beta1PredictorSpec\n",
" from kserve import V1beta1SKLearnSpec\n",
+ " from kubernetes.client import V1ResourceRequirements\n",
"\n",
" logging.basicConfig(level=logging.INFO)\n",
" logger = logging.getLogger(__name__)\n",
@@ -544,32 +543,38 @@
" model_uri = f\"{storage_uri}/{model_name}\"\n",
" logger.info(f\"MODEL URI: {model_uri}\")\n",
"\n",
- " # namespace = 'kserve-inference'\n",
" namespace = utils.get_default_target_namespace()\n",
" kserve_version='v1beta1'\n",
" api_version = constants.KSERVE_GROUP + '/' + kserve_version\n",
"\n",
- "\n",
" isvc = V1beta1InferenceService(\n",
- " api_version=api_version,\n",
- " kind=constants.KSERVE_KIND,\n",
- " metadata=client.V1ObjectMeta(\n",
- " name=model_name,\n",
- " namespace=namespace,\n",
- " annotations={'sidecar.istio.io/inject':'false'}\n",
+ " api_version = api_version,\n",
+ " kind = constants.KSERVE_KIND,\n",
+ " metadata = client.V1ObjectMeta(\n",
+ " name = model_name,\n",
+ " namespace = namespace,\n",
+ " annotations = {'sidecar.istio.io/inject':'false'}\n",
" ),\n",
- " spec=V1beta1InferenceServiceSpec(\n",
+ " spec = V1beta1InferenceServiceSpec(\n",
" predictor=V1beta1PredictorSpec(\n",
" service_account_name=\"kserve-sa\",\n",
+ " min_replicas=1,\n",
+ " max_replicas = 1,\n",
" sklearn=V1beta1SKLearnSpec(\n",
- " storage_uri=model_uri\n",
- " )\n",
+ " storage_uri=model_uri,\n",
+ " resources=V1ResourceRequirements(\n",
+ " requests={\"cpu\": \"100m\", \"memory\": \"512Mi\"},\n",
+ " limits={\"cpu\": \"300m\", \"memory\": \"512Mi\"}\n",
+ " )\n",
+ " ),\n",
" )\n",
" )\n",
" )\n",
" KServe = KServeClient()\n",
" KServe.create(isvc)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -582,11 +587,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"@component(\n",
" base_image=\"python:3.9\", # kserve on python 3.10 comes with a dependency that fails to get installed\n",
@@ -748,8 +751,8 @@
" logger.info(f\"\\nInference service URL:\\n{is_url}\\n\")\n",
"\n",
" inference_input = {\n",
- " 'instances': input_sample.tolist()\n",
- " }\n",
+ " 'instances': input_sample.tolist()\n",
+ " }\n",
" response = requests.post(\n",
" is_url,\n",
" json=inference_input,\n",
@@ -761,7 +764,9 @@
" raise RuntimeError(f\"HTTP status code '{response.status_code}': {response.json()}\")\n",
" \n",
" logger.info(f\"\\nPrediction response:\\n{response.json()}\\n\")"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -776,11 +781,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"@dsl.pipeline(\n",
" name='demo-pipeline',\n",
@@ -833,7 +836,9 @@
" scaler_in=preprocess_task.outputs[\"scaler_out\"]\n",
" )\n",
" inference_task.after(deploy_model_task)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -846,11 +851,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"# Specify pipeline argument values\n",
"\n",
@@ -867,7 +870,9 @@
" \"l1_ratio\": 0.5,\n",
" \"threshold_metrics\": eval_threshold_metrics\n",
"}"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -880,11 +885,9 @@
},
{
"cell_type": "code",
- "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [],
"source": [
"run_name = \"demo-run\"\n",
"experiment_name = \"demo-experiment\"\n",
@@ -898,7 +901,9 @@
" enable_caching=False,\n",
" namespace=\"kubeflow-user-example-com\"\n",
")"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -942,7 +947,7 @@
"
\n",
"\n",
"```bash\n",
- "$ kubectl -n mlflow port-forward svc/mlflow 5000:5000\n",
+ "kubectl -n mlflow port-forward svc/mlflow 5000:5000\n",
"```\n",
"\n",
"
\n",
diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/.gitignore b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/.gitignore
new file mode 100644
index 0000000..b4a2938
--- /dev/null
+++ b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/.gitignore
@@ -0,0 +1,2 @@
+components/*
+!components/.gitkeep
\ No newline at end of file
diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/README.md b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/README.md
new file mode 100644
index 0000000..5b57ad6
--- /dev/null
+++ b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/README.md
@@ -0,0 +1,9 @@
+# Demo pipeline (standalone KFP)
+
+Jupyter notebook with a demo pipeline that uses the installed standalone Kubeflow Pipelines (KFP), MLflow and Kserve components.
+
+> **NOTE:** This demo is intended for the standalone-KFP + Kserve deployment option.
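+
+To try the notebook locally, you can open it with Jupyter from this directory (a minimal sketch; it assumes a Python environment with pip and uses JupyterLab, though any Jupyter client works):
+
+```bash
+pip install jupyterlab
+jupyter lab demo-pipeline.ipynb
+```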
+
+
+
+
\ No newline at end of file
diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/components/.gitkeep b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/components/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/demo-pipeline.ipynb b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/demo-pipeline.ipynb
new file mode 100644
index 0000000..471da4e
--- /dev/null
+++ b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/demo-pipeline.ipynb
@@ -0,0 +1,779 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Demo KFP pipeline"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Install requirements:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "%%bash\n",
+ "\n",
+ "pip install kfp~=1.8.14"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Imports:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "import kfp\n",
+ "import kfp.dsl as dsl\n",
+ "from kfp.aws import use_aws_secret\n",
+ "from kfp.v2.dsl import (\n",
+ " component,\n",
+ " Input,\n",
+ " Output,\n",
+ " Dataset,\n",
+ " Metrics,\n",
+ " Artifact,\n",
+ " Model\n",
+ ")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## 1. Connect to client\n",
+ "\n",
+ "Run the following to port-forward to the KFP UI:\n",
+ "\n",
+ "```sh\n",
+ "kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8080:80\n",
+ "```\n",
+ "\n",
+ "Now the KFP UI should be reachable at [`http://localhost:8080`](http://localhost:8080)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import kfp\n",
+ "\n",
+ "KFP_ENDPOINT = \"http://localhost:8080\"\n",
+ "\n",
+ "client = kfp.Client(host=KFP_ENDPOINT)\n",
+ "# print(client.list_experiments())"
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## 2. Components\n",
+ "\n",
+ "There are different ways to define components in KFP. Here, we use the **@component** decorator to define the components as Python function-based components.\n",
+ "\n",
+    "The **@component** annotation converts the function into a factory function that creates pipeline steps that execute this function. This example also specifies the base container image to run your component in."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Pull data component:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "@component(\n",
+ " base_image=\"python:3.10\",\n",
+ " packages_to_install=[\"pandas~=1.4.2\"],\n",
+ " output_component_file='components/pull_data_component.yaml',\n",
+ ")\n",
+ "def pull_data(url: str, data: Output[Dataset]):\n",
+ " \"\"\"\n",
+ " Pull data component.\n",
+ " \"\"\"\n",
+ " import pandas as pd\n",
+ "\n",
+ " df = pd.read_csv(url, sep=\";\")\n",
+ " df.to_csv(data.path, index=None)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Preprocess component:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "@component(\n",
+ " base_image=\"python:3.10\",\n",
+ " packages_to_install=[\"pandas~=1.4.2\", \"scikit-learn~=1.0.2\"],\n",
+ " output_component_file='components/preprocess_component.yaml',\n",
+ ")\n",
+ "def preprocess(\n",
+ " data: Input[Dataset],\n",
+ " scaler_out: Output[Artifact],\n",
+ " train_set: Output[Dataset],\n",
+ " test_set: Output[Dataset],\n",
+ " target: str = \"quality\",\n",
+ "):\n",
+ " \"\"\"\n",
+ " Preprocess component.\n",
+ " \"\"\"\n",
+ " import pandas as pd\n",
+ " import pickle\n",
+ " from sklearn.model_selection import train_test_split\n",
+ " from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ " data = pd.read_csv(data.path)\n",
+ "\n",
+ " # Split the data into training and test sets. (0.75, 0.25) split.\n",
+ " train, test = train_test_split(data)\n",
+ "\n",
+ " scaler = StandardScaler()\n",
+ "\n",
+ " train[train.drop(target, axis=1).columns] = scaler.fit_transform(train.drop(target, axis=1))\n",
+ " test[test.drop(target, axis=1).columns] = scaler.transform(test.drop(target, axis=1))\n",
+ "\n",
+ " with open(scaler_out.path, 'wb') as fp:\n",
+ " pickle.dump(scaler, fp, pickle.HIGHEST_PROTOCOL)\n",
+ "\n",
+ " train.to_csv(train_set.path, index=None)\n",
+ " test.to_csv(test_set.path, index=None)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Train component:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "from typing import NamedTuple\n",
+ "\n",
+ "@component(\n",
+ " base_image=\"python:3.10\",\n",
+ " packages_to_install=[\"numpy\", \"pandas~=1.4.2\", \"scikit-learn~=1.0.2\", \"mlflow~=2.4.1\", \"boto3~=1.21.0\"],\n",
+ " output_component_file='components/train_component.yaml',\n",
+ ")\n",
+ "def train(\n",
+ " train_set: Input[Dataset],\n",
+ " test_set: Input[Dataset],\n",
+ " saved_model: Output[Model],\n",
+ " mlflow_experiment_name: str,\n",
+ " mlflow_tracking_uri: str,\n",
+ " mlflow_s3_endpoint_url: str,\n",
+ " model_name: str,\n",
+ " alpha: float,\n",
+ " l1_ratio: float,\n",
+ " target: str = \"quality\",\n",
+ ") -> NamedTuple(\"Output\", [('storage_uri', str), ('run_id', str),]):\n",
+ " \"\"\"\n",
+ " Train component.\n",
+ " \"\"\"\n",
+ " import numpy as np\n",
+ " import pandas as pd\n",
+ " from sklearn.linear_model import ElasticNet\n",
+ " from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
+ " import mlflow\n",
+ " import mlflow.sklearn\n",
+ " import os\n",
+ " import logging\n",
+ " import pickle\n",
+ " from collections import namedtuple\n",
+ "\n",
+ " logging.basicConfig(level=logging.INFO)\n",
+ " logger = logging.getLogger(__name__)\n",
+ "\n",
+ " def eval_metrics(actual, pred):\n",
+ " rmse = np.sqrt(mean_squared_error(actual, pred))\n",
+ " mae = mean_absolute_error(actual, pred)\n",
+ " r2 = r2_score(actual, pred)\n",
+ " return rmse, mae, r2\n",
+ "\n",
+ " os.environ['MLFLOW_S3_ENDPOINT_URL'] = mlflow_s3_endpoint_url\n",
+ "\n",
+ " # load data\n",
+ " train = pd.read_csv(train_set.path)\n",
+ " test = pd.read_csv(test_set.path)\n",
+ "\n",
+ " # The predicted column is \"quality\" which is a scalar from [3, 9]\n",
+ " train_x = train.drop([target], axis=1)\n",
+ " test_x = test.drop([target], axis=1)\n",
+ " train_y = train[[target]]\n",
+ " test_y = test[[target]]\n",
+ "\n",
+ " logger.info(f\"Using MLflow tracking URI: {mlflow_tracking_uri}\")\n",
+ " mlflow.set_tracking_uri(mlflow_tracking_uri)\n",
+ "\n",
+ " logger.info(f\"Using MLflow experiment: {mlflow_experiment_name}\")\n",
+ " mlflow.set_experiment(mlflow_experiment_name)\n",
+ "\n",
+ " with mlflow.start_run() as run:\n",
+ "\n",
+ " run_id = run.info.run_id\n",
+ " logger.info(f\"Run ID: {run_id}\")\n",
+ "\n",
+ " model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)\n",
+ "\n",
+ " logger.info(\"Fitting model...\")\n",
+ " model.fit(train_x, train_y)\n",
+ "\n",
+ " logger.info(\"Predicting...\")\n",
+ " predicted_qualities = model.predict(test_x)\n",
+ "\n",
+ " (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)\n",
+ "\n",
+ " logger.info(\"Elasticnet model (alpha=%f, l1_ratio=%f):\" % (alpha, l1_ratio))\n",
+ " logger.info(\" RMSE: %s\" % rmse)\n",
+ " logger.info(\" MAE: %s\" % mae)\n",
+ " logger.info(\" R2: %s\" % r2)\n",
+ "\n",
+ " logger.info(\"Logging parameters to MLflow\")\n",
+ " mlflow.log_param(\"alpha\", alpha)\n",
+ " mlflow.log_param(\"l1_ratio\", l1_ratio)\n",
+ " mlflow.log_metric(\"rmse\", rmse)\n",
+ " mlflow.log_metric(\"r2\", r2)\n",
+ " mlflow.log_metric(\"mae\", mae)\n",
+ "\n",
+ " # save model to mlflow\n",
+ " logger.info(\"Logging trained model\")\n",
+ " mlflow.sklearn.log_model(\n",
+ " model,\n",
+ " model_name,\n",
+ " registered_model_name=\"ElasticnetWineModel\",\n",
+ " serialization_format=\"pickle\"\n",
+ " )\n",
+ "\n",
+ " logger.info(\"Logging predictions artifact to MLflow\")\n",
+ " np.save(\"predictions.npy\", predicted_qualities)\n",
+ " mlflow.log_artifact(\n",
+ " local_path=\"predictions.npy\", artifact_path=\"predicted_qualities/\"\n",
+ " )\n",
+ "\n",
+ " # save model as KFP artifact\n",
+ " logging.info(f\"Saving model to: {saved_model.path}\")\n",
+ " with open(saved_model.path, 'wb') as fp:\n",
+ " pickle.dump(model, fp, pickle.HIGHEST_PROTOCOL)\n",
+ "\n",
+ " # prepare output\n",
+ " output = namedtuple('Output', ['storage_uri', 'run_id'])\n",
+ "\n",
+ " # return str(mlflow.get_artifact_uri())\n",
+ " return output(mlflow.get_artifact_uri(), run_id)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Evaluate component:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "@component(\n",
+ " base_image=\"python:3.10\",\n",
+ " packages_to_install=[\"numpy\", \"mlflow~=2.4.1\"],\n",
+ " output_component_file='components/evaluate_component.yaml',\n",
+ ")\n",
+ "def evaluate(\n",
+ " run_id: str,\n",
+ " mlflow_tracking_uri: str,\n",
+ " threshold_metrics: dict\n",
+ ") -> bool:\n",
+ " \"\"\"\n",
+ " Evaluate component: Compares metrics from training with given thresholds.\n",
+ "\n",
+ " Args:\n",
+ " run_id (string): MLflow run ID\n",
+ " mlflow_tracking_uri (string): MLflow tracking URI\n",
+    "        threshold_metrics (dict): Threshold values for each metric; evaluation fails if a training metric exceeds its threshold\n",
+ " Returns:\n",
+ " Bool indicating whether evaluation passed or failed.\n",
+ " \"\"\"\n",
+ " from mlflow.tracking import MlflowClient\n",
+ " import logging\n",
+ "\n",
+ " logging.basicConfig(level=logging.INFO)\n",
+ " logger = logging.getLogger(__name__)\n",
+ "\n",
+ " client = MlflowClient(tracking_uri=mlflow_tracking_uri)\n",
+ " info = client.get_run(run_id)\n",
+ " training_metrics = info.data.metrics\n",
+ "\n",
+ " logger.info(f\"Training metrics: {training_metrics}\")\n",
+ "\n",
+ " # compare the evaluation metrics with the defined thresholds\n",
+ " for key, value in threshold_metrics.items():\n",
+ " if key not in training_metrics or training_metrics[key] > value:\n",
+ " logger.error(f\"Metric {key} failed. Evaluation not passed!\")\n",
+ " return False\n",
+ " return True"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Deploy model component:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "@component(\n",
+ " base_image=\"python:3.9\",\n",
+ " packages_to_install=[\"kserve==0.11.0\"],\n",
+ " output_component_file='components/deploy_model_component.yaml',\n",
+ ")\n",
+ "def deploy_model(model_name: str, storage_uri: str):\n",
+ " \"\"\"\n",
+ " Deploy the model as an inference service with Kserve.\n",
+ " \"\"\"\n",
+ " import logging\n",
+ " from kubernetes import client\n",
+ " from kserve import KServeClient\n",
+ " from kserve import constants\n",
+ " from kserve import V1beta1InferenceService\n",
+ " from kserve import V1beta1InferenceServiceSpec\n",
+ " from kserve import V1beta1PredictorSpec\n",
+ " from kserve import V1beta1SKLearnSpec\n",
+ " from kubernetes.client import V1ResourceRequirements\n",
+ "\n",
+ " logging.basicConfig(level=logging.INFO)\n",
+ " logger = logging.getLogger(__name__)\n",
+ "\n",
+ " model_uri = f\"{storage_uri}/{model_name}\"\n",
+ " logger.info(f\"MODEL URI: {model_uri}\")\n",
+ "\n",
+ " namespace = 'kserve-inference'\n",
+ " kserve_version='v1beta1'\n",
+ " api_version = constants.KSERVE_GROUP + '/' + kserve_version\n",
+ "\n",
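+    "    # Build the InferenceService object; the CPU/memory requests and limits below keep the predictor small enough for a local cluster\n",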
+ " isvc = V1beta1InferenceService(\n",
+ " api_version = api_version,\n",
+ " kind = constants.KSERVE_KIND,\n",
+ " metadata = client.V1ObjectMeta(\n",
+ " name = model_name,\n",
+ " namespace = namespace,\n",
+ " annotations = {'sidecar.istio.io/inject':'false'}\n",
+ " ),\n",
+ " spec = V1beta1InferenceServiceSpec(\n",
+ " predictor=V1beta1PredictorSpec(\n",
+ " service_account_name=\"kserve-sa\",\n",
+ " min_replicas=1,\n",
+ " max_replicas = 1,\n",
+ " sklearn=V1beta1SKLearnSpec(\n",
+ " storage_uri=model_uri,\n",
+ " resources=V1ResourceRequirements(\n",
+ " requests={\"cpu\": \"100m\", \"memory\": \"512Mi\"},\n",
+ " limits={\"cpu\": \"300m\", \"memory\": \"512Mi\"}\n",
+ " )\n",
+ " ),\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ " KServe = KServeClient()\n",
+ " KServe.create(isvc)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Inference component:"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+    "@component(\n",
+ " base_image=\"python:3.9\", # kserve on python 3.10 comes with a dependency that fails to get installed\n",
+ " packages_to_install=[\"kserve==0.11.0\", \"scikit-learn~=1.0.2\"],\n",
+ " output_component_file='components/inference_component.yaml',\n",
+ ")\n",
+ "def inference(\n",
+ " model_name: str,\n",
+ " scaler_in: Input[Artifact]\n",
+ "):\n",
+ " \"\"\"\n",
+ " Test inference.\n",
+ " \"\"\"\n",
+ " from kserve import KServeClient\n",
+ " import requests\n",
+ " import pickle\n",
+ " import logging\n",
+ "\n",
+ " logging.basicConfig(level=logging.INFO)\n",
+ " logger = logging.getLogger(__name__)\n",
+ "\n",
+ " namespace = 'kserve-inference'\n",
+ " \n",
+ " input_sample = [[5.6, 0.54, 0.04, 1.7, 0.049, 5, 13, 0.9942, 3.72, 0.58, 11.4],\n",
+ " [11.3, 0.34, 0.45, 2, 0.082, 6, 15, 0.9988, 2.94, 0.66, 9.2]]\n",
+ "\n",
+ " logger.info(f\"Loading standard scaler from: {scaler_in.path}\")\n",
+ " with open(scaler_in.path, 'rb') as fp:\n",
+ " scaler = pickle.load(fp)\n",
+ "\n",
+ " logger.info(f\"Standardizing sample: {scaler_in.path}\")\n",
+ " input_sample = scaler.transform(input_sample)\n",
+ "\n",
+ " # get inference service\n",
+ " KServe = KServeClient()\n",
+ "\n",
+ " # wait for deployment to be ready\n",
+ " KServe.get(model_name, namespace=namespace, watch=True, timeout_seconds=120)\n",
+ "\n",
+ " inference_service = KServe.get(model_name, namespace=namespace)\n",
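+    "    # Requests go through the cluster-local Istio ingress gateway; KServe/Knative routes them to the model based on the Host header\n",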
+ " header = {\"Host\": f\"{model_name}.{namespace}.example.com\"}\n",
+ " is_url = f\"http://istio-ingressgateway.istio-system.svc.cluster.local:80/v1/models/{model_name}:predict\"\n",
+ " \n",
+ " logger.info(f\"\\nInference service status:\\n{inference_service['status']}\")\n",
+ " logger.info(f\"\\nInference service URL:\\n{is_url}\\n\")\n",
+ "\n",
+ " inference_input = {\n",
+ " 'instances': input_sample.tolist()\n",
+ " }\n",
+ " response = requests.post(\n",
+ " is_url,\n",
+ " json=inference_input,\n",
+ " headers=header,\n",
+ " )\n",
+ " if response.status_code != 200:\n",
+ " raise RuntimeError(f\"HTTP status code '{response.status_code}': {response.json()}\")\n",
+ " \n",
+ " logger.info(f\"\\nPrediction response:\\n{response.json()}\\n\")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## 3. Pipeline\n",
+ "\n",
+ "Pipeline definition:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "@dsl.pipeline(\n",
+ " name='demo-pipeline',\n",
+    "    description='An example pipeline that trains, evaluates, deploys, and tests a wine quality model.',\n",
+ ")\n",
+ "def pipeline(\n",
+ " url: str,\n",
+ " target: str,\n",
+ " mlflow_experiment_name: str,\n",
+ " mlflow_tracking_uri: str,\n",
+ " mlflow_s3_endpoint_url: str,\n",
+ " model_name: str,\n",
+ " alpha: float,\n",
+ " l1_ratio: float,\n",
+ " threshold_metrics: dict,\n",
+ "):\n",
+ " pull_task = pull_data(url=url)\n",
+ "\n",
+ " preprocess_task = preprocess(data=pull_task.outputs[\"data\"])\n",
+ "\n",
+ " train_task = train(\n",
+ " train_set=preprocess_task.outputs[\"train_set\"],\n",
+ " test_set=preprocess_task.outputs[\"test_set\"],\n",
+ " target=target,\n",
+ " mlflow_experiment_name=mlflow_experiment_name,\n",
+ " mlflow_tracking_uri=mlflow_tracking_uri,\n",
+ " mlflow_s3_endpoint_url=mlflow_s3_endpoint_url,\n",
+ " model_name=model_name,\n",
+ " alpha=alpha,\n",
+ " l1_ratio=l1_ratio\n",
+ " )\n",
+ " train_task.apply(use_aws_secret(secret_name=\"aws-secret\"))\n",
+ "\n",
+    "    evaluate_task = evaluate(\n",
+ " run_id=train_task.outputs[\"run_id\"],\n",
+ " mlflow_tracking_uri=mlflow_tracking_uri,\n",
+ " threshold_metrics=threshold_metrics\n",
+ " )\n",
+ "\n",
+    "    eval_passed = evaluate_task.output\n",
+ "\n",
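+    "    # Deploy and smoke-test the model only if the evaluation step passed\n",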
+ " with dsl.Condition(eval_passed == \"true\"):\n",
+ " deploy_model_task = deploy_model(\n",
+ " model_name=model_name,\n",
+ " storage_uri=train_task.outputs[\"storage_uri\"],\n",
+ " )\n",
+ "\n",
+ " inference_task = inference(\n",
+ " model_name=model_name,\n",
+ " scaler_in=preprocess_task.outputs[\"scaler_out\"]\n",
+ " )\n",
+ " inference_task.after(deploy_model_task)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "Pipeline arguments:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "# Specify pipeline argument values\n",
+ "\n",
+ "eval_threshold_metrics = {'rmse': 0.9, 'r2': 0.3, 'mae': 0.8}\n",
+ "\n",
+ "arguments = {\n",
+ " \"url\": \"http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv\",\n",
+ " \"target\": \"quality\",\n",
+ " \"mlflow_tracking_uri\": \"http://mlflow.mlflow.svc.cluster.local:5000\",\n",
+ " \"mlflow_s3_endpoint_url\": \"http://mlflow-minio-service.mlflow.svc.cluster.local:9000\",\n",
+ " \"mlflow_experiment_name\": \"demo-notebook\",\n",
+ " \"model_name\": \"wine-quality\",\n",
+ " \"alpha\": 0.5,\n",
+ " \"l1_ratio\": 0.5,\n",
+ " \"threshold_metrics\": eval_threshold_metrics\n",
+ "}"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## 4. Submit run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "run_name = \"demo-run\"\n",
+ "experiment_name = \"demo-experiment\"\n",
+ "\n",
+ "client.create_run_from_pipeline_func(\n",
+ " pipeline_func=pipeline,\n",
+ " run_name=run_name,\n",
+ " experiment_name=experiment_name,\n",
+ " arguments=arguments,\n",
+ " mode=kfp.dsl.PipelineExecutionMode.V2_COMPATIBLE,\n",
+ " enable_caching=False,\n",
+ ")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## 5. Check run"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "### Kubeflow Pipelines UI\n",
+ "\n",
+ "The default way of accessing KFP UI is via port-forward. This enables you to get started quickly without imposing any requirements on your environment. Run the following to port-forward KFP UI to local port `8080`:\n",
+ "\n",
+ "```sh\n",
+ "kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8080:80\n",
+ "```\n",
+ "\n",
+ "Now the KFP UI should be reachable at [`http://localhost:8080`](http://localhost:8080)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "### MLFlow UI\n",
+ "\n",
+ "To access MLFlow UI, open a terminal and forward a local port to MLFlow server:\n",
+ "\n",
+    "\n",
+ "\n",
+ "```bash\n",
+    "kubectl -n mlflow port-forward svc/mlflow 5000:5000\n",
+ "```\n",
+ "\n",
+    "\n",
+ "\n",
+ "Now MLFlow's UI should be reachable at [`http://localhost:5000`](http://localhost:5000)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": [
+ "## 6. Check deployed model\n",
+ "\n",
+ "```bash\n",
+ "# get inference services\n",
+ "kubectl -n kserve-inference get inferenceservice\n",
+ "\n",
+ "# get deployed model pods\n",
+ "kubectl -n kserve-inference get pods\n",
+ "\n",
+ "# delete inference service\n",
+ "kubectl -n kserve-inference delete inferenceservice wine-quality\n",
+ "```\n",
+    "\n",
+ "\n",
+ "If something goes wrong, check the logs with:\n",
+ "\n",
+    "\n",
+ "\n",
+ "```bash\n",
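+    "# find the pod name first with 'kubectl -n kserve-inference get pods', then check each container:\n",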
+    "kubectl logs -n kserve-inference <pod-name> -c kserve-container\n",
+ "\n",
+    "kubectl logs -n kserve-inference <pod-name> -c queue-proxy\n",
+ "\n",
+    "kubectl logs -n kserve-inference <pod-name> -c storage-initializer\n",
+ "```\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "iml4e",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.15 (default, Nov 24 2022, 08:57:44) \n[Clang 14.0.6 ]"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "2976e1db094957a35b33d12f80288a268286b510a60c0d029aa085f0b10be691"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/graph.png b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/graph.png
new file mode 100644
index 0000000..14bf10c
Binary files /dev/null and b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/graph.png differ
diff --git a/tutorials/gcp_quickstart/02_Deploy_the_stack.md b/tutorials/gcp_quickstart/02_Deploy_the_stack.md
index 121b443..ae1531e 100644
--- a/tutorials/gcp_quickstart/02_Deploy_the_stack.md
+++ b/tutorials/gcp_quickstart/02_Deploy_the_stack.md
@@ -5,12 +5,27 @@
- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/)
- [kustomize](https://kubectl.docs.kubernetes.io/installation/kustomize/)
+## 2. Choose the deployment option
+
+Choose the deployment option that best fits your needs:
+
+1. `kubeflow-monitoring`: Full Kubeflow deployment with all components.
+2. `kubeflow`: Full Kubeflow deployment without monitoring components (prometheus, grafana).
+3. `standalone-kfp-monitoring`: Standalone KFP deployment with monitoring components (prometheus, grafana).
+4. `standalone-kfp`: Standalone KFP deployment without monitoring components (prometheus, grafana).
+5. `standalone-kfp-kserve-monitoring`: Standalone KFP and Kserve deployment with monitoring components (prometheus, grafana).
+6. `standalone-kfp-kserve`: Standalone KFP and Kserve deployment without monitoring components (prometheus, grafana).
+
+```bash
+export DEPLOYMENT_OPTION=kubeflow-monitoring
+```
+
-## 2. Deploy the stack
+## 3. Deploy the stack
Deploy all the components of the platform with:
```bash
-while ! kustomize build deployment | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
+while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
```
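+
+To check that the components are coming up, you can watch the pods (a quick sanity check; which namespaces exist depends on the chosen deployment option):
+
+```bash
+kubectl get pods -n kubeflow
+kubectl get pods -n mlflow
+```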
## Troubleshooting
@@ -28,7 +43,7 @@ Race condition errors can occur when deploying Kubeflow. If this happens, delete
```bash
kubectl delete ns kubeflow
-while ! kustomize build deployment | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
+while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
```
-Sometimes, just deleting the failing pod, so that it get recreated, will fix the issue.
+Sometimes, just deleting the failing pod, so that it gets recreated, will fix the issue.
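+
+For example, to force a failing pod in the `kubeflow` namespace to be recreated (a sketch; substitute the actual pod name from the first command):
+
+```bash
+kubectl -n kubeflow get pods
+kubectl -n kubeflow delete pod <failing-pod-name>
+```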
diff --git a/tutorials/local_deployment/01_Setup_local_cluster.md b/tutorials/local_deployment/01_Setup_local_cluster.md
index 48a3c8f..da37881 100644
--- a/tutorials/local_deployment/01_Setup_local_cluster.md
+++ b/tutorials/local_deployment/01_Setup_local_cluster.md
@@ -21,7 +21,7 @@ sudo mv ./kind /usr/local/bin/kind
### 3. Create a cluster
```bash
-export CLUSTER_NAME="kind-ep"
+export CLUSTER_NAME="mlops-platform"
export HOST_IP="127.0.0.1" # cluster IP address
cat <