diff --git a/.gitignore b/.gitignore index 2bc18e4..9094443 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,5 @@ tutorials/openstack/secure.yaml # Others old/ istio* -temp/ \ No newline at end of file +temp/ +.platform/ \ No newline at end of file diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1b7e320..caa8276 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,7 +2,7 @@ This file lists all the individuals who have contributed to this project. Thanks to each and every one of you for your valuable contributions! -### Silo AI: +### [Silo AI](https://www.silo.ai/): Original project leads and main developers: @@ -40,12 +40,14 @@ Other developers and testers of the platform: - Kristian Sikiric - Kaustav Tamuly - Jonathan Burdge +- Ammar Aldhahyani -### IML4E project and other contributors: +### [IML4E](https://itea4.org/project/iml4e.html) project and other contributors: -Univerwity of Helsinki: +[University of Helsinki](https://www.helsinki.fi/en/researchgroups/empirical-software-engineering): - Niila Siilasjoki +- Dennis Muiruri Fraunhofer Institute: diff --git a/config.env b/config.env index c0a7bdc..90f77b7 100644 --- a/config.env +++ b/config.env @@ -1,4 +1,2 @@ HOST_IP="127.0.0.1" -CLUSTER_NAME="kind-ep" -INSTALL_LOCAL_REGISTRY="true" -INSTALL_RAY="false" \ No newline at end of file +CLUSTER_NAME="mlops-platform" \ No newline at end of file diff --git a/deployment/README.md b/deployment/README.md index fb32127..49a2fff 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -1,7 +1,19 @@ ## Deploy the stack +Choose the deployment option that best fits your needs: +1. `kubeflow-monitoring`: Full Kubeflow deployment with all components. +2. `kubeflow`: Full Kubeflow deployment without monitoring components (prometheus, grafana). +3. `standalone-kfp-monitoring`: Standalone KFP deployment. +4. `standalone-kfp`: Standalone KFP deployment without monitoring components (prometheus, grafana). +5. 
`standalone-kfp-kserve-monitoring`: Standalone KFP and Kserve deployment. +6. `standalone-kfp-kserve`: Standalone KFP and Kserve deployment without monitoring components (prometheus, grafana). + +```bash +export DEPLOYMENT_OPTION=kubeflow-monitoring +``` + Deploy to your kubernetes cluster with the following command: ```bash -while ! kustomize build deployment | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done +while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done ``` \ No newline at end of file diff --git a/deployment/kubeflow-custom/kserve-sa.yaml b/deployment/custom/kserve-custom/base/kserve-sa.yaml similarity index 83% rename from deployment/kubeflow-custom/kserve-sa.yaml rename to deployment/custom/kserve-custom/base/kserve-sa.yaml index dd2d9cb..a300f81 100644 --- a/deployment/kubeflow-custom/kserve-sa.yaml +++ b/deployment/custom/kserve-custom/base/kserve-sa.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: mysecret - namespace: kubeflow-user-example-com annotations: serving.kserve.io/s3-endpoint: mlflow-minio-service.mlflow.svc.cluster.local:9000 serving.kserve.io/s3-usehttps: "0" @@ -15,6 +14,5 @@ apiVersion: v1 kind: ServiceAccount metadata: name: kserve-sa - namespace: kubeflow-user-example-com secrets: - name: mysecret diff --git a/deployment/kubeflow-custom/kustomization.yaml b/deployment/custom/kserve-custom/base/kustomization.yaml similarity index 83% rename from deployment/kubeflow-custom/kustomization.yaml rename to deployment/custom/kserve-custom/base/kustomization.yaml index 2bf776f..12b6241 100644 --- a/deployment/kubeflow-custom/kustomization.yaml +++ b/deployment/custom/kserve-custom/base/kustomization.yaml @@ -2,5 +2,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: -- aws-secret.yaml - kserve-sa.yaml \ No newline at end of file diff --git 
a/deployment/custom/kserve-custom/env/kubeflow/kustomization.yaml b/deployment/custom/kserve-custom/env/kubeflow/kustomization.yaml new file mode 100644 index 0000000..0ad2c65 --- /dev/null +++ b/deployment/custom/kserve-custom/env/kubeflow/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kubeflow-user-example-com + +resources: +- ../../base diff --git a/deployment/custom/kserve-custom/env/standalone-kfp/kserve-inference-namespace.yaml b/deployment/custom/kserve-custom/env/standalone-kfp/kserve-inference-namespace.yaml new file mode 100644 index 0000000..950c92b --- /dev/null +++ b/deployment/custom/kserve-custom/env/standalone-kfp/kserve-inference-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kserve-inference \ No newline at end of file diff --git a/deployment/custom/kserve-custom/env/standalone-kfp/kustomization.yaml b/deployment/custom/kserve-custom/env/standalone-kfp/kustomization.yaml new file mode 100644 index 0000000..762483e --- /dev/null +++ b/deployment/custom/kserve-custom/env/standalone-kfp/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kserve-inference + +resources: + - ../../base + - kserve-inference-namespace.yaml diff --git a/deployment/kubeflow-custom/aws-secret.yaml b/deployment/custom/kubeflow-custom/base/aws-secret.yaml similarity index 86% rename from deployment/kubeflow-custom/aws-secret.yaml rename to deployment/custom/kubeflow-custom/base/aws-secret.yaml index b63fb1e..7203f52 100644 --- a/deployment/kubeflow-custom/aws-secret.yaml +++ b/deployment/custom/kubeflow-custom/base/aws-secret.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: Secret metadata: name: aws-secret - namespace: kubeflow-user-example-com type: Opaque data: # your BASE64 encoded AWS_ACCESS_KEY_ID diff --git a/deployment/custom/kubeflow-custom/base/kustomization.yaml 
b/deployment/custom/kubeflow-custom/base/kustomization.yaml new file mode 100644 index 0000000..e6ae779 --- /dev/null +++ b/deployment/custom/kubeflow-custom/base/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- aws-secret.yaml \ No newline at end of file diff --git a/deployment/custom/kubeflow-custom/env/kubeflow/kustomization.yaml b/deployment/custom/kubeflow-custom/env/kubeflow/kustomization.yaml new file mode 100644 index 0000000..0ad2c65 --- /dev/null +++ b/deployment/custom/kubeflow-custom/env/kubeflow/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kubeflow-user-example-com + +resources: +- ../../base diff --git a/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kserve-deployer.yaml b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kserve-deployer.yaml new file mode 100644 index 0000000..0ac0071 --- /dev/null +++ b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kserve-deployer.yaml @@ -0,0 +1,110 @@ +# Required for deploy model to have the necessary permissions to create inference services + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kserve-deployer +rules: + - verbs: + - '*' + apiGroups: + - '' + resources: + - secrets + - serviceaccounts + - verbs: + - get + - watch + - list + apiGroups: + - '' + resources: + - configmaps + - verbs: + - '*' + apiGroups: + - '' + resources: + - persistentvolumes + - persistentvolumeclaims + - verbs: + - create + - delete + - get + apiGroups: + - snapshot.storage.k8s.io + resources: + - volumesnapshots + - verbs: + - get + - list + - watch + - update + - patch + apiGroups: + - argoproj.io + resources: + - workflows + - verbs: + - '*' + apiGroups: + - '' + resources: + - pods + - pods/exec + - pods/log + - services + - verbs: + - '*' + apiGroups: + - '' + - apps + - extensions + resources: + - deployments + - replicasets 
+ - verbs: + - '*' + apiGroups: + - kubeflow.org + resources: + - '*' + - verbs: + - '*' + apiGroups: + - batch + resources: + - jobs + - verbs: + - '*' + apiGroups: + - machinelearning.seldon.io + resources: + - seldondeployments + - verbs: + - '*' + apiGroups: + - serving.kserve.io + resources: + - '*' + - verbs: + - '*' + apiGroups: + - networking.istio.io + resources: + - '*' +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pipeline-runner-binding-cluster + labels: + application-crd-id: kubeflow-pipelines +subjects: + - kind: ServiceAccount + name: pipeline-runner + namespace: kubeflow +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kserve-deployer \ No newline at end of file diff --git a/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kustomization.yaml b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kustomization.yaml new file mode 100644 index 0000000..1eec75e --- /dev/null +++ b/deployment/custom/kubeflow-custom/env/standalone-kfp-kserve/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kubeflow + +resources: + - ../../base + - kserve-deployer.yaml diff --git a/deployment/custom/kubeflow-custom/env/standalone-kfp/kustomization.yaml b/deployment/custom/kubeflow-custom/env/standalone-kfp/kustomization.yaml new file mode 100644 index 0000000..1241abe --- /dev/null +++ b/deployment/custom/kubeflow-custom/env/standalone-kfp/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: kubeflow + +resources: + - ../../base diff --git a/deployment/envs/kubeflow-monitoring/kustomization.yaml b/deployment/envs/kubeflow-monitoring/kustomization.yaml new file mode 100644 index 0000000..3e7c6ce --- /dev/null +++ b/deployment/envs/kubeflow-monitoring/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + 
+resources: +- ../../kubeflow/manifests/in-cluster-setup/kubeflow +- ../../custom/kubeflow-custom/env/kubeflow +- ../../custom/kserve-custom/env/kubeflow +- ../../mlflow/env/local +- ../../monitoring \ No newline at end of file diff --git a/deployment/envs/kubeflow/kustomization.yaml b/deployment/envs/kubeflow/kustomization.yaml new file mode 100644 index 0000000..c4c1ca5 --- /dev/null +++ b/deployment/envs/kubeflow/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../kubeflow/manifests/in-cluster-setup/kubeflow +- ../../custom/kubeflow-custom/env/kubeflow +- ../../custom/kserve-custom/env/kubeflow +- ../../mlflow/env/local \ No newline at end of file diff --git a/deployment/envs/standalone-kfp-kserve-monitoring/kustomization.yaml b/deployment/envs/standalone-kfp-kserve-monitoring/kustomization.yaml new file mode 100644 index 0000000..5d27568 --- /dev/null +++ b/deployment/envs/standalone-kfp-kserve-monitoring/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve +- ../../custom/kubeflow-custom/env/standalone-kfp-kserve +- ../../custom/kserve-custom/env/standalone-kfp +- ../../mlflow/env/local +- ../../monitoring \ No newline at end of file diff --git a/deployment/envs/standalone-kfp-kserve/kustomization.yaml b/deployment/envs/standalone-kfp-kserve/kustomization.yaml new file mode 100644 index 0000000..e757443 --- /dev/null +++ b/deployment/envs/standalone-kfp-kserve/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve +- ../../custom/kubeflow-custom/env/standalone-kfp-kserve +- ../../custom/kserve-custom/env/standalone-kfp +- ../../mlflow/env/local \ No newline at end of file diff --git 
a/deployment/envs/standalone-kfp-monitoring/kustomization.yaml b/deployment/envs/standalone-kfp-monitoring/kustomization.yaml new file mode 100644 index 0000000..696c68b --- /dev/null +++ b/deployment/envs/standalone-kfp-monitoring/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp +- ../../custom/kubeflow-custom/env/standalone-kfp +- ../../mlflow/env/local +- ../../monitoring \ No newline at end of file diff --git a/deployment/envs/standalone-kfp/kustomization.yaml b/deployment/envs/standalone-kfp/kustomization.yaml new file mode 100644 index 0000000..16cc703 --- /dev/null +++ b/deployment/envs/standalone-kfp/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../../kubeflow/manifests/in-cluster-setup/standalone-kfp +- ../../custom/kubeflow-custom/env/standalone-kfp +- ../../mlflow/env/local \ No newline at end of file diff --git a/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/README.md b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/README.md new file mode 100644 index 0000000..8dc1847 --- /dev/null +++ b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/README.md @@ -0,0 +1,10 @@ +# Kubeflow + +Components: +- Multiuser isolation +- Central Dashboard +- Jupyter Notebooks +- Kubeflow Pipelines (KFP) +- Kserve +- Katib +- TensorBoard \ No newline at end of file diff --git a/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/kustomization.yaml new file mode 100644 index 0000000..d3c11cf --- /dev/null +++ b/deployment/kubeflow/manifests/in-cluster-setup/kubeflow/kustomization.yaml @@ -0,0 +1,87 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +sortOptions: + order: legacy + legacySortOptions: + orderFirst: + - Namespace + - ResourceQuota + - 
StorageClass + - CustomResourceDefinition + - MutatingWebhookConfiguration + - ServiceAccount + - PodSecurityPolicy + - Role + - ClusterRole + - RoleBinding + - ClusterRoleBinding + - ConfigMap + - Secret + - Endpoints + - Service + - LimitRange + - PriorityClass + - PersistentVolume + - PersistentVolumeClaim + - Deployment + - StatefulSet + - CronJob + - PodDisruptionBudget + orderLast: + - ValidatingWebhookConfiguration + +resources: +# Cert-Manager +- ../../common/cert-manager/cert-manager/base +- ../../common/cert-manager/kubeflow-issuer/base +# Istio +- ../../common/istio-1-17/istio-crds/base +- ../../common/istio-1-17/istio-namespace/base +- ../../common/istio-1-17/istio-install/base +# OIDC Authservice +- ../../common/oidc-client/oidc-authservice/base +# Dex +- ../../common/dex/overlays/istio +# KNative +- ../../common/knative/knative-serving/overlays/gateways +- ../../common/knative/knative-eventing/base +- ../../common/istio-1-17/cluster-local-gateway/base +# Kubeflow namespace +- ../../common/kubeflow-namespace/base +# Kubeflow Roles +- ../../common/kubeflow-roles/base +# Kubeflow Istio Resources +- ../../common/istio-1-17/kubeflow-istio-resources/base + + +# Kubeflow Pipelines +- ../../apps/pipeline/upstream/env/cert-manager/platform-agnostic-multi-user +# Katib +- ../../apps/katib/upstream/installs/katib-with-kubeflow +# Central Dashboard +- ../../apps/centraldashboard/upstream/overlays/kserve +# Admission Webhook +- ../../apps/admission-webhook/upstream/overlays/cert-manager +# Jupyter Web App +- ../../apps/jupyter/jupyter-web-app/upstream/overlays/istio +# Notebook Controller +- ../../apps/jupyter/notebook-controller/upstream/overlays/kubeflow +# Profiles + KFAM +- ../../apps/profiles/upstream/overlays/kubeflow +# PVC Viewer +- ../../apps/pvcviewer-controller/upstream/base/ +# Volumes Web App +- ../../apps/volumes-web-app/upstream/overlays/istio +# Tensorboards Controller +- ../../apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow +# 
Tensorboard Web App +- ../../apps/tensorboard/tensorboards-web-app/upstream/overlays/istio +# Training Operator +- ../../apps/training-operator/upstream/overlays/kubeflow +# User namespace +- ../../common/user-namespace/base + +# KServe +- ../../contrib/kserve/kserve +- ../../contrib/kserve/models-web-app/overlays/kubeflow diff --git a/deployment/kubeflow/manifests/in-cluster-setup/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/kustomization.yaml deleted file mode 100644 index c1a8578..0000000 --- a/deployment/kubeflow/manifests/in-cluster-setup/kustomization.yaml +++ /dev/null @@ -1,87 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -sortOptions: - order: legacy - legacySortOptions: - orderFirst: - - Namespace - - ResourceQuota - - StorageClass - - CustomResourceDefinition - - MutatingWebhookConfiguration - - ServiceAccount - - PodSecurityPolicy - - Role - - ClusterRole - - RoleBinding - - ClusterRoleBinding - - ConfigMap - - Secret - - Endpoints - - Service - - LimitRange - - PriorityClass - - PersistentVolume - - PersistentVolumeClaim - - Deployment - - StatefulSet - - CronJob - - PodDisruptionBudget - orderLast: - - ValidatingWebhookConfiguration - -resources: -# Cert-Manager -- ../common/cert-manager/cert-manager/base -- ../common/cert-manager/kubeflow-issuer/base -# Istio -- ../common/istio-1-17/istio-crds/base -- ../common/istio-1-17/istio-namespace/base -- ../common/istio-1-17/istio-install/base -# OIDC Authservice -- ../common/oidc-client/oidc-authservice/base -# Dex -- ../common/dex/overlays/istio -# KNative -- ../common/knative/knative-serving/overlays/gateways -- ../common/knative/knative-eventing/base -- ../common/istio-1-17/cluster-local-gateway/base -# Kubeflow namespace -- ../common/kubeflow-namespace/base -# Kubeflow Roles -- ../common/kubeflow-roles/base -# Kubeflow Istio Resources -- ../common/istio-1-17/kubeflow-istio-resources/base - - -# Kubeflow Pipelines -- 
../apps/pipeline/upstream/env/cert-manager/platform-agnostic-multi-user -# Katib -- ../apps/katib/upstream/installs/katib-with-kubeflow -# Central Dashboard -- ../apps/centraldashboard/upstream/overlays/kserve -# Admission Webhook -- ../apps/admission-webhook/upstream/overlays/cert-manager -# Jupyter Web App -- ../apps/jupyter/jupyter-web-app/upstream/overlays/istio -# Notebook Controller -- ../apps/jupyter/notebook-controller/upstream/overlays/kubeflow -# Profiles + KFAM -- ../apps/profiles/upstream/overlays/kubeflow -# PVC Viewer -- ../apps/pvcviewer-controller/upstream/base/ -# Volumes Web App -- ../apps/volumes-web-app/upstream/overlays/istio -# Tensorboards Controller -- ../apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow -# Tensorboard Web App -- ../apps/tensorboard/tensorboards-web-app/upstream/overlays/istio -# Training Operator -- ../apps/training-operator/upstream/overlays/kubeflow -# User namespace -- ../common/user-namespace/base - -# KServe -- ../contrib/kserve/kserve -- ../contrib/kserve/models-web-app/overlays/kubeflow diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/README.md b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/README.md new file mode 100644 index 0000000..f954728 --- /dev/null +++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/README.md @@ -0,0 +1,5 @@ +# Standalone KFP + Kserve + +Components: +- Kubeflow Pipelines (KFP) +- Kserve \ No newline at end of file diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/kustomization.yaml new file mode 100644 index 0000000..d4e9892 --- /dev/null +++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp-kserve/kustomization.yaml @@ -0,0 +1,57 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +sortOptions: + order: legacy + legacySortOptions: + 
orderFirst: + - Namespace + - ResourceQuota + - StorageClass + - CustomResourceDefinition + - MutatingWebhookConfiguration + - ServiceAccount + - PodSecurityPolicy + - Role + - ClusterRole + - RoleBinding + - ClusterRoleBinding + - ConfigMap + - Secret + - Endpoints + - Service + - LimitRange + - PriorityClass + - PersistentVolume + - PersistentVolumeClaim + - Deployment + - StatefulSet + - CronJob + - PodDisruptionBudget + orderLast: + - ValidatingWebhookConfiguration + +resources: +# Cert-Manager +- ../../common/cert-manager/cert-manager/base +- ../../common/cert-manager/kubeflow-issuer/base + +# Istio +- ../../common/istio-1-17/istio-crds/base +- ../../common/istio-1-17/istio-namespace/base +- ../../common/istio-1-17/istio-install/base + +# KNative +- ../../common/knative/knative-serving/overlays/gateways +- ../../common/knative/knative-eventing/base +- ../../common/istio-1-17/cluster-local-gateway/base + +# Kubeflow Istio Resources +- ../../common/istio-1-17/kubeflow-istio-resources/base + +# Kubeflow Pipelines +- ../../apps/pipeline/upstream/cluster-scoped-resources +- ../../apps/pipeline/upstream/env/platform-agnostic-emissary + +# KServe +- ../../contrib/kserve/kserve \ No newline at end of file diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/README.md b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/README.md new file mode 100644 index 0000000..a2245f6 --- /dev/null +++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/README.md @@ -0,0 +1,4 @@ +# Standalone KFP + +Components: +- Kubeflow Pipelines (KFP) \ No newline at end of file diff --git a/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/kustomization.yaml b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/kustomization.yaml new file mode 100644 index 0000000..a92789a --- /dev/null +++ b/deployment/kubeflow/manifests/in-cluster-setup/standalone-kfp/kustomization.yaml @@ -0,0 +1,54 @@ +apiVersion: 
kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +sortOptions: + order: legacy + legacySortOptions: + orderFirst: + - Namespace + - ResourceQuota + - StorageClass + - CustomResourceDefinition + - MutatingWebhookConfiguration + - ServiceAccount + - PodSecurityPolicy + - Role + - ClusterRole + - RoleBinding + - ClusterRoleBinding + - ConfigMap + - Secret + - Endpoints + - Service + - LimitRange + - PriorityClass + - PersistentVolume + - PersistentVolumeClaim + - Deployment + - StatefulSet + - CronJob + - PodDisruptionBudget + orderLast: + - ValidatingWebhookConfiguration + +resources: +# Cert-Manager +- ../../common/cert-manager/cert-manager/base +- ../../common/cert-manager/kubeflow-issuer/base + +# Istio +- ../../common/istio-1-17/istio-crds/base +- ../../common/istio-1-17/istio-namespace/base +- ../../common/istio-1-17/istio-install/base + +# KNative +- ../../common/knative/knative-serving/overlays/gateways +- ../../common/knative/knative-eventing/base +- ../../common/istio-1-17/cluster-local-gateway/base + +# Kubeflow Istio Resources +- ../../common/istio-1-17/kubeflow-istio-resources/base + +# Kubeflow Pipelines +- ../../apps/pipeline/upstream/cluster-scoped-resources +- ../../apps/pipeline/upstream/env/platform-agnostic-emissary \ No newline at end of file diff --git a/deployment/kustomization.yaml b/deployment/kustomization.yaml deleted file mode 100644 index c5d654e..0000000 --- a/deployment/kustomization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -resources: -- ./kubeflow/manifests/in-cluster-setup -- ./kubeflow-custom -- ./mlflow/env/local -- ./monitoring \ No newline at end of file diff --git a/deployment/mlflow/base/config.env b/deployment/mlflow/base/config.env index 5d6dfc3..e24035e 100644 --- a/deployment/mlflow/base/config.env +++ b/deployment/mlflow/base/config.env @@ -4,4 +4,4 @@ DB_HOST=postgres DB_PORT=5432 DB_NAME=mlflow -DEFAULT_ARTIFACT_ROOT=gs://mlflow-platformv2 
+DEFAULT_ARTIFACT_ROOT=gs://mlflow-mlops-platform diff --git a/deployment/mlflow/env/gcp-cloudsql/params.env b/deployment/mlflow/env/gcp-cloudsql/params.env index 69446ff..f83af45 100644 --- a/deployment/mlflow/env/gcp-cloudsql/params.env +++ b/deployment/mlflow/env/gcp-cloudsql/params.env @@ -1 +1 @@ -GCP_CLOUDSQL_INSTANCE_NAME=mlops-platform-v2:europe-west1:mlops-platformv2 +GCP_CLOUDSQL_INSTANCE_NAME=mlops-platform-v2:europe-west1:mlops-platform diff --git a/deployment/monitoring/alert-manager/deployment.yaml b/deployment/monitoring/alert-manager/deployment.yaml index 2e44389..96028d2 100644 --- a/deployment/monitoring/alert-manager/deployment.yaml +++ b/deployment/monitoring/alert-manager/deployment.yaml @@ -25,7 +25,7 @@ spec: containerPort: 9093 resources: requests: - cpu: 500m + cpu: 250m memory: 500M limits: cpu: 1 diff --git a/deployment/monitoring/grafana/grafana-deployment.yaml b/deployment/monitoring/grafana/grafana-deployment.yaml index 71032e1..8e3144e 100644 --- a/deployment/monitoring/grafana/grafana-deployment.yaml +++ b/deployment/monitoring/grafana/grafana-deployment.yaml @@ -28,7 +28,7 @@ spec: cpu: "1000m" requests: memory: 500M - cpu: "500m" + cpu: "250m" volumeMounts: - mountPath: /var/lib/grafana name: grafana-storage diff --git a/scripts/create_cluster.sh b/scripts/create_cluster.sh index bbe9697..ae01799 100755 --- a/scripts/create_cluster.sh +++ b/scripts/create_cluster.sh @@ -1,11 +1,11 @@ #!/bin/bash -set -xeoa pipefail +set -eoa pipefail ####################################################################################### # Create and configure a cluster with Kind # -# Usage: $ export HOST_IP=127.0.0.1; export CLUSTER_NAME="kind-ep"; ./create_cluster.sh +# Usage: $ export HOST_IP=127.0.0.1; export CLUSTER_NAME="mlops-platform"; ./create_cluster.sh ####################################################################################### @@ -67,7 +67,7 @@ fi # see https://github.com/kubernetes-sigs/kind/issues/2586 -CONTAINER_ID=$(docker 
ps -aqf "name=kind-ep-control-plane") +CONTAINER_ID=$(docker ps -aqf "name=$CLUSTER_NAME-control-plane") docker exec -t ${CONTAINER_ID} bash -c "echo 'fs.inotify.max_user_watches=1048576' >> /etc/sysctl.conf" docker exec -t ${CONTAINER_ID} bash -c "echo 'fs.inotify.max_user_instances=512' >> /etc/sysctl.conf" docker exec -i ${CONTAINER_ID} bash -c "sysctl -p /etc/sysctl.conf" diff --git a/scripts/install_helm.sh b/scripts/install_helm.sh index 4a15055..ef44c43 100644 --- a/scripts/install_helm.sh +++ b/scripts/install_helm.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeo pipefail +set -eo pipefail function add_local_bin_to_path { # make sure ~/.local/bin is in $PATH diff --git a/scripts/install_local_registry.sh b/scripts/install_local_registry.sh index 4665d4b..3f1844b 100755 --- a/scripts/install_local_registry.sh +++ b/scripts/install_local_registry.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeoa pipefail +set -eoa pipefail ####################################################################################### # The following shell script will create a local docker registry and connect the diff --git a/scripts/install_ray.sh b/scripts/install_ray.sh index b53d539..1e1a864 100644 --- a/scripts/install_ray.sh +++ b/scripts/install_ray.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeo pipefail +set -eo pipefail helm repo add kuberay https://ray-project.github.io/kuberay-helm/ helm repo update diff --git a/scripts/install_tools.sh b/scripts/install_tools.sh index d4dc587..105415f 100755 --- a/scripts/install_tools.sh +++ b/scripts/install_tools.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeoa pipefail +set -eoa pipefail ####################################################################################### # CHECK PRE-REQUISITES diff --git a/scripts/install_tools_mac.sh b/scripts/install_tools_mac.sh index ad87a1b..a7c30c9 100644 --- a/scripts/install_tools_mac.sh +++ b/scripts/install_tools_mac.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeoa pipefail +set -eoa pipefail 
####################################################################################### # CHECK PRE-REQUISITES diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index ea1cf7d..0df3505 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -xeoa pipefail +set -eoa pipefail ####################################################################################### # RUN TESTS diff --git a/setup.md b/setup.md index bf89b2e..c424959 100644 --- a/setup.md +++ b/setup.md @@ -15,6 +15,19 @@ Install the experimentation platform with: > **WARNING:** Using the `--test` flag will install the `requirements-tests.txt` in your current python environment. +## Deployment options + +1. **Kubeflow:** Full Kubeflow deployment with all components. +2. **Kubeflow (without monitoring):** Full Kubeflow deployment without monitoring components (prometheus, grafana). +3. **Standalone KFP:** Standalone KFP deployment. +4. **Standalone KFP (without monitoring):** Standalone KFP deployment without monitoring components (prometheus, grafana). +5. **Standalone KFP and Kserve:** Standalone KFP and Kserve deployment. +6. **Standalone KFP and Kserve (without monitoring):** Standalone KFP and Kserve deployment without monitoring components (prometheus, grafana). + +> The minimum recommended machine requirements are: +> - **Kubeflow** options: 12 CPU cores, 25GB free disk space. +> - **Standalone KFP** options: 8 CPU cores, 18GB free disk space. + ## Test the deployment (manually) If you just deployed the platform, it will take a while to become ready. You can use @@ -40,15 +53,27 @@ pytest tests/ [-vrP] [--log-cli-level=INFO] *These are the same tests that are run automatically if you use the `--test` flag on installation.* -## Deleting the deployment +## Uninstall + +Uninstall the MLOps Platform with: -Delete the cluster: ```bash -# e.g. 
$ kind delete cluster --name kind-ep +./uninstall.sh +``` + +### Manual deletion + +The `uninstall.sh` script should delete everything, but if you need to manually remove the platform, you can do it with: + +```bash +# list kind clusters +kind get clusters + +# delete the kind cluster kind delete cluster --name [CLUSTER_NAME] ``` -If you also installed the local docker registry (`config.env` > `INSTALL_LOCAL_REGISTRY="true"`): +If you also installed the local docker registry: ```bash # check if it is running (kind-registry) @@ -68,20 +93,7 @@ docker rm -f $(docker ps -aqf "name=kind-registry") ### Error: namespace "kubeflow-user-example-com" not found This is not an error, and it is expected. Some of the things being deployed depend on other components, which need to be deployed and become ready first. -For example, the namespace `kubeflow-user-example-com` is created by a `kubeflow` component. That's why we deploy in a loop until everything is applied successfully: - -```bash -while true; do - if kubectl apply -f "$tmpfile"; then - echo "Resources successfully applied." - rm "$tmpfile" - break - else - echo "Retrying to apply resources. Be patient, this might take a while..." - sleep 10 - fi -done -``` +For example, the namespace `kubeflow-user-example-com` is created by a `kubeflow` component. That's why we deploy in a loop until everything is applied successfully. Once the main `kubeflow` deployment is ready, the `kubeflow-user-example-com` namespace will be created, and the command should finish successfully. 
diff --git a/setup.sh b/setup.sh index 2304afb..797b6db 100755 --- a/setup.sh +++ b/setup.sh @@ -1,8 +1,15 @@ #!/bin/bash -set -xeoa pipefail +set -eoa pipefail -source config.env +# Internal directory where to store platform settings +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +PLATFORM_DIR="$SCRIPT_DIR/.platform" +mkdir -p "$PLATFORM_DIR" +PLATFORM_CONFIG="$PLATFORM_DIR/.config" +cp "$SCRIPT_DIR/config.env" $PLATFORM_CONFIG + +source $PLATFORM_CONFIG RUN_TESTS=false LOG_LEVEL_TESTS="WARNING" @@ -23,13 +30,66 @@ echo Cluster name set to: "$CLUSTER_NAME" echo Host IP set to: "$HOST_IP" echo Run tests after installation set to: "$RUN_TESTS" +DEFAULT_DEPLOYMENT_OPTION="kubeflow-monitoring" +echo +echo "Please choose the deployment option:" +echo "[1] Kubeflow (all components)" +echo "[2] Kubeflow (without monitoring)" +echo "[3] Standalone KFP" +echo "[4] Standalone KFP (without monitoring)" +echo "[5] Standalone KFP and Kserve" +echo "[6] Standalone KFP and Kserve (without monitoring)" +read -p "Enter the number of your choice [1-6] (default is [1]): " choice +case "$choice" in + 1 ) DEPLOYMENT_OPTION="kubeflow-monitoring" ;; + 2 ) DEPLOYMENT_OPTION="kubeflow" ;; + 3 ) DEPLOYMENT_OPTION="standalone-kfp-monitoring" ;; + 4 ) DEPLOYMENT_OPTION="standalone-kfp" ;; + 5 ) DEPLOYMENT_OPTION="standalone-kfp-kserve-monitoring" ;; + 6 ) DEPLOYMENT_OPTION="standalone-kfp-kserve" ;; + * ) DEPLOYMENT_OPTION="$DEFAULT_DEPLOYMENT_OPTION" ;; +esac + +INSTALL_LOCAL_REGISTRY=true +echo +read -p "Install local Docker registry? (y/n) (default is [y]): " choice +case "$choice" in + n|N ) INSTALL_LOCAL_REGISTRY=false ;; + * ) INSTALL_LOCAL_REGISTRY=true ;; +esac + +INSTALL_RAY=false +echo +read -p "Install Ray? 
(It requires ~4 additional CPUs) (y/n) (default is [n]): " choice +case "$choice" in + y|Y ) INSTALL_RAY=true ;; + * ) INSTALL_RAY=false ;; +esac + +# Save selections to settings file +echo -e "\nDEPLOYMENT_OPTION=$DEPLOYMENT_OPTION" >> $PLATFORM_CONFIG +echo -e "\nINSTALL_LOCAL_REGISTRY=$INSTALL_LOCAL_REGISTRY" >> $PLATFORM_CONFIG +echo -e "\nINSTALL_RAY=$INSTALL_RAY" >> $PLATFORM_CONFIG + # CHECK DISK SPACE -RECOMMENDED_DISK_SPACE=26214400 -RECOMMENDED_DISK_SPACE_GB=$(($RECOMMENDED_DISK_SPACE / 1024 / 1024)) +RECOMMENDED_DISK_SPACE_KUBEFLOW=26214400 +RECOMMENDED_DISK_SPACE_KUBEFLOW_GB=$(($RECOMMENDED_DISK_SPACE_KUBEFLOW / 1024 / 1024)) +RECOMMENDED_DISK_SPACE_KFP=18874368 +RECOMMENDED_DISK_SPACE_KFP_GB=$(($RECOMMENDED_DISK_SPACE_KFP / 1024 / 1024)) + +if [[ $DEPLOYMENT_OPTION == *"kfp"* ]]; then + RECOMMENDED_DISK_SPACE=$RECOMMENDED_DISK_SPACE_KFP + RECOMMENDED_DISK_SPACE_GB=$RECOMMENDED_DISK_SPACE_KFP_GB +else + RECOMMENDED_DISK_SPACE=$RECOMMENDED_DISK_SPACE_KUBEFLOW + RECOMMENDED_DISK_SPACE_GB=$RECOMMENDED_DISK_SPACE_KUBEFLOW_GB +fi DISK_SPACE=$(df -k . | awk -F ' ' '{print $4}' | sed -n '2 p') DISK_SPACE_GB=$(($DISK_SPACE / 1024 / 1024)) +# TODO: Set required depending on the deployment, ray, etc. + if [[ DISK_SPACE < $RECOMMENDED_DISK_SPACE ]]; then echo "WARNING: Not enough disk space detected!" echo "The recommended is > ${RECOMMENDED_DISK_SPACE_GB} GB of disk space. You have ${DISK_SPACE_GB} GB." 
@@ -44,7 +104,19 @@ if [[ DISK_SPACE < $RECOMMENDED_DISK_SPACE ]]; then fi # CHECK CPU COUNT -RECOMMENDED_CPUS=16 +RECOMMENDED_CPUS_KUBEFLOW=12 +RECOMMENDED_CPUS_KFP=8 +EXTRA_RAY_CPUS=4 + +if [[ $DEPLOYMENT_OPTION == *"kfp"* ]]; then + RECOMMENDED_CPUS=$RECOMMENDED_CPUS_KFP +else + RECOMMENDED_CPUS=$RECOMMENDED_CPUS_KUBEFLOW +fi + +if [ "$INSTALL_RAY" = true ]; then + RECOMMENDED_CPUS=$(($RECOMMENDED_CPUS + $EXTRA_RAY_CPUS)) +fi # Detect the OS OS=$(uname) @@ -59,12 +131,13 @@ fi if [[ $CPU_COUNT -lt $RECOMMENDED_CPUS ]]; then echo "WARNING: Not enough CPU cores detected!" - echo "The recommended is >= ${RECOMMENDED_CPUS} CPU cores. You have ${CPU_COUNT} cores." + echo "The recommended is >= ${RECOMMENDED_CPUS} CPU cores for this deployment configuration. You have ${CPU_COUNT} cores." while true; do read -p "Do you want to continue with the installation? (y/n): " yn case $yn in [Yy]* ) break;; [Nn]* ) exit 1;; + "" ) echo "Please enter a response.";; * ) echo "Please answer yes or no.";; esac done @@ -72,9 +145,9 @@ fi # INSTALL TOOLS if [[ "$(uname)" == "Darwin" ]]; then - bash scripts/install_tools_mac.sh # Using default bash because /bin/bash is an old version (3) + bash "$SCRIPT_DIR/scripts/install_tools_mac.sh" # Using default bash because /bin/bash is an old version (3) else - /bin/bash scripts/install_tools.sh + /bin/bash "$SCRIPT_DIR/scripts/install_tools.sh" fi # CREATE CLUSTER @@ -83,30 +156,61 @@ function fail { exit "${2-1}" ## Return a code specified by $2, or 1 by default. } -/bin/bash scripts/create_cluster.sh || fail +# Check if the kind cluster already exists +if kind get clusters | grep -q "^$CLUSTER_NAME$"; then + echo + echo "Kind cluster with name \"$CLUSTER_NAME\" already exists. It can be deleted with the following command: kind delete cluster --name $CLUSTER_NAME" + while true; do + read -p "Do you want to continue the installation on the existing cluster? 
(y/n): " choice + case "$choice" in + y|Y ) echo "Using existing kind cluster..."; break;; + n|N ) exit 0 ;; + * ) echo "Invalid response. Please enter y or n." ;; + "" ) echo "Please enter a response." ;; + esac + done +else + echo "Creating kind cluster..." + /bin/bash "$SCRIPT_DIR/scripts/create_cluster.sh" +fi kubectl cluster-info --context kind-$CLUSTER_NAME # DEPLOY LOCAL DOCKER REGISTRY if [ "$INSTALL_LOCAL_REGISTRY" = true ]; then - /bin/bash scripts/install_local_registry.sh + /bin/bash "$SCRIPT_DIR/scripts/install_local_registry.sh" fi # DEPLOY STACK kubectl config use-context kind-$CLUSTER_NAME -# Create a temporary file -tmpfile=$(mktemp) # Build the kustomization and store the output in the temporary file -kustomize build deployment > "$tmpfile" - +tmp_file=$(mktemp) +DEPLOYMENT_ROOT="$SCRIPT_DIR/deployment/envs/$DEPLOYMENT_OPTION" +echo "Deployment root set to: $DEPLOYMENT_ROOT" +echo +echo "Building manifests..." +kustomize build $DEPLOYMENT_ROOT > "$tmp_file" +echo "Manifests built successfully." +echo +echo "Applying resources..." while true; do - if kubectl apply -f "$tmpfile"; then + if kubectl apply -f "$tmp_file"; then echo "Resources successfully applied." - rm "$tmpfile" + rm "$tmp_file" break else - echo "Retrying to apply resources. Be patient, this might take a while..." + echo + echo "Retrying to apply resources." + echo "Be patient, this might take a while... (Errors are expected until all resources are available!)" + echo + echo "Help:" + echo " If the errors persists, please check the pods status with: kubectl get pods --all-namespaces" + echo " All pods should be either in Running state, or ContainerCreating if they are still starting up." 
+ echo " Check specific pod errors with: kubectl describe pod -n [NAMESPACE] [POD_NAME]" + echo " For further help, see the Troubleshooting section in setup.md" + echo + sleep 10 fi done @@ -114,17 +218,17 @@ done # DEPLOY RAY if [ "$INSTALL_RAY" = true ]; then echo "Installing Ray" - /bin/bash scripts/install_helm.sh - /bin/bash scripts/install_ray.sh + /bin/bash "$SCRIPT_DIR/scripts/install_helm.sh" + /bin/bash "$SCRIPT_DIR/scripts/install_ray.sh" fi echo -echo Installation completed! +echo "Installation completed!" echo # TESTS if [ "$RUN_TESTS" = "true" ]; then - /bin/bash scripts/run_tests.sh + /bin/bash "$SCRIPT_DIR/scripts/run_tests.sh" fi exit 0 diff --git a/tests/conftest.py b/tests/conftest.py index 1df8804..38728fa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,10 @@ from dotenv import load_dotenv import os -ENV_FILE = pathlib.Path(__file__).parent.parent / "config.env" +from .utils import parse_bool + +ENV_FILE = pathlib.Path(__file__).parent.parent / ".platform/.config" +assert ENV_FILE.exists(), f"File not found: {ENV_FILE} (autogenerated by the platform on installation)" # noqa load_dotenv(dotenv_path=ENV_FILE) CLUSTER_NAME = os.getenv("CLUSTER_NAME") @@ -14,8 +17,8 @@ assert HOST_IP is not None # MLFLOW -MLFLOW_ENV_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "config.env" -MLFLOW_SECRETS_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "secret.env" +MLFLOW_ENV_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "config.env" # noqa +MLFLOW_SECRETS_FILE = pathlib.Path(__file__).parent.parent / "deployment/mlflow/env/local" / "secret.env" # noqa load_dotenv(dotenv_path=MLFLOW_ENV_FILE, override=True) AWS_ACCESS_KEY_ID = os.getenv("MINIO_ACCESS_KEY") @@ -25,6 +28,9 @@ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") assert AWS_SECRET_ACCESS_KEY is not None +IS_STANDALONE_KFP = "kfp" in os.environ.get('DEPLOYMENT_OPTION') 
+SKIP_LOCAL_REGISTRY = not parse_bool(os.environ.get('INSTALL_LOCAL_REGISTRY')) + def pytest_sessionstart(session): """ diff --git a/tests/resources/kfp/build_image.sh b/tests/resources/kfp/build_image.sh index 6ced22e..779bec1 100755 --- a/tests/resources/kfp/build_image.sh +++ b/tests/resources/kfp/build_image.sh @@ -12,7 +12,7 @@ cd "$(dirname "$0")" docker build -t "$FULL_IMAGE_NAME" . -# load the image into the local "kind" cluster with name "kind-ep" +# load the image into the local "kind" cluster kind load docker-image "$FULL_IMAGE_NAME" --name $CLUSTER_NAME # to push the image to a remote repository instead diff --git a/tests/resources/registry/build_push_image.sh b/tests/resources/registry/build_push_image.sh index 513de3c..14bf947 100755 --- a/tests/resources/registry/build_push_image.sh +++ b/tests/resources/registry/build_push_image.sh @@ -14,8 +14,5 @@ cd "$(dirname "$0")" docker build -t "$FULL_IMAGE_NAME" . -# load the image into the local "kind" cluster with name "kind-ep" -#kind load docker-image "$FULL_IMAGE_NAME" --name kind-ep - # to push the image to a remote repository instead docker push "$FULL_IMAGE_NAME" \ No newline at end of file diff --git a/tests/test_kfp.py b/tests/test_kfp.py index a4e5dea..2c292e5 100644 --- a/tests/test_kfp.py +++ b/tests/test_kfp.py @@ -1,3 +1,4 @@ +import os import subprocess import logging import pathlib @@ -9,7 +10,7 @@ import requests from urllib.parse import urlsplit -from .conftest import CLUSTER_NAME +from .conftest import CLUSTER_NAME, IS_STANDALONE_KFP logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -22,7 +23,7 @@ KUBEFLOW_ENDPOINT = "http://localhost:8080" KUBEFLOW_USERNAME = "user@example.com" KUBEFLOW_PASSWORD = "12341234" -NAMESPACE = "kubeflow-user-example-com" +KUBEFLOW_USER_NAMESPACE = "kubeflow-user-example-com" def get_istio_auth_session(url: str, username: str, password: str) -> dict: @@ -44,10 +45,8 @@ def get_istio_auth_session(url: str, username: str, password: 
str) -> dict: "is_secured": None, # True if KF endpoint is secured "session_cookie": None # Resulting session cookies in the form "key1=value1; key2=value2" } - # use a persistent session (for cookies) with requests.Session() as s: - ################ # Determine if Endpoint is Secured ################ @@ -56,7 +55,6 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict: raise RuntimeError( f"HTTP status code '{resp.status_code}' for GET against: {url}" ) - auth_session["redirect_url"] = resp.url # if we were NOT redirected, then the endpoint is UNSECURED @@ -101,7 +99,6 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict: f"HTTP status code '{resp.status_code}' " f"for GET against: {redirect_url_obj.geturl()}" ) - # set the login url auth_session["dex_login_url"] = resp.url @@ -118,7 +115,6 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict: f"Login credentials were probably invalid - " f"No redirect after POST to: {auth_session['dex_login_url']}" ) - # store the session cookies in a "key1=value1; key2=value2" string auth_session["session_cookie"] = "; ".join( [f"{c.name}={c.value}" for c in s.cookies] @@ -128,43 +124,37 @@ def get_istio_auth_session(url: str, username: str, password: str) -> dict: def run_pipeline(pipeline_file: str, experiment_name: str): - - with subprocess.Popen(["kubectl", "-n", "istio-system", "port-forward", "svc/istio-ingressgateway", "8080:80"], stdout=True) as proc: + """Run a pipeline on a Kubeflow cluster.""" + with subprocess.Popen(["kubectl", "-n", "istio-system", "port-forward", "svc/istio-ingressgateway", "8080:80"], stdout=True) as proc: # noqa: E501 try: time.sleep(2) # give some time to the port-forward connection - auth_session = get_istio_auth_session( url=KUBEFLOW_ENDPOINT, username=KUBEFLOW_USERNAME, password=KUBEFLOW_PASSWORD ) - client = kfp.Client( host=f"{KUBEFLOW_ENDPOINT}/pipeline", cookies=auth_session["session_cookie"], - namespace=NAMESPACE, 
+ namespace=KUBEFLOW_USER_NAMESPACE, ) - created_run = client.create_run_from_pipeline_package( pipeline_file=pipeline_file, enable_caching=False, arguments={}, run_name="kfp_test_run", experiment_name=experiment_name, - namespace=NAMESPACE + namespace=KUBEFLOW_USER_NAMESPACE ) - run_id = created_run.run_id - logger.info(f"Submitted run with ID: {run_id}") - logger.info(f"Waiting for run {run_id} to complete....") run_detail = created_run.wait_for_run_completion() _handle_job_end(run_detail) # clean up experiment = client.get_experiment( - experiment_name=experiment_name, namespace=NAMESPACE + experiment_name=experiment_name, namespace=KUBEFLOW_USER_NAMESPACE ) client.delete_experiment(experiment.id) logger.info("Done") @@ -176,16 +166,46 @@ def run_pipeline(pipeline_file: str, experiment_name: str): proc.terminate() +def run_pipeline_standalone_kfp(pipeline_file: str, experiment_name: str): + """Run a pipeline on a standalone Kubeflow Pipelines cluster.""" + with subprocess.Popen(["kubectl", "-n", "kubeflow", "port-forward", "svc/ml-pipeline-ui", "8080:80"], stdout=True) as proc: # noqa: E501 + try: + time.sleep(2) # give some time to the port-forward connection + + client = kfp.Client( + host=f"{KUBEFLOW_ENDPOINT}/pipeline", + ) + created_run = client.create_run_from_pipeline_package( + pipeline_file=pipeline_file, + enable_caching=False, + arguments={}, + run_name="kfp_test_run", + experiment_name=experiment_name, + ) + run_id = created_run.run_id + logger.info(f"Submitted run with ID: {run_id}") + logger.info(f"Waiting for run {run_id} to complete....") + run_detail = created_run.wait_for_run_completion() + _handle_job_end(run_detail) + + # clean up + experiment = client.get_experiment(experiment_name=experiment_name) + client.delete_experiment(experiment.id) + logger.info("Done") + + except Exception as e: + logger.error(f"ERROR: {e}") + raise e + finally: + proc.terminate() + + def _handle_job_end(run_detail): finished_run = run_detail.to_dict()["run"] - 
created_at = finished_run["created_at"] finished_at = finished_run["finished_at"] - duration_secs = (finished_at - created_at).total_seconds() - status = finished_run["status"] - logger.info(f"Run finished in {round(duration_secs)} seconds with status: {status}") if status != "Succeeded": @@ -196,7 +216,6 @@ def build_load_image(): output = subprocess.check_output( ["docker", "exec", f"{CLUSTER_NAME}-control-plane", "crictl", "images"] ) - if IMAGE_NAME in output.decode(): logging.info(f"Image already in cluster.") else: @@ -206,14 +225,25 @@ def build_load_image(): @pytest.mark.order(6) @pytest.mark.timeout(240) +@pytest.mark.skipif(IS_STANDALONE_KFP, reason="It is not Kubeflow") def test_run_pipeline(): - # build the base docker image and load it into the cluster build_load_image() - # submit and run pipeline run_pipeline(pipeline_file=str(PIPELINE_FILE), experiment_name=EXPERIMENT_NAME) +@pytest.mark.order(6) +@pytest.mark.timeout(240) +@pytest.mark.skipif(not IS_STANDALONE_KFP, reason="It is not standalone KFP") +def test_run_pipeline_standalone_kfp(): + # build the base docker image and load it into the cluster + build_load_image() + # submit and run pipeline + run_pipeline_standalone_kfp( + pipeline_file=str(PIPELINE_FILE), experiment_name=EXPERIMENT_NAME + ) + + if __name__ == "__main__": test_run_pipeline() diff --git a/tests/test_registry.py b/tests/test_registry.py index e13cf83..dce083c 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -2,17 +2,16 @@ import logging import pathlib import pytest -import os from envsubst import envsubst -from .conftest import HOST_IP -from .test_kfp import run_pipeline +from .conftest import HOST_IP, IS_STANDALONE_KFP, SKIP_LOCAL_REGISTRY +from .test_kfp import run_pipeline, run_pipeline_standalone_kfp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -BUILD_FILE = pathlib.Path(__file__).parent / "resources" / "registry" / "build_push_image.sh" -PIPELINE_TEMPLATE = 
pathlib.Path(__file__).parent / "resources" / "registry" / "pipeline.yaml.template" +BUILD_FILE = pathlib.Path(__file__).parent / "resources" / "registry" / "build_push_image.sh" # noqa +PIPELINE_TEMPLATE = pathlib.Path(__file__).parent / "resources" / "registry" / "pipeline.yaml.template" # noqa IMAGE_NAME = "kfp-registry-test-image" EXPERIMENT_NAME = "Test Experiment (Registry)" @@ -32,10 +31,7 @@ def render_pipeline_yaml(output: str): @pytest.mark.order(7) -@pytest.mark.skipif( - os.environ.get('INSTALL_LOCAL_REGISTRY') == 'false', - reason="No local image registry was installed." -) +@pytest.mark.skipif(SKIP_LOCAL_REGISTRY, reason="No local image registry was installed") def test_push_image(): # build the base docker image and load it into the cluster build_push_image() @@ -43,18 +39,29 @@ def test_push_image(): @pytest.mark.order(8) @pytest.mark.timeout(120) -@pytest.mark.skipif( - os.environ.get('INSTALL_LOCAL_REGISTRY') == 'false', - reason="No local image registry was installed." 
-) +@pytest.mark.skipif(SKIP_LOCAL_REGISTRY, reason="No local image registry was installed") +@pytest.mark.skipif(IS_STANDALONE_KFP, reason="It is not Kubeflow") def test_run_pipeline_using_registry(tmp_path): - # build the base docker image and load it into the cluster build_push_image() - # create pipeline.yaml with the right registry IP address pipeline_file = tmp_path / "pipeline.yaml" render_pipeline_yaml(output=str(pipeline_file)) - # submit and run pipeline run_pipeline(pipeline_file=str(pipeline_file), experiment_name=EXPERIMENT_NAME) + + +@pytest.mark.order(8) +@pytest.mark.timeout(120) +@pytest.mark.skipif(SKIP_LOCAL_REGISTRY, reason="No local image registry was installed") +@pytest.mark.skipif(not IS_STANDALONE_KFP, reason="It is not standalone KFP") +def test_run_pipeline_standalone_kfp_using_registry(tmp_path): + # build the base docker image and load it into the cluster + build_push_image() + # create pipeline.yaml with the right registry IP address + pipeline_file = tmp_path / "pipeline.yaml" + render_pipeline_yaml(output=str(pipeline_file)) + # submit and run pipeline + run_pipeline_standalone_kfp( + pipeline_file=str(pipeline_file), experiment_name=EXPERIMENT_NAME + ) diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..6bd5ea6 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,16 @@ +from typing import Union + + +def parse_bool(val: Union[str, bool]) -> bool: + """Convert a string representation of truth to True or False. + True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values + are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if + 'val' is anything else. 
+ """ + val = val.lower() + if val in ('y', 'yes', 't', 'true', 'on', '1', True): + return True + elif val in ('n', 'no', 'f', 'false', 'off', '0', False): + return False + else: + raise ValueError(f"Invalid truth value {val}") diff --git a/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb b/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb index 4f9967e..bb94984 100644 --- a/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb +++ b/tutorials/demo_notebooks/demo_fairness_and_energy_monitoring/demo-pipeline-with-fairness-and-energy-monitoring.ipynb @@ -778,10 +778,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "ef37a9bc", - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T09:16:14.457318Z", + "start_time": "2024-04-13T09:16:14.094491Z" + } + }, "source": [ "@component(\n", " base_image=\"python:3.9\",\n", @@ -800,6 +803,7 @@ " from kserve import V1beta1InferenceServiceSpec\n", " from kserve import V1beta1PredictorSpec\n", " from kserve import V1beta1SKLearnSpec\n", + " from kubernetes.client import V1ResourceRequirements\n", " import logging\n", " \n", " deploy_model_component_landmark = 'KFP_component'\n", @@ -815,25 +819,45 @@ " api_version = constants.KSERVE_GROUP + '/' + kserve_version\n", "\n", " isvc = V1beta1InferenceService(\n", - " api_version=api_version,\n", - " kind=constants.KSERVE_KIND,\n", - " metadata=client.V1ObjectMeta(\n", - " name=model_name,\n", - " namespace=namespace,\n", - " annotations={'sidecar.istio.io/inject':'false'}\n", + " api_version = api_version,\n", + " kind = constants.KSERVE_KIND,\n", + " metadata = client.V1ObjectMeta(\n", + " name = model_name,\n", + " namespace = namespace,\n", + " annotations = {'sidecar.istio.io/inject':'false'}\n", " ),\n", - " 
spec=V1beta1InferenceServiceSpec(\n", + " spec = V1beta1InferenceServiceSpec(\n", " predictor=V1beta1PredictorSpec(\n", " service_account_name=\"kserve-sa\",\n", + " min_replicas=1,\n", + " max_replicas = 1,\n", " sklearn=V1beta1SKLearnSpec(\n", - " storage_uri=model_uri\n", - " )\n", + " storage_uri=model_uri,\n", + " resources=V1ResourceRequirements(\n", + " requests={\"cpu\": \"100m\", \"memory\": \"512Mi\"},\n", + " limits={\"cpu\": \"300m\", \"memory\": \"512Mi\"}\n", + " )\n", + " ),\n", " )\n", " )\n", " )\n", " KServe = KServeClient()\n", " KServe.create(isvc)" - ] + ], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'component' is not defined", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;129m@component\u001B[39m(\n\u001B[1;32m 2\u001B[0m base_image\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpython:3.9\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 3\u001B[0m packages_to_install\u001B[38;5;241m=\u001B[39m[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mkserve=0.12.0\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 4\u001B[0m output_component_file\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcomponents/deploy_model_component.yaml\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[1;32m 5\u001B[0m )\n\u001B[1;32m 6\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdeploy_model\u001B[39m(model_name: \u001B[38;5;28mstr\u001B[39m, storage_uri: \u001B[38;5;28mstr\u001B[39m):\n\u001B[1;32m 7\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;124;03m Deploy the model as a inference service with Kserve.\u001B[39;00m\n\u001B[1;32m 9\u001B[0m \u001B[38;5;124;03m 
\"\"\"\u001B[39;00m\n\u001B[1;32m 10\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mkubernetes\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m client\n", + "\u001B[0;31mNameError\u001B[0m: name 'component' is not defined" + ] + } + ], + "execution_count": 1 }, { "cell_type": "markdown", @@ -853,10 +877,13 @@ }, { "cell_type": "code", - "execution_count": null, "id": "b90d1839", - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T09:16:14.719526Z", + "start_time": "2024-04-13T09:16:14.679920Z" + } + }, "source": [ "@component(\n", " base_image=\"python:3.9\", # kserve on python 3.10 comes with a dependency that fails to get installed\n", @@ -1025,7 +1052,21 @@ " if response.status_code != 200:\n", " raise RuntimeError(f\"HTTP status code '{response.status_code}': {response.json()}\")\n", " logger.info(f\"\\nPrediction response:\\n{response.text}\\n\")" - ] + ], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'component' is not defined", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[2], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;129m@component\u001B[39m(\n\u001B[1;32m 2\u001B[0m base_image\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpython:3.9\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;66;03m# kserve on python 3.10 comes with a dependency that fails to get installed\u001B[39;00m\n\u001B[1;32m 3\u001B[0m packages_to_install\u001B[38;5;241m=\u001B[39m[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mkserve\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mscikit-learn~=1.0.2\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 4\u001B[0m 
output_component_file\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcomponents/inference_component.yaml\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[1;32m 5\u001B[0m )\n\u001B[1;32m 6\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21minference\u001B[39m(\n\u001B[1;32m 7\u001B[0m model_name: \u001B[38;5;28mstr\u001B[39m\n\u001B[1;32m 8\u001B[0m ):\n\u001B[1;32m 9\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 10\u001B[0m \u001B[38;5;124;03m Test inference.\u001B[39;00m\n\u001B[1;32m 11\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m 12\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mkserve\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m KServeClient\n", + "\u001B[0;31mNameError\u001B[0m: name 'component' is not defined" + ] + } + ], + "execution_count": 2 }, { "cell_type": "markdown", @@ -1311,7 +1352,7 @@ "If not, removing the cluster and reinstalling everything is usually easier. A more surgical approach is to use kubectl to check the logs of pods in the given namespace and configure used YAMLs to fix the issue. When the modified YAMLs have been saved, just apply them and then rollout restart the pods. It is recommended to stop any dashboards before doing this. 
The required commands cluster removal, and kubectl fixing are:\n", "\n", "Cluster removal:\n", - "- Cluster deletion = kind delete cluster --name kind-ep\n", + "- Cluster deletion = kind delete cluster --name mlops-platform\n", "- Registry deletion = docker rm -f $(docker ps -aqf \"name=kind-registry\")\n", "\n", "Optional docker clean up:\n", diff --git a/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb b/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb index d338ba5..f700667 100644 --- a/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb +++ b/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb @@ -21,16 +21,16 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "%%bash\n", "\n", "pip install kfp~=1.8.14" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -43,11 +43,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", @@ -64,7 +62,9 @@ " Artifact,\n", " Model\n", ")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -83,8 +83,6 @@ }, { "cell_type": "code", - "execution_count": null, - "outputs": [], "source": [ "import re\n", "import requests\n", @@ -189,12 +187,12 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": null, - "outputs": [], "source": [ "import kfp\n", "\n", @@ -213,7 +211,9 @@ ], "metadata": { "collapsed": false - } + }, + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -239,11 +239,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "@component(\n", " base_image=\"python:3.10\",\n", @@ -258,7 +256,9 @@ "\n", " df = pd.read_csv(url, sep=\";\")\n", " df.to_csv(data.path, 
index=None)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -271,11 +271,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "@component(\n", " base_image=\"python:3.10\",\n", @@ -312,7 +310,9 @@ "\n", " train.to_csv(train_set.path, index=None)\n", " test.to_csv(test_set.path, index=None)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -325,11 +325,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "from typing import NamedTuple\n", "\n", @@ -443,7 +441,9 @@ "\n", " # return str(mlflow.get_artifact_uri())\n", " return output(mlflow.get_artifact_uri(), run_id)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -456,11 +456,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "@component(\n", " base_image=\"python:3.10\",\n", @@ -500,7 +498,9 @@ " logger.error(f\"Metric {key} failed. 
Evaluation not passed!\")\n", " return False\n", " return True" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -513,11 +513,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "@component(\n", " base_image=\"python:3.9\",\n", @@ -526,7 +524,7 @@ ")\n", "def deploy_model(model_name: str, storage_uri: str):\n", " \"\"\"\n", - " Deploy the model as a inference service with Kserve.\n", + " Deploy the model as an inference service with Kserve.\n", " \"\"\"\n", " import logging\n", " from kubernetes import client\n", @@ -537,6 +535,7 @@ " from kserve import V1beta1InferenceServiceSpec\n", " from kserve import V1beta1PredictorSpec\n", " from kserve import V1beta1SKLearnSpec\n", + " from kubernetes.client import V1ResourceRequirements\n", "\n", " logging.basicConfig(level=logging.INFO)\n", " logger = logging.getLogger(__name__)\n", @@ -544,32 +543,38 @@ " model_uri = f\"{storage_uri}/{model_name}\"\n", " logger.info(f\"MODEL URI: {model_uri}\")\n", "\n", - " # namespace = 'kserve-inference'\n", " namespace = utils.get_default_target_namespace()\n", " kserve_version='v1beta1'\n", " api_version = constants.KSERVE_GROUP + '/' + kserve_version\n", "\n", - "\n", " isvc = V1beta1InferenceService(\n", - " api_version=api_version,\n", - " kind=constants.KSERVE_KIND,\n", - " metadata=client.V1ObjectMeta(\n", - " name=model_name,\n", - " namespace=namespace,\n", - " annotations={'sidecar.istio.io/inject':'false'}\n", + " api_version = api_version,\n", + " kind = constants.KSERVE_KIND,\n", + " metadata = client.V1ObjectMeta(\n", + " name = model_name,\n", + " namespace = namespace,\n", + " annotations = {'sidecar.istio.io/inject':'false'}\n", " ),\n", - " spec=V1beta1InferenceServiceSpec(\n", + " spec = V1beta1InferenceServiceSpec(\n", " predictor=V1beta1PredictorSpec(\n", " service_account_name=\"kserve-sa\",\n", + " min_replicas=1,\n", + " max_replicas = 1,\n", " 
sklearn=V1beta1SKLearnSpec(\n", - " storage_uri=model_uri\n", - " )\n", + " storage_uri=model_uri,\n", + " resources=V1ResourceRequirements(\n", + " requests={\"cpu\": \"100m\", \"memory\": \"512Mi\"},\n", + " limits={\"cpu\": \"300m\", \"memory\": \"512Mi\"}\n", + " )\n", + " ),\n", " )\n", " )\n", " )\n", " KServe = KServeClient()\n", " KServe.create(isvc)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -582,11 +587,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "@component(\n", " base_image=\"python:3.9\", # kserve on python 3.10 comes with a dependency that fails to get installed\n", @@ -748,8 +751,8 @@ " logger.info(f\"\\nInference service URL:\\n{is_url}\\n\")\n", "\n", " inference_input = {\n", - " 'instances': input_sample.tolist()\n", - " }\n", + " 'instances': input_sample.tolist()\n", + " }\n", " response = requests.post(\n", " is_url,\n", " json=inference_input,\n", @@ -761,7 +764,9 @@ " raise RuntimeError(f\"HTTP status code '{response.status_code}': {response.json()}\")\n", " \n", " logger.info(f\"\\nPrediction response:\\n{response.json()}\\n\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -776,11 +781,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "@dsl.pipeline(\n", " name='demo-pipeline',\n", @@ -833,7 +836,9 @@ " scaler_in=preprocess_task.outputs[\"scaler_out\"]\n", " )\n", " inference_task.after(deploy_model_task)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -846,11 +851,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "# Specify pipeline argument values\n", "\n", @@ -867,7 +870,9 @@ " \"l1_ratio\": 0.5,\n", " \"threshold_metrics\": eval_threshold_metrics\n", "}" - ] + ], + "outputs": [], + 
"execution_count": null }, { "cell_type": "markdown", @@ -880,11 +885,9 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [], "source": [ "run_name = \"demo-run\"\n", "experiment_name = \"demo-experiment\"\n", @@ -898,7 +901,9 @@ " enable_caching=False,\n", " namespace=\"kubeflow-user-example-com\"\n", ")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -942,7 +947,7 @@ "
\n", "\n", "```bash\n", - "$ kubectl -n mlflow port-forward svc/mlflow 5000:5000\n", + "kubectl -n mlflow port-forward svc/mlflow 5000:5000\n", "```\n", "\n", "
\n", diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/.gitignore b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/.gitignore new file mode 100644 index 0000000..b4a2938 --- /dev/null +++ b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/.gitignore @@ -0,0 +1,2 @@ +components/* +components/.gitkeep \ No newline at end of file diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/README.md b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/README.md new file mode 100644 index 0000000..5b57ad6 --- /dev/null +++ b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/README.md @@ -0,0 +1,9 @@ +# Demo pipeline (standalone KFP) + +Jupyter notebook with a demo pipeline that uses the installed standalone Kubeflow Pipelines (KFP), MLflow and Kserve components. + +> **NOTE:** This demo is intended for the standalone-KFP + Kserve deployment option. + +
+ +![Pipeline Graph](graph.png) \ No newline at end of file diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/components/.gitkeep b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/components/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/demo-pipeline.ipynb b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/demo-pipeline.ipynb new file mode 100644 index 0000000..471da4e --- /dev/null +++ b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/demo-pipeline.ipynb @@ -0,0 +1,779 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "# Demo KFP pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Install requirements:" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false + }, + "source": [ + "%%bash\n", + "\n", + "pip install kfp~=1.8.14" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Imports:" + ] + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false + }, + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "import kfp\n", + "import kfp.dsl as dsl\n", + "from kfp.aws import use_aws_secret\n", + "from kfp.v2.dsl import (\n", + " component,\n", + " Input,\n", + " Output,\n", + " Dataset,\n", + " Metrics,\n", + " Artifact,\n", + " Model\n", + ")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 1. Connect to client\n", + "\n", + "Run the following to port-forward to the KFP UI:\n", + "\n", + "```sh\n", + "kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8080:80\n", + "```\n", + "\n", + "Now the KFP UI should be reachable at [`http://localhost:8080`](http://localhost:8080)." 
# Demo KFP pipeline (standalone KFP + MLflow + KServe), reconstructed from the
# notebook cells: connect to the KFP endpoint, define the pipeline components,
# assemble the pipeline, and submit a run.
#
# Fixes vs. the notebook source:
#   * removed a stray leading space before the `inference` component's
#     `@component(` decorator (it would raise IndentationError at cell run time)
#   * renamed the misspelled local `evaluate_trask` -> `evaluate_task`

import warnings
warnings.filterwarnings("ignore")

from typing import NamedTuple

import kfp
import kfp.dsl as dsl
from kfp.aws import use_aws_secret
from kfp.v2.dsl import (
    component,
    Input,
    Output,
    Dataset,
    Metrics,
    Artifact,
    Model,
)

# ---------------------------------------------------------------------------
# 1. Connect to client
#
# Port-forward the KFP UI first:
#   kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8080:80
# ---------------------------------------------------------------------------
KFP_ENDPOINT = "http://localhost:8080"

client = kfp.Client(host=KFP_ENDPOINT)
# print(client.list_experiments())


# ---------------------------------------------------------------------------
# 2. Components
#
# Python function-based components: @component turns each function into a
# factory that creates pipeline steps running inside the given base image.
# ---------------------------------------------------------------------------

@component(
    base_image="python:3.10",
    packages_to_install=["pandas~=1.4.2"],
    output_component_file='components/pull_data_component.yaml',
)
def pull_data(url: str, data: Output[Dataset]):
    """
    Pull data component.

    Downloads a semicolon-separated CSV from `url` and writes it to the
    `data` output artifact.
    """
    import pandas as pd

    df = pd.read_csv(url, sep=";")
    df.to_csv(data.path, index=None)


@component(
    base_image="python:3.10",
    packages_to_install=["pandas~=1.4.2", "scikit-learn~=1.0.2"],
    output_component_file='components/preprocess_component.yaml',
)
def preprocess(
    data: Input[Dataset],
    scaler_out: Output[Artifact],
    train_set: Output[Dataset],
    test_set: Output[Dataset],
    target: str = "quality",
):
    """
    Preprocess component.

    Splits the dataset into train/test sets, standard-scales all feature
    columns (the scaler is fit on train only, then applied to test), and
    persists the fitted scaler for later use at inference time.
    """
    import pandas as pd
    import pickle
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    data = pd.read_csv(data.path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    scaler = StandardScaler()

    # Scale every column except the target; fit on train, transform test.
    train[train.drop(target, axis=1).columns] = scaler.fit_transform(train.drop(target, axis=1))
    test[test.drop(target, axis=1).columns] = scaler.transform(test.drop(target, axis=1))

    # Persist the fitted scaler so the inference step can reuse it.
    with open(scaler_out.path, 'wb') as fp:
        pickle.dump(scaler, fp, pickle.HIGHEST_PROTOCOL)

    train.to_csv(train_set.path, index=None)
    test.to_csv(test_set.path, index=None)


@component(
    base_image="python:3.10",
    packages_to_install=["numpy", "pandas~=1.4.2", "scikit-learn~=1.0.2", "mlflow~=2.4.1", "boto3~=1.21.0"],
    output_component_file='components/train_component.yaml',
)
def train(
    train_set: Input[Dataset],
    test_set: Input[Dataset],
    saved_model: Output[Model],
    mlflow_experiment_name: str,
    mlflow_tracking_uri: str,
    mlflow_s3_endpoint_url: str,
    model_name: str,
    alpha: float,
    l1_ratio: float,
    target: str = "quality",
) -> NamedTuple("Output", [('storage_uri', str), ('run_id', str),]):
    """
    Train component.

    Fits an ElasticNet regressor, logs params/metrics/model to MLflow, and
    also saves the model as a KFP artifact.

    Returns:
        NamedTuple with the MLflow artifact URI (`storage_uri`) and the
        MLflow `run_id`, consumed by the deploy and evaluate steps.
    """
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    import mlflow
    import mlflow.sklearn
    import os
    import logging
    import pickle
    from collections import namedtuple

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    # MLflow's S3 client (for MinIO) reads the endpoint from the environment.
    os.environ['MLFLOW_S3_ENDPOINT_URL'] = mlflow_s3_endpoint_url

    # load data
    train = pd.read_csv(train_set.path)
    test = pd.read_csv(test_set.path)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop([target], axis=1)
    test_x = test.drop([target], axis=1)
    train_y = train[[target]]
    test_y = test[[target]]

    logger.info(f"Using MLflow tracking URI: {mlflow_tracking_uri}")
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    logger.info(f"Using MLflow experiment: {mlflow_experiment_name}")
    mlflow.set_experiment(mlflow_experiment_name)

    with mlflow.start_run() as run:

        run_id = run.info.run_id
        logger.info(f"Run ID: {run_id}")

        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)

        logger.info("Fitting model...")
        model.fit(train_x, train_y)

        logger.info("Predicting...")
        predicted_qualities = model.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        logger.info("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        logger.info("  RMSE: %s" % rmse)
        logger.info("  MAE: %s" % mae)
        logger.info("  R2: %s" % r2)

        logger.info("Logging parameters to MLflow")
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # save model to mlflow
        logger.info("Logging trained model")
        mlflow.sklearn.log_model(
            model,
            model_name,
            registered_model_name="ElasticnetWineModel",
            serialization_format="pickle"
        )

        logger.info("Logging predictions artifact to MLflow")
        np.save("predictions.npy", predicted_qualities)
        mlflow.log_artifact(
            local_path="predictions.npy", artifact_path="predicted_qualities/"
        )

        # save model as KFP artifact
        logging.info(f"Saving model to: {saved_model.path}")
        with open(saved_model.path, 'wb') as fp:
            pickle.dump(model, fp, pickle.HIGHEST_PROTOCOL)

        # prepare output
        output = namedtuple('Output', ['storage_uri', 'run_id'])

        return output(mlflow.get_artifact_uri(), run_id)


@component(
    base_image="python:3.10",
    packages_to_install=["numpy", "mlflow~=2.4.1"],
    output_component_file='components/evaluate_component.yaml',
)
def evaluate(
    run_id: str,
    mlflow_tracking_uri: str,
    threshold_metrics: dict
) -> bool:
    """
    Evaluate component: Compares metrics from training with given thresholds.

    Args:
        run_id (string): MLflow run ID
        mlflow_tracking_uri (string): MLflow tracking URI
        threshold_metrics (dict): Minimum threshold values for each metric
    Returns:
        Bool indicating whether evaluation passed or failed.
    """
    from mlflow.tracking import MlflowClient
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    client = MlflowClient(tracking_uri=mlflow_tracking_uri)
    info = client.get_run(run_id)
    training_metrics = info.data.metrics

    logger.info(f"Training metrics: {training_metrics}")

    # compare the evaluation metrics with the defined thresholds
    # NOTE(review): every threshold is treated as a *maximum* (fail when
    # metric > value). That is sensible for rmse/mae but looks inverted for
    # r2, where higher is better — confirm intended semantics with the
    # pipeline authors before changing.
    for key, value in threshold_metrics.items():
        if key not in training_metrics or training_metrics[key] > value:
            logger.error(f"Metric {key} failed. Evaluation not passed!")
            return False
    return True


@component(
    base_image="python:3.9",
    packages_to_install=["kserve==0.11.0"],
    output_component_file='components/deploy_model_component.yaml',
)
def deploy_model(model_name: str, storage_uri: str):
    """
    Deploy the model as an inference service with Kserve.
    """
    import logging
    from kubernetes import client
    from kserve import KServeClient
    from kserve import constants
    from kserve import V1beta1InferenceService
    from kserve import V1beta1InferenceServiceSpec
    from kserve import V1beta1PredictorSpec
    from kserve import V1beta1SKLearnSpec
    from kubernetes.client import V1ResourceRequirements

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    model_uri = f"{storage_uri}/{model_name}"
    logger.info(f"MODEL URI: {model_uri}")

    namespace = 'kserve-inference'
    kserve_version='v1beta1'
    api_version = constants.KSERVE_GROUP + '/' + kserve_version

    isvc = V1beta1InferenceService(
        api_version = api_version,
        kind = constants.KSERVE_KIND,
        metadata = client.V1ObjectMeta(
            name = model_name,
            namespace = namespace,
            # Disable Istio sidecar injection for the predictor pod.
            annotations = {'sidecar.istio.io/inject':'false'}
        ),
        spec = V1beta1InferenceServiceSpec(
            predictor=V1beta1PredictorSpec(
                # Service account carrying the MinIO S3 credentials.
                service_account_name="kserve-sa",
                min_replicas=1,
                max_replicas = 1,
                sklearn=V1beta1SKLearnSpec(
                    storage_uri=model_uri,
                    resources=V1ResourceRequirements(
                        requests={"cpu": "100m", "memory": "512Mi"},
                        limits={"cpu": "300m", "memory": "512Mi"}
                    )
                ),
            )
        )
    )
    KServe = KServeClient()
    KServe.create(isvc)


@component(
    base_image="python:3.9",  # kserve on python 3.10 comes with a dependency that fails to get installed
    packages_to_install=["kserve==0.11.0", "scikit-learn~=1.0.2"],
    output_component_file='components/inference_component.yaml',
)
def inference(
    model_name: str,
    scaler_in: Input[Artifact]
):
    """
    Test inference.

    Scales a fixed sample with the scaler produced by `preprocess`, waits
    for the inference service to become ready, then POSTs the sample to the
    KServe predict endpoint via the Istio ingress gateway.
    """
    from kserve import KServeClient
    import requests
    import pickle
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    namespace = 'kserve-inference'

    input_sample = [[5.6, 0.54, 0.04, 1.7, 0.049, 5, 13, 0.9942, 3.72, 0.58, 11.4],
                    [11.3, 0.34, 0.45, 2, 0.082, 6, 15, 0.9988, 2.94, 0.66, 9.2]]

    logger.info(f"Loading standard scaler from: {scaler_in.path}")
    with open(scaler_in.path, 'rb') as fp:
        scaler = pickle.load(fp)

    logger.info(f"Standardizing sample: {scaler_in.path}")
    input_sample = scaler.transform(input_sample)

    # get inference service
    KServe = KServeClient()

    # wait for deployment to be ready
    KServe.get(model_name, namespace=namespace, watch=True, timeout_seconds=120)

    inference_service = KServe.get(model_name, namespace=namespace)
    # Route through the ingress gateway; the Host header selects the service.
    header = {"Host": f"{model_name}.{namespace}.example.com"}
    is_url = f"http://istio-ingressgateway.istio-system.svc.cluster.local:80/v1/models/{model_name}:predict"

    logger.info(f"\nInference service status:\n{inference_service['status']}")
    logger.info(f"\nInference service URL:\n{is_url}\n")

    inference_input = {
        'instances': input_sample.tolist()
    }
    response = requests.post(
        is_url,
        json=inference_input,
        headers=header,
    )
    if response.status_code != 200:
        raise RuntimeError(f"HTTP status code '{response.status_code}': {response.json()}")

    logger.info(f"\nPrediction response:\n{response.json()}\n")


# ---------------------------------------------------------------------------
# 3. Pipeline
# ---------------------------------------------------------------------------

@dsl.pipeline(
    name='demo-pipeline',
    description='An example pipeline that performs addition calculations.',
)
def pipeline(
    url: str,
    target: str,
    mlflow_experiment_name: str,
    mlflow_tracking_uri: str,
    mlflow_s3_endpoint_url: str,
    model_name: str,
    alpha: float,
    l1_ratio: float,
    threshold_metrics: dict,
):
    pull_task = pull_data(url=url)

    preprocess_task = preprocess(data=pull_task.outputs["data"])

    train_task = train(
        train_set=preprocess_task.outputs["train_set"],
        test_set=preprocess_task.outputs["test_set"],
        target=target,
        mlflow_experiment_name=mlflow_experiment_name,
        mlflow_tracking_uri=mlflow_tracking_uri,
        mlflow_s3_endpoint_url=mlflow_s3_endpoint_url,
        model_name=model_name,
        alpha=alpha,
        l1_ratio=l1_ratio
    )
    # Mount the MinIO credentials so the train step can push artifacts.
    train_task.apply(use_aws_secret(secret_name="aws-secret"))

    evaluate_task = evaluate(
        run_id=train_task.outputs["run_id"],
        mlflow_tracking_uri=mlflow_tracking_uri,
        threshold_metrics=threshold_metrics
    )

    eval_passed = evaluate_task.output

    # Only deploy + smoke-test when the evaluation gate passed.
    with dsl.Condition(eval_passed == "true"):
        deploy_model_task = deploy_model(
            model_name=model_name,
            storage_uri=train_task.outputs["storage_uri"],
        )

        inference_task = inference(
            model_name=model_name,
            scaler_in=preprocess_task.outputs["scaler_out"]
        )
        inference_task.after(deploy_model_task)


# ---------------------------------------------------------------------------
# Pipeline arguments
# ---------------------------------------------------------------------------

# Specify pipeline argument values

eval_threshold_metrics = {'rmse': 0.9, 'r2': 0.3, 'mae': 0.8}

arguments = {
    "url": "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    "target": "quality",
    "mlflow_tracking_uri": "http://mlflow.mlflow.svc.cluster.local:5000",
    "mlflow_s3_endpoint_url": "http://mlflow-minio-service.mlflow.svc.cluster.local:9000",
    "mlflow_experiment_name": "demo-notebook",
    "model_name": "wine-quality",
    "alpha": 0.5,
    "l1_ratio": 0.5,
    "threshold_metrics": eval_threshold_metrics
}

# ---------------------------------------------------------------------------
# 4. Submit run
#
# Standalone KFP: no user namespace argument (unlike the full-Kubeflow
# variant of this notebook, which submits into kubeflow-user-example-com).
# ---------------------------------------------------------------------------

run_name = "demo-run"
experiment_name = "demo-experiment"

client.create_run_from_pipeline_func(
    pipeline_func=pipeline,
    run_name=run_name,
    experiment_name=experiment_name,
    arguments=arguments,
    mode=kfp.dsl.PipelineExecutionMode.V2_COMPATIBLE,
    enable_caching=False,
)
Run the following to port-forward KFP UI to local port `8080`:\n", + "\n", + "```sh\n", + "kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8080:80\n", + "```\n", + "\n", + "Now the KFP UI should be reachable at [`http://localhost:8080`](http://localhost:8080)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### MLFlow UI\n", + "\n", + "To access MLFlow UI, open a terminal and forward a local port to MLFlow server:\n", + "\n", + "
\n", + "\n", + "```bash\n", + "$ kubectl -n mlflow port-forward svc/mlflow 5000:5000\n", + "```\n", + "\n", + "
\n", + "\n", + "Now MLFlow's UI should be reachable at [`http://localhost:5000`](http://localhost:5000)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 6. Check deployed model\n", + "\n", + "```bash\n", + "# get inference services\n", + "kubectl -n kserve-inference get inferenceservice\n", + "\n", + "# get deployed model pods\n", + "kubectl -n kserve-inference get pods\n", + "\n", + "# delete inference service\n", + "kubectl -n kserve-inference delete inferenceservice wine-quality\n", + "```\n", + "
\n", + "\n", + "If something goes wrong, check the logs with:\n", + "\n", + "
\n", + "\n", + "```bash\n", + "kubectl logs -n kserve-inference kserve-container\n", + "\n", + "kubectl logs -n kserve-inference queue-proxy\n", + "\n", + "kubectl logs -n kserve-inference storage-initializer\n", + "```\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "iml4e", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15 (default, Nov 24 2022, 08:57:44) \n[Clang 14.0.6 ]" + }, + "vscode": { + "interpreter": { + "hash": "2976e1db094957a35b33d12f80288a268286b510a60c0d029aa085f0b10be691" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/graph.png b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/graph.png new file mode 100644 index 0000000..14bf10c Binary files /dev/null and b/tutorials/demo_notebooks/demo_pipeline_standalone_kfp/graph.png differ diff --git a/tutorials/gcp_quickstart/02_Deploy_the_stack.md b/tutorials/gcp_quickstart/02_Deploy_the_stack.md index 121b443..ae1531e 100644 --- a/tutorials/gcp_quickstart/02_Deploy_the_stack.md +++ b/tutorials/gcp_quickstart/02_Deploy_the_stack.md @@ -5,12 +5,27 @@ - [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) - [kustomize](https://kubectl.docs.kubernetes.io/installation/kustomize/) +## 2. Choose the deployment option + +Choose the deployment option that best fits your needs: + +1. `kubeflow-monitoring`: Full Kubeflow deployment with all components. +2. `kubeflow`: Full Kubeflow deployment without monitoring components (prometheus, grafana). +3. `standalone-kfp-monitoring`: Standalone KFP deployment. +4. `standalone-kfp`: Standalone KFP deployment without monitoring components (prometheus, grafana). +5. 
`standalone-kfp-kserve-monitoring`: Standalone KFP and Kserve deployment. +6. `standalone-kfp-kserve`: Standalone KFP and Kserve deployment without monitoring components (prometheus, grafana). + +```bash +export DEPLOYMENT_OPTION=kubeflow-monitoring +``` + ## 2. Deploy the stack Deploy all the components of the platform with: ```bash -while ! kustomize build deployment | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done +while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done ``` ## Troubleshooting @@ -28,7 +43,7 @@ Race condition errors can occur when deploying Kubeflow. If this happens, delete ```bash kubectl delete ns kubeflow -while ! kustomize build deployment | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done +while ! kustomize build "deployment/envs/$DEPLOYMENT_OPTION" | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done ``` Sometimes, just deleting the failing pod, so that it get recreated, will fix the issue. diff --git a/tutorials/local_deployment/01_Setup_local_cluster.md b/tutorials/local_deployment/01_Setup_local_cluster.md index 48a3c8f..da37881 100644 --- a/tutorials/local_deployment/01_Setup_local_cluster.md +++ b/tutorials/local_deployment/01_Setup_local_cluster.md @@ -21,7 +21,7 @@ sudo mv ./kind /usr/local/bin/kind ### 3. Create a cluster ```bash -export CLUSTER_NAME="kind-ep" +export CLUSTER_NAME="mlops-platform" export HOST_IP="127.0.0.1" # cluster IP address cat <