From a79dba5f1551dfed2c86b40c7da94d5c28fca352 Mon Sep 17 00:00:00 2001 From: avelichk Date: Sat, 14 Aug 2021 23:34:35 +0100 Subject: [PATCH 1/2] Modify XGBoostJob example for the new Controller --- examples/v1beta1/xgboost-lightgbm.yaml | 19 +++++++------------ .../components/controller/controller.yaml | 3 +-- .../v1beta1/components/controller/rbac.yaml | 6 ------ .../v1beta1/experiment_defaults.go | 4 ++-- pkg/controller.v1beta1/consts/const.go | 2 ++ 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/examples/v1beta1/xgboost-lightgbm.yaml b/examples/v1beta1/xgboost-lightgbm.yaml index 762f6275242..00ad9a1f09d 100644 --- a/examples/v1beta1/xgboost-lightgbm.yaml +++ b/examples/v1beta1/xgboost-lightgbm.yaml @@ -2,7 +2,7 @@ apiVersion: kubeflow.org/v1beta1 kind: Experiment metadata: namespace: kubeflow - name: xgboost-lightgbm + name: xgboostjob-lightgbm spec: objective: type: maximize @@ -35,11 +35,7 @@ spec: max: "60" step: "1" trialTemplate: - primaryPodLabels: - job-role: master - primaryContainerName: xgboostjob - successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")# - failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")# + primaryContainerName: xgboost trialParameters: - name: learningRate description: Learning rate for the training model @@ -48,8 +44,7 @@ spec: description: Number of leaves for one tree reference: num-leaves trialSpec: - # TODO (andreyvelich): Change to kubeflow.org/v1 once all-in-one operator is finished. 
- apiVersion: xgboostjob.kubeflow.org/v1 + apiVersion: kubeflow.org/v1 kind: XGBoostJob spec: xgbReplicaSpecs: @@ -59,11 +54,11 @@ spec: template: spec: containers: - - name: xgboostjob + - name: xgboost image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0 ports: - containerPort: 9991 - name: xgboostjob-port + name: xgboost-port imagePullPolicy: Always args: - --job_type=Train @@ -93,11 +88,11 @@ spec: template: spec: containers: - - name: xgboostjob + - name: xgboost image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0 ports: - containerPort: 9991 - name: xgboostjob-port + name: xgboost-port imagePullPolicy: Always args: - --job_type=Train diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml index 6cadb0863f5..d2b81cf5439 100644 --- a/manifests/v1beta1/components/controller/controller.yaml +++ b/manifests/v1beta1/components/controller/controller.yaml @@ -29,8 +29,7 @@ spec: - "--trial-resources=TFJob.v1.kubeflow.org" - "--trial-resources=PyTorchJob.v1.kubeflow.org" - "--trial-resources=MPIJob.v1.kubeflow.org" - # TODO (andreyvelich): Change to v1.kubeflow.org once all-in-one operator is finished. - - "--trial-resources=XGBoostJob.v1.xgboostjob.kubeflow.org" + - "--trial-resources=XGBoostJob.v1.kubeflow.org" ports: - containerPort: 8443 name: webhook diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml index 9b4ba763b1f..2eb95f41280 100644 --- a/manifests/v1beta1/components/controller/rbac.yaml +++ b/manifests/v1beta1/components/controller/rbac.yaml @@ -53,12 +53,6 @@ rules: - tfjobs - pytorchjobs - mpijobs - verbs: - - "*" - # TODO (andreyvelich): Move to "apiGroup: kubeflow.org" once all-in-one operator is finished. 
- - apiGroups: - xgboostjob.kubeflow.org resources: - xgboostjobs verbs: - "*" diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go index 3b54d50310d..8ad7dc9e2ca 100644 --- a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go +++ b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go @@ -96,7 +96,7 @@ func (e *Experiment) setDefaultObjective() { func (e *Experiment) setDefaultTrialTemplate() { t := e.Spec.TrialTemplate - // Set default values for Job, TFJob and PyTorchJob if TrialSpec is not nil + // Set default values for Job, TFJob, PyTorchJob and XGBoostJob if TrialSpec is not nil if t != nil && t.TrialSource.TrialSpec != nil { jobKind := t.TrialSource.TrialSpec.GetKind() if jobKind == consts.JobKindJob { @@ -106,7 +106,7 @@ func (e *Experiment) setDefaultTrialTemplate() { if t.FailureCondition == "" { t.FailureCondition = DefaultJobFailureCondition } - } else if jobKind == consts.JobKindTF || jobKind == consts.JobKindPyTorch { + } else if jobKind == consts.JobKindTF || jobKind == consts.JobKindPyTorch || jobKind == consts.JobKindXGBoost { if t.SuccessCondition == "" { t.SuccessCondition = DefaultKubeflowJobSuccessCondition } diff --git a/pkg/controller.v1beta1/consts/const.go b/pkg/controller.v1beta1/consts/const.go index 0ac309ddbdc..08f7c18e04b 100644 --- a/pkg/controller.v1beta1/consts/const.go +++ b/pkg/controller.v1beta1/consts/const.go @@ -134,6 +134,8 @@ const ( JobKindTF = "TFJob" // JobKindPyTorch is the kind of PyTorchJob. JobKindPyTorch = "PyTorchJob" + // JobKindXGBoost is the kind of XGBoostJob.
+ JobKindXGBoost = "XGBoostJob" // AnnotationIstioSidecarInjectName is the annotation of Istio Sidecar AnnotationIstioSidecarInjectName = "sidecar.istio.io/inject" From a43e0212ce22e87a165964666ad700fc9ca78189 Mon Sep 17 00:00:00 2001 From: avelichk Date: Sun, 15 Aug 2021 01:01:03 +0100 Subject: [PATCH 2/2] Modify port --- examples/v1beta1/xgboost-lightgbm.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/v1beta1/xgboost-lightgbm.yaml b/examples/v1beta1/xgboost-lightgbm.yaml index 00ad9a1f09d..b1fa963cc0c 100644 --- a/examples/v1beta1/xgboost-lightgbm.yaml +++ b/examples/v1beta1/xgboost-lightgbm.yaml @@ -58,7 +58,7 @@ spec: image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0 ports: - containerPort: 9991 - name: xgboost-port + name: xgboostjob-port imagePullPolicy: Always args: - --job_type=Train @@ -92,7 +92,7 @@ spec: image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0 ports: - containerPort: 9991 - name: xgboost-port + name: xgboostjob-port imagePullPolicy: Always args: - --job_type=Train