From 44ed4dd89b5d6663aac9fdbf2699caad54395086 Mon Sep 17 00:00:00 2001 From: wackxu Date: Tue, 28 May 2019 15:15:32 +0800 Subject: [PATCH 1/3] implementation of mxnet operator API v1 --- .travis.yml | 1 + Dockerfile | 5 +- cmd/mxnet-operator.v1/app/options/options.go | 58 ++ cmd/mxnet-operator.v1/app/server.go | 208 +++++++ cmd/mxnet-operator.v1/main.go | 49 ++ hack/update-codegen.sh | 9 +- manifests/crd-v1.yaml | 58 ++ pkg/apis/mxnet/v1/constants.go | 30 + pkg/apis/mxnet/v1/defaults.go | 105 ++++ pkg/apis/mxnet/v1/doc.go | 21 + pkg/apis/mxnet/v1/register.go | 74 +++ pkg/apis/mxnet/v1/types.go | 256 ++++++++ pkg/apis/mxnet/v1/util.go | 20 + pkg/apis/mxnet/v1/util_test.go | 44 ++ pkg/apis/mxnet/v1/zz_generated.deepcopy.go | 231 +++++++ pkg/apis/mxnet/v1/zz_generated.defaults.go | 43 ++ pkg/apis/mxnet/validation/validation.go | 41 ++ pkg/apis/mxnet/validation/validation_test.go | 68 +++ pkg/common/util/v1/testutil/const.go | 35 ++ pkg/common/util/v1/testutil/mxjob.go | 121 ++++ pkg/common/util/v1/testutil/pod.go | 92 +++ pkg/common/util/v1/testutil/service.go | 62 ++ pkg/common/util/v1/testutil/util.go | 93 +++ pkg/common/util/v1/unstructured/informer.go | 62 ++ pkg/controller.v1/mxnet/controller.go | 514 ++++++++++++++++ pkg/controller.v1/mxnet/controller_test.go | 456 ++++++++++++++ pkg/controller.v1/mxnet/informer.go | 125 ++++ pkg/controller.v1/mxnet/job.go | 163 +++++ pkg/controller.v1/mxnet/job_test.go | 596 +++++++++++++++++++ pkg/controller.v1/mxnet/mxnet.go | 122 ++++ pkg/controller.v1/mxnet/pod.go | 290 +++++++++ pkg/controller.v1/mxnet/pod_test.go | 238 ++++++++ pkg/controller.v1/mxnet/service.go | 127 ++++ pkg/controller.v1/mxnet/service_test.go | 95 +++ pkg/controller.v1/mxnet/status.go | 249 ++++++++ pkg/controller.v1/mxnet/status_test.go | 257 ++++++++ pkg/controller.v1/mxnet/util.go | 48 ++ pkg/controller.v1/mxnet/util_test.go | 80 +++ pkg/util/k8sutil/client.go | 14 + 39 files changed, 5158 insertions(+), 2 deletions(-) create mode 100644 
cmd/mxnet-operator.v1/app/options/options.go create mode 100644 cmd/mxnet-operator.v1/app/server.go create mode 100644 cmd/mxnet-operator.v1/main.go create mode 100644 manifests/crd-v1.yaml create mode 100644 pkg/apis/mxnet/v1/constants.go create mode 100644 pkg/apis/mxnet/v1/defaults.go create mode 100644 pkg/apis/mxnet/v1/doc.go create mode 100644 pkg/apis/mxnet/v1/register.go create mode 100644 pkg/apis/mxnet/v1/types.go create mode 100644 pkg/apis/mxnet/v1/util.go create mode 100644 pkg/apis/mxnet/v1/util_test.go create mode 100644 pkg/apis/mxnet/v1/zz_generated.deepcopy.go create mode 100644 pkg/apis/mxnet/v1/zz_generated.defaults.go create mode 100644 pkg/common/util/v1/testutil/const.go create mode 100644 pkg/common/util/v1/testutil/mxjob.go create mode 100644 pkg/common/util/v1/testutil/pod.go create mode 100644 pkg/common/util/v1/testutil/service.go create mode 100644 pkg/common/util/v1/testutil/util.go create mode 100644 pkg/common/util/v1/unstructured/informer.go create mode 100644 pkg/controller.v1/mxnet/controller.go create mode 100644 pkg/controller.v1/mxnet/controller_test.go create mode 100644 pkg/controller.v1/mxnet/informer.go create mode 100644 pkg/controller.v1/mxnet/job.go create mode 100644 pkg/controller.v1/mxnet/job_test.go create mode 100644 pkg/controller.v1/mxnet/mxnet.go create mode 100644 pkg/controller.v1/mxnet/pod.go create mode 100644 pkg/controller.v1/mxnet/pod_test.go create mode 100644 pkg/controller.v1/mxnet/service.go create mode 100644 pkg/controller.v1/mxnet/service_test.go create mode 100644 pkg/controller.v1/mxnet/status.go create mode 100644 pkg/controller.v1/mxnet/status_test.go create mode 100644 pkg/controller.v1/mxnet/util.go create mode 100644 pkg/controller.v1/mxnet/util_test.go diff --git a/.travis.yml b/.travis.yml index 3ad78c9e..bc047c6a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ install: script: - hack/verify-codegen.sh - go build -o mxnet-operator.v1beta1 
github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1beta1 + - go build -o mxnet-operator.v1 github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1 - gometalinter --config=linter_config.json --vendor ./... # We customize the build step because by default # Travis runs go test -v ./... which will include the vendor diff --git a/Dockerfile b/Dockerfile index 21f1164a..aa9189c4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,9 @@ FROM golang:1.8.2 RUN mkdir -p /opt/kubeflow COPY mxnet-operator.v1beta1 /opt/kubeflow +COPY mxnet-operator.v1 /opt/kubeflow + RUN chmod a+x /opt/kubeflow/mxnet-operator.v1beta1 +RUN chmod a+x /opt/kubeflow/mxnet-operator.v1 -CMD ["/opt/kubeflow/mxnet-operator.v1beta1", "--alsologtostderr", "-v=1"] +CMD ["/opt/kubeflow/mxnet-operator.v1", "--alsologtostderr", "-v=1"] diff --git a/cmd/mxnet-operator.v1/app/options/options.go b/cmd/mxnet-operator.v1/app/options/options.go new file mode 100644 index 00000000..d9a50ff9 --- /dev/null +++ b/cmd/mxnet-operator.v1/app/options/options.go @@ -0,0 +1,58 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package options + +import ( + "flag" + + v1 "k8s.io/api/core/v1" +) + +// ServerOption is the main context object for the controller manager. 
+type ServerOption struct { + Kubeconfig string + MasterURL string + Threadiness int + PrintVersion bool + JSONLogFormat bool + EnableGangScheduling bool + Namespace string +} + +// NewServerOption creates a new CMServer with a default config. +func NewServerOption() *ServerOption { + s := ServerOption{} + return &s +} + +// AddFlags adds flags for a specific CMServer to the specified FlagSet. +func (s *ServerOption) AddFlags(fs *flag.FlagSet) { + fs.StringVar(&s.MasterURL, "master", "", + `The url of the Kubernetes API server, + will overrides any value in kubeconfig, only required if out-of-cluster.`) + + fs.StringVar(&s.Namespace, "namespace", v1.NamespaceAll, + `The namespace to monitor mxjobs. If unset, it monitors all namespaces cluster-wide. + If set, it only monitors mxjobs in the given namespace.`) + + fs.IntVar(&s.Threadiness, "threadiness", 1, + `How many threads to process the main logic`) + + fs.BoolVar(&s.PrintVersion, "version", false, "Show version and quit") + + fs.BoolVar(&s.JSONLogFormat, "json-log-format", true, + "Set true to use json style log format. Set false to use plaintext style log format") + fs.BoolVar(&s.EnableGangScheduling, "enable-gang-scheduling", false, "Set true to enable gang scheduling by kube-arbitrator.") +} diff --git a/cmd/mxnet-operator.v1/app/server.go b/cmd/mxnet-operator.v1/app/server.go new file mode 100644 index 00000000..e27d76e8 --- /dev/null +++ b/cmd/mxnet-operator.v1/app/server.go @@ -0,0 +1,208 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package app + +import ( + "fmt" + "os" + "time" + + log "github.com/sirupsen/logrus" + + "k8s.io/api/core/v1" + crdclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + restclientset "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + election "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" + "k8s.io/client-go/tools/record" + + "github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options" + mxnetv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + mxjobclientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned" + "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/scheme" + mxjobinformers "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions" + controller "github.com/kubeflow/mxnet-operator/pkg/controller.v1/mxnet" + "github.com/kubeflow/mxnet-operator/pkg/version" + "github.com/kubeflow/tf-operator/pkg/util/signals" + kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned" +) + +const ( + apiVersion = "v1" +) + +var ( + // leader election config + leaseDuration = 15 * time.Second + renewDuration = 5 * time.Second + retryPeriod = 3 * time.Second + resyncPeriod = 30 * time.Second +) + +const RecommendedKubeConfigPathEnv = "KUBECONFIG" + +func Run(opt *options.ServerOption) error { + // Check if the -version flag was passed and, if so, print the version and exit. 
+ if opt.PrintVersion { + version.PrintVersionAndExit(apiVersion) + } + + namespace := os.Getenv(mxnetv1.EnvKubeflowNamespace) + if len(namespace) == 0 { + log.Infof("EnvKubeflowNamespace not set, use default namespace") + namespace = metav1.NamespaceDefault + } + if opt.Namespace == v1.NamespaceAll { + log.Info("Using cluster scoped operator") + } else { + log.Infof("Scoping operator to namespace %s", opt.Namespace) + } + + // To help debugging, immediately log version. + log.Infof("%+v", version.Info(apiVersion)) + + // Set up signals so we handle the first shutdown signal gracefully. + stopCh := signals.SetupSignalHandler() + + log.Infof("RecommendedKubeConfigPathEnv : %+v", RecommendedKubeConfigPathEnv) + log.Infof("KUBECONFIG : %+v", os.Getenv("KUBECONFIG")) + + // Note: ENV KUBECONFIG will overwrite user defined Kubeconfig option. + if len(os.Getenv(RecommendedKubeConfigPathEnv)) > 0 { + // use the current context in kubeconfig + // This is very useful for running locally. + opt.Kubeconfig = os.Getenv(RecommendedKubeConfigPathEnv) + } + + // Get kubernetes config. + kcfg, err := clientcmd.BuildConfigFromFlags(opt.MasterURL, opt.Kubeconfig) + if err != nil { + log.Fatalf("Error building kubeconfig: %s", err.Error()) + } + + // Create clients. + kubeClientSet, leaderElectionClientSet, mxJobClientSet, kubeBatchClientSet, err := createClientSets(kcfg) + if err != nil { + return err + } + + // Create informer factory. + kubeInformerFactory := kubeinformers.NewFilteredSharedInformerFactory(kubeClientSet, resyncPeriod, opt.Namespace, nil) + mxJobInformerFactory := mxjobinformers.NewSharedInformerFactory(mxJobClientSet, resyncPeriod) + + unstructuredInformer := controller.NewUnstructuredMXJobInformer(kcfg, opt.Namespace) + + // Create mx controller. + tc := controller.NewMXController(unstructuredInformer, kubeClientSet, mxJobClientSet, kubeBatchClientSet, kubeInformerFactory, mxJobInformerFactory, *opt) + + // Start informer goroutines. 
+ go kubeInformerFactory.Start(stopCh) + + // We do not use the generated informer because of + // go mxJobInformerFactory.Start(stopCh) + go unstructuredInformer.Informer().Run(stopCh) + + // Set leader election start function. + run := func(<-chan struct{}) { + if err := tc.Run(opt.Threadiness, stopCh); err != nil { + log.Errorf("Failed to run the controller: %v", err) + } + } + + id, err := os.Hostname() + if err != nil { + return fmt.Errorf("failed to get hostname: %v", err) + } + + // Prepare event clients. + eventBroadcaster := record.NewBroadcaster() + if err = v1.AddToScheme(scheme.Scheme); err != nil { + return fmt.Errorf("CoreV1 Add Scheme failed: %v", err) + } + recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "mxnet-operator"}) + + rl := &resourcelock.EndpointsLock{ + EndpointsMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: "mxnet-operator", + }, + Client: leaderElectionClientSet.CoreV1(), + LockConfig: resourcelock.ResourceLockConfig{ + Identity: id, + EventRecorder: recorder, + }, + } + + // Start leader election. 
+ election.RunOrDie(election.LeaderElectionConfig{ + Lock: rl, + LeaseDuration: leaseDuration, + RenewDeadline: renewDuration, + RetryPeriod: retryPeriod, + Callbacks: election.LeaderCallbacks{ + OnStartedLeading: run, + OnStoppedLeading: func() { + log.Fatalf("leader election lost") + }, + }, + }) + + return nil +} + +func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, mxjobclientset.Interface, kubebatchclient.Interface, error) { + + crdClient, err := crdclient.NewForConfig(config) + + if err != nil { + return nil, nil, nil, nil, err + } + + checkCRDExists(crdClient, mxnetv1.MXCRD) + + kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "mxnet-operator")) + if err != nil { + return nil, nil, nil, nil, err + } + + leaderElectionClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "leader-election")) + if err != nil { + return nil, nil, nil, nil, err + } + + mxJobClientSet, err := mxjobclientset.NewForConfig(config) + if err != nil { + return nil, nil, nil, nil, err + } + + kubeBatchClientSet, err := kubebatchclient.NewForConfig(restclientset.AddUserAgent(config, "kube-batch")) + if err != nil { + return nil, nil, nil, nil, err + } + + return kubeClientSet, leaderElectionClientSet, mxJobClientSet, kubeBatchClientSet, nil +} + +func checkCRDExists(clientset crdclient.Interface, crdName string) { + _, err := clientset.ApiextensionsV1beta1().CustomResourceDefinitions().Get(crdName, metav1.GetOptions{}) + if err != nil { + log.Error(err) + os.Exit(1) + } +} diff --git a/cmd/mxnet-operator.v1/main.go b/cmd/mxnet-operator.v1/main.go new file mode 100644 index 00000000..ba011966 --- /dev/null +++ b/cmd/mxnet-operator.v1/main.go @@ -0,0 +1,49 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "flag" + + "github.com/onrik/logrus/filename" + log "github.com/sirupsen/logrus" + + "github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app" + "github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options" +) + +func init() { + // Add filename as one of the fields of the structured log message. + filenameHook := filename.NewHook() + filenameHook.Field = "filename" + log.AddHook(filenameHook) +} + +func main() { + s := options.NewServerOption() + s.AddFlags(flag.CommandLine) + + flag.Parse() + + if s.JSONLogFormat { + // Output logs in a json format so that it can be parsed by services like Stackdriver. + log.SetFormatter(&log.JSONFormatter{}) + } + + if err := app.Run(s); err != nil { + log.Fatalf("%v\n", err) + } + +} diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index b265184c..9ec3cafe 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -31,7 +31,7 @@ CODEGEN_PKG=${CODEGEN_PKG:-$(cd ${SCRIPT_ROOT}; ls -d -1 ./vendor/k8s.io/code-ge cd ${SCRIPT_ROOT} ${CODEGEN_PKG}/generate-groups.sh "defaulter,deepcopy,client,informer,lister" \ github.com/kubeflow/mxnet-operator/pkg/client github.com/kubeflow/mxnet-operator/pkg/apis \ - mxnet:v1beta1 \ + mxnet:v1beta1,v1 \ --go-header-file hack/boilerplate/boilerplate.go.txt # Notice: The code in code-generator does not generate defaulter by default. 
@@ -40,3 +40,10 @@ ${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/mxnet-operator/pkg -O zz_generated.defaults \ --go-header-file hack/boilerplate/boilerplate.go.txt \ --output-package github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1beta1 + +# Notice: The code in code-generator does not generate defaulter by default. +echo "Generating defaulters for v1" +${GOPATH}/bin/defaulter-gen --input-dirs github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1 \ + -O zz_generated.defaults \ + --go-header-file hack/boilerplate/boilerplate.go.txt \ + --output-package github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1 \ No newline at end of file diff --git a/manifests/crd-v1.yaml b/manifests/crd-v1.yaml new file mode 100644 index 00000000..9613c3a9 --- /dev/null +++ b/manifests/crd-v1.yaml @@ -0,0 +1,58 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: mxjobs.kubeflow.org +spec: + group: kubeflow.org + version: v1 + scope: Namespaced + names: + kind: MXJob + singular: mxjob + plural: mxjobs + subresources: + status: {} + validation: + openAPIV3Schema: + properties: + spec: + properties: + mxReplicaSpecs: + properties: + # The validation works when the configuration contains + # `Worker`, `Server`, `Scheduler`, + # `TunerTracker`, `TunerServer`, `Tuner`, + # Otherwise it will not be validated. 
+ Scheduler: + properties: + replicas: + type: integer + minimum: 1 + maximum: 1 + Worker: + properties: + replicas: + type: integer + minimum: 1 + Server: + properties: + replicas: + type: integer + minimum: 1 + TunerTracker: + properties: + replicas: + type: integer + minimum: 1 + maximum: 1 + TunerServer: + properties: + replicas: + type: integer + minimum: 1 + Tuner: + properties: + replicas: + type: integer + minimum: 1 + maximum: 1 diff --git a/pkg/apis/mxnet/v1/constants.go b/pkg/apis/mxnet/v1/constants.go new file mode 100644 index 00000000..dceddb60 --- /dev/null +++ b/pkg/apis/mxnet/v1/constants.go @@ -0,0 +1,30 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1 + +const ( + // EnvKubeflowNamespace is ENV for kubeflow namespace specified by user. + EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE" + + // DefaultPortName is name of the port used to communicate between scheduler and + // servers & workers. + DefaultPortName = "mxjob-port" + // DefaultContainerName is the name of the MXJob container. + DefaultContainerName = "mxnet" + // DefaultPort is default value of the port. + DefaultPort = 9091 + // DefaultRestartPolicy is default RestartPolicy for MXReplicaSpec. 
+ DefaultRestartPolicy = RestartPolicyNever +) diff --git a/pkg/apis/mxnet/v1/defaults.go b/pkg/apis/mxnet/v1/defaults.go new file mode 100644 index 00000000..8319ec7e --- /dev/null +++ b/pkg/apis/mxnet/v1/defaults.go @@ -0,0 +1,105 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1 + +import ( + "strings" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// Int32 is a helper routine that allocates a new int32 value +// to store v and returns a pointer to it. +func Int32(v int32) *int32 { + return &v +} + +func addDefaultingFuncs(scheme *runtime.Scheme) error { + return RegisterDefaults(scheme) +} + +// setDefaultPort sets the default ports for mxnet container. 
+func setDefaultPort(spec *v1.PodSpec) { + index := 0 + for i, container := range spec.Containers { + if container.Name == DefaultContainerName { + index = i + break + } + } + + hasMXJobPort := false + for _, port := range spec.Containers[index].Ports { + if port.Name == DefaultPortName { + hasMXJobPort = true + break + } + } + if !hasMXJobPort { + spec.Containers[index].Ports = append(spec.Containers[index].Ports, v1.ContainerPort{ + Name: DefaultPortName, + ContainerPort: DefaultPort, + }) + } +} + +func setDefaultReplicas(spec *MXReplicaSpec) { + if spec.Replicas == nil { + spec.Replicas = Int32(1) + } + if spec.RestartPolicy == "" { + spec.RestartPolicy = DefaultRestartPolicy + } +} + +// setTypeNamesToCamelCase sets the name of all replica types from any case to correct case. +func setTypeNamesToCamelCase(mxJob *MXJob) { + setTypeNameToCamelCase(mxJob, MXReplicaTypeScheduler) + setTypeNameToCamelCase(mxJob, MXReplicaTypeServer) + setTypeNameToCamelCase(mxJob, MXReplicaTypeWorker) +} + +// setTypeNameToCamelCase sets the name of the replica type from any case to correct case. +// E.g. from server to Server; from WORKER to Worker. +func setTypeNameToCamelCase(mxJob *MXJob, typ MXReplicaType) { + for t := range mxJob.Spec.MXReplicaSpecs { + if strings.EqualFold(string(t), string(typ)) && t != typ { + spec := mxJob.Spec.MXReplicaSpecs[t] + delete(mxJob.Spec.MXReplicaSpecs, t) + mxJob.Spec.MXReplicaSpecs[typ] = spec + return + } + } +} + +// SetDefaults_MXJob sets any unspecified values to defaults. +func SetDefaults_MXJob(mxjob *MXJob) { + // Set default cleanpod policy to All. + if mxjob.Spec.CleanPodPolicy == nil { + all := CleanPodPolicyAll + mxjob.Spec.CleanPodPolicy = &all + } + + // Update the key of MXReplicaSpecs to camel case. + setTypeNamesToCamelCase(mxjob) + + for _, spec := range mxjob.Spec.MXReplicaSpecs { + // Set default replicas to 1. + setDefaultReplicas(spec) + // Set default port to mxnet container. 
+ setDefaultPort(&spec.Template.Spec) + } +} diff --git a/pkg/apis/mxnet/v1/doc.go b/pkg/apis/mxnet/v1/doc.go new file mode 100644 index 00000000..48b85e8c --- /dev/null +++ b/pkg/apis/mxnet/v1/doc.go @@ -0,0 +1,21 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +k8s:deepcopy-gen=package,register +// +k8s:defaulter-gen=TypeMeta +// +k8s:openapi-gen=true + +// Package v1 is the v1 version of the API. +// +groupName=kubeflow.org +package v1 diff --git a/pkg/apis/mxnet/v1/register.go b/pkg/apis/mxnet/v1/register.go new file mode 100644 index 00000000..7b95f7b3 --- /dev/null +++ b/pkg/apis/mxnet/v1/register.go @@ -0,0 +1,74 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ( + // TODO: move SchemeBuilder with zz_generated.deepcopy.go to k8s.io/api. + // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. + SchemeBuilder runtime.SchemeBuilder + localSchemeBuilder = &SchemeBuilder + AddToScheme = localSchemeBuilder.AddToScheme +) + +const ( + // GroupName is the group name use in this package. + GroupName = "kubeflow.org" + // Kind is the kind name. + Kind = "MXJob" + // GroupVersion is the version. + GroupVersion = "v1" + // Plural is the Plural for MXJob. + Plural = "mxjobs" + // Singular is the singular for MXJob. + Singular = "mxjob" + // MXCRD is the CRD name for MXJob. + MXCRD = "mxjobs.kubeflow.org" +) + +var ( + // SchemeGroupVersion is the group version used to register these objects. + SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: GroupVersion} + // SchemeGroupVersionKind is the GroupVersionKind of the resource. + SchemeGroupVersionKind = SchemeGroupVersion.WithKind(Kind) +) + +func init() { + // We only register manually written functions here. The registration of the + // generated functions takes place in the generated files. The separation + // makes the code compile even when the generated files are missing. + localSchemeBuilder.Register(addKnownTypes) + localSchemeBuilder.Register(addDefaultingFuncs) +} + +// Resource takes an unqualified resource and returns a Group-qualified GroupResource. +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +// addKnownTypes adds the set of types defined in this package to the supplied scheme. 
+func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &MXJob{}, + &MXJobList{}, + ) + metav1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/pkg/apis/mxnet/v1/types.go b/pkg/apis/mxnet/v1/types.go new file mode 100644 index 00000000..7e6dd10b --- /dev/null +++ b/pkg/apis/mxnet/v1/types.go @@ -0,0 +1,256 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1 + +import ( + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +genclient +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +resource:path=mxjob + +// MXJob represents the configuration of signal MXJob +type MXJob struct { + metav1.TypeMeta `json:",inline"` + + // Standard object's metadata. + metav1.ObjectMeta `json:"metadata,omitempty"` + + // Specification of the desired behavior of the MXJob. + Spec MXJobSpec `json:"spec,omitempty"` + + // Most recently observed status of the MXJob. + // This data may not be up to date. + // Populated by the system. + // Read-only. + Status MXJobStatus `json:"status,omitempty"` +} + +// MXJobSpec is a desired state description of the MXJob. +type MXJobSpec struct { + // CleanPodPolicy defines the policy to kill pods after MXJob is + // succeeded. + // Default to Running. 
+ CleanPodPolicy *CleanPodPolicy `json:"cleanPodPolicy,omitempty"` + + // TTLSecondsAfterFinished is the TTL to clean up mxnet-jobs (temporary + // before kubernetes adds the cleanup controller). + // It may take extra ReconcilePeriod seconds for the cleanup, since + // reconcile gets called periodically. + // Default to infinite. + TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` + + // JobMode specify the kind of MXjob to do. Different mode may have + // different MXReplicaSpecs request + JobMode JobModeType `json:"jobMode"` + + // MXReplicaSpecs is map of MXReplicaType and MXReplicaSpec + // specifies the MX replicas to run. + // For example, + // { + // "Scheduler": MXReplicaSpec, + // "Server": MXReplicaSpec, + // "Worker": MXReplicaSpec, + // } + MXReplicaSpecs map[MXReplicaType]*MXReplicaSpec `json:"mxReplicaSpecs"` +} + +// MXReplicaSpec is a description of the MXReplica +type MXReplicaSpec struct { + // Replicas is the desired number of replicas of the given template. + // If unspecified, defaults to 1. + Replicas *int32 `json:"replicas,omitempty"` + + // Label is used as tunerServerKey, it's designed for tvm auto-tuning. + Label string `json:"label,omitempty"` + + // Template is the object that describes the pod that + // will be created for this MXReplica. RestartPolicy in PodTemplateSpec + // will be overide by RestartPolicy in MXReplicaSpec + Template v1.PodTemplateSpec `json:"template,omitempty"` + + // Restart policy for all MXReplicas within the MXJob. + // One of Always, OnFailure, Never and ExitCode. + // Default to Never. + RestartPolicy RestartPolicy `json:"restartPolicy,omitempty"` +} + +// CleanPodPolicy describes how to deal with pods when the MXJob is finished. 
+type CleanPodPolicy string + +const ( + CleanPodPolicyUndefined CleanPodPolicy = "" + CleanPodPolicyAll CleanPodPolicy = "All" + CleanPodPolicyRunning CleanPodPolicy = "Running" + CleanPodPolicyNone CleanPodPolicy = "None" +) + +// RestartPolicy describes how the MXReplicas should be restarted. +// Only one of the following restart policies may be specified. +// If none of the following policies is specified, the default one +// is RestartPolicyAlways. +type RestartPolicy string + +const ( + RestartPolicyAlways RestartPolicy = "Always" + RestartPolicyOnFailure RestartPolicy = "OnFailure" + RestartPolicyNever RestartPolicy = "Never" + + // `ExitCode` policy means that user should add exit code by themselves, + // `mxnet-operator` will check these exit codes to + // determine the behavior when an error occurs: + // - 1-127: permanent error, do not restart. + // - 128-255: retryable error, will restart the pod. + RestartPolicyExitCode RestartPolicy = "ExitCode" +) + +// JobModeType id the type for JobMode +type JobModeType string + +const ( + // Train Mode, in this mode requested MXReplicaSpecs need + // has Server, Scheduler, Worker + MXTrain JobModeType = "MXTrain" + + // Tune Mode, in this mode requested MXReplicaSpecs need + // has Tuner + MXTune JobModeType = "MXTune" +) + +// MXReplicaType is the type for MXReplica. +type MXReplicaType string + +const ( + // MXReplicaTypeScheduler is the type for scheduler replica in MXNet. + MXReplicaTypeScheduler MXReplicaType = "Scheduler" + + // MXReplicaTypeServer is the type for parameter servers of distributed MXNet. + MXReplicaTypeServer MXReplicaType = "Server" + + // MXReplicaTypeWorker is the type for workers of distributed MXNet. + // This is also used for non-distributed MXNet. + MXReplicaTypeWorker MXReplicaType = "Worker" + + // MXReplicaTypeTunerTracker + // This the auto-tuning tracker e.g. 
autotvm tracker, it will dispatch tuning task to TunerServer + MXReplicaTypeTunerTracker MXReplicaType = "TunerTracker" + + // MXReplicaTypeTunerServer + MXReplicaTypeTunerServer MXReplicaType = "TunerServer" + + // MXReplicaTuner is the type for auto-tuning of distributed MXNet. + // This is also used for non-distributed MXNet. + MXReplicaTypeTuner MXReplicaType = "Tuner" +) + +// MXJobStatus represents the current observed state of the MXJob. +type MXJobStatus struct { + // Conditions is an array of current observed MXJob conditions. + Conditions []MXJobCondition `json:"conditions"` + + // MXReplicaStatuses is map of MXReplicaType and MXReplicaStatus, + // specifies the status of each MXReplica. + MXReplicaStatuses map[MXReplicaType]*MXReplicaStatus `json:"mxReplicaStatuses"` + + // Represents time when the MXJob was acknowledged by the MXJob controller. + // It is not guaranteed to be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + StartTime *metav1.Time `json:"startTime,omitempty"` + + // Represents time when the MXJob was completed. It is not guaranteed to + // be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + CompletionTime *metav1.Time `json:"completionTime,omitempty"` + + // Represents last time when the MXJob was reconciled. It is not guaranteed to + // be set in happens-before order across separate operations. + // It is represented in RFC3339 form and is in UTC. + LastReconcileTime *metav1.Time `json:"lastReconcileTime,omitempty"` +} + +// MXReplicaStatus represents the current observed state of the MXReplica. +type MXReplicaStatus struct { + // The number of actively running pods. + Active int32 `json:"active,omitempty"` + + // The number of pods which reached phase Succeeded. + Succeeded int32 `json:"succeeded,omitempty"` + + // The number of pods which reached phase Failed. 
+	Failed int32 `json:"failed,omitempty"`
+}
+
+// MXJobCondition describes the state of the MXJob at a certain point.
+type MXJobCondition struct {
+	// Type of MXJob condition.
+	Type MXJobConditionType `json:"type"`
+	// Status of the condition, one of True, False, Unknown.
+	Status v1.ConditionStatus `json:"status"`
+	// The reason for the condition's last transition.
+	Reason string `json:"reason,omitempty"`
+	// A human readable message indicating details about the transition.
+	Message string `json:"message,omitempty"`
+	// The last time this condition was updated.
+	LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"`
+	// Last time the condition transitioned from one status to another.
+	LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
+}
+
+// MXJobConditionType defines all kinds of types of MXJobStatus.
+type MXJobConditionType string
+
+const (
+	// MXJobCreated means the mxjob has been accepted by the system,
+	// but one or more of the pods/services has not been started.
+	// This includes time before pods being scheduled and launched.
+	MXJobCreated MXJobConditionType = "Created"
+
+	// MXJobRunning means all sub-resources (e.g. services/pods) of this MXJob
+	// have been successfully scheduled and launched.
+	// The training is running without error.
+	MXJobRunning MXJobConditionType = "Running"
+
+	// MXJobRestarting means one or more sub-resources (e.g. services/pods) of this MXJob
+	// reached phase failed but may be restarted according to its restart policy,
+	// which is specified by the user in v1.PodTemplateSpec.
+	// The training is freezing/pending.
+	MXJobRestarting MXJobConditionType = "Restarting"
+
+	// MXJobSucceeded means all sub-resources (e.g. services/pods) of this MXJob
+	// have terminated in success.
+	// The training is complete without error.
+	MXJobSucceeded MXJobConditionType = "Succeeded"
+
+	// MXJobFailed means one or more sub-resources (e.g.
services/pods) of this MXJob + // reached phase failed with no restarting. + // The training has failed its execution. + MXJobFailed MXJobConditionType = "Failed" +) + +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object +// +resource:path=mxjobs + +// MXJobList is a list of MXJobs. +type MXJobList struct { + metav1.TypeMeta `json:",inline"` + + // Standard list metadata. + metav1.ListMeta `json:"metadata,omitempty"` + + // List of MXJobs. + Items []MXJob `json:"items"` +} diff --git a/pkg/apis/mxnet/v1/util.go b/pkg/apis/mxnet/v1/util.go new file mode 100644 index 00000000..834f3cea --- /dev/null +++ b/pkg/apis/mxnet/v1/util.go @@ -0,0 +1,20 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1 + +// IsScheduler returns true if the type is Scheduler. +func IsScheduler(typ MXReplicaType) bool { + return typ == MXReplicaTypeScheduler +} diff --git a/pkg/apis/mxnet/v1/util_test.go b/pkg/apis/mxnet/v1/util_test.go new file mode 100644 index 00000000..49a61cd3 --- /dev/null +++ b/pkg/apis/mxnet/v1/util_test.go @@ -0,0 +1,44 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package v1 + +import "testing" + +func TestIsScheduler(t *testing.T) { + tc := []struct { + Type MXReplicaType + Expected bool + }{ + { + Type: MXReplicaTypeScheduler, + Expected: true, + }, + { + Type: MXReplicaTypeServer, + Expected: false, + }, + { + Type: MXReplicaTypeWorker, + Expected: false, + }, + } + + for _, c := range tc { + actual := IsScheduler(c.Type) + if actual != c.Expected { + t.Errorf("Expected %v; Got %v", c.Expected, actual) + } + } +} diff --git a/pkg/apis/mxnet/v1/zz_generated.deepcopy.go b/pkg/apis/mxnet/v1/zz_generated.deepcopy.go new file mode 100644 index 00000000..626047ea --- /dev/null +++ b/pkg/apis/mxnet/v1/zz_generated.deepcopy.go @@ -0,0 +1,231 @@ +// +build !ignore_autogenerated + +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by deepcopy-gen. DO NOT EDIT. + +package v1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *MXJob) DeepCopyInto(out *MXJob) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXJob. +func (in *MXJob) DeepCopy() *MXJob { + if in == nil { + return nil + } + out := new(MXJob) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MXJob) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MXJobCondition) DeepCopyInto(out *MXJobCondition) { + *out = *in + in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXJobCondition. +func (in *MXJobCondition) DeepCopy() *MXJobCondition { + if in == nil { + return nil + } + out := new(MXJobCondition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MXJobList) DeepCopyInto(out *MXJobList) { + *out = *in + out.TypeMeta = in.TypeMeta + out.ListMeta = in.ListMeta + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]MXJob, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXJobList. 
+func (in *MXJobList) DeepCopy() *MXJobList { + if in == nil { + return nil + } + out := new(MXJobList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *MXJobList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MXJobSpec) DeepCopyInto(out *MXJobSpec) { + *out = *in + if in.CleanPodPolicy != nil { + in, out := &in.CleanPodPolicy, &out.CleanPodPolicy + *out = new(CleanPodPolicy) + **out = **in + } + if in.TTLSecondsAfterFinished != nil { + in, out := &in.TTLSecondsAfterFinished, &out.TTLSecondsAfterFinished + *out = new(int32) + **out = **in + } + if in.MXReplicaSpecs != nil { + in, out := &in.MXReplicaSpecs, &out.MXReplicaSpecs + *out = make(map[MXReplicaType]*MXReplicaSpec, len(*in)) + for key, val := range *in { + var outVal *MXReplicaSpec + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(MXReplicaSpec) + (*in).DeepCopyInto(*out) + } + (*out)[key] = outVal + } + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXJobSpec. +func (in *MXJobSpec) DeepCopy() *MXJobSpec { + if in == nil { + return nil + } + out := new(MXJobSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *MXJobStatus) DeepCopyInto(out *MXJobStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]MXJobCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.MXReplicaStatuses != nil { + in, out := &in.MXReplicaStatuses, &out.MXReplicaStatuses + *out = make(map[MXReplicaType]*MXReplicaStatus, len(*in)) + for key, val := range *in { + var outVal *MXReplicaStatus + if val == nil { + (*out)[key] = nil + } else { + in, out := &val, &outVal + *out = new(MXReplicaStatus) + **out = **in + } + (*out)[key] = outVal + } + } + if in.StartTime != nil { + in, out := &in.StartTime, &out.StartTime + *out = (*in).DeepCopy() + } + if in.CompletionTime != nil { + in, out := &in.CompletionTime, &out.CompletionTime + *out = (*in).DeepCopy() + } + if in.LastReconcileTime != nil { + in, out := &in.LastReconcileTime, &out.LastReconcileTime + *out = (*in).DeepCopy() + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXJobStatus. +func (in *MXJobStatus) DeepCopy() *MXJobStatus { + if in == nil { + return nil + } + out := new(MXJobStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MXReplicaSpec) DeepCopyInto(out *MXReplicaSpec) { + *out = *in + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } + in.Template.DeepCopyInto(&out.Template) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXReplicaSpec. +func (in *MXReplicaSpec) DeepCopy() *MXReplicaSpec { + if in == nil { + return nil + } + out := new(MXReplicaSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *MXReplicaStatus) DeepCopyInto(out *MXReplicaStatus) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MXReplicaStatus. +func (in *MXReplicaStatus) DeepCopy() *MXReplicaStatus { + if in == nil { + return nil + } + out := new(MXReplicaStatus) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/apis/mxnet/v1/zz_generated.defaults.go b/pkg/apis/mxnet/v1/zz_generated.defaults.go new file mode 100644 index 00000000..1bd8ca50 --- /dev/null +++ b/pkg/apis/mxnet/v1/zz_generated.defaults.go @@ -0,0 +1,43 @@ +// +build !ignore_autogenerated + +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by defaulter-gen. DO NOT EDIT. + +package v1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// RegisterDefaults adds defaulters functions to the given scheme. +// Public to allow building arbitrary schemes. +// All generated defaulters are covering - they call all nested defaulters. 
+func RegisterDefaults(scheme *runtime.Scheme) error {
+	scheme.AddTypeDefaultingFunc(&MXJob{}, func(obj interface{}) { SetObjectDefaults_MXJob(obj.(*MXJob)) })
+	scheme.AddTypeDefaultingFunc(&MXJobList{}, func(obj interface{}) { SetObjectDefaults_MXJobList(obj.(*MXJobList)) })
+	return nil
+}
+
+func SetObjectDefaults_MXJob(in *MXJob) {
+	SetDefaults_MXJob(in)
+}
+
+func SetObjectDefaults_MXJobList(in *MXJobList) {
+	for i := range in.Items {
+		a := &in.Items[i]
+		SetObjectDefaults_MXJob(a)
+	}
+}
diff --git a/pkg/apis/mxnet/validation/validation.go b/pkg/apis/mxnet/validation/validation.go
index a0c39bd2..b994e7c2 100644
--- a/pkg/apis/mxnet/validation/validation.go
+++ b/pkg/apis/mxnet/validation/validation.go
@@ -19,6 +19,7 @@ import (
 	log "github.com/sirupsen/logrus"
 
+	mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1"
 	mxv1beta1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1beta1"
 )
 
@@ -61,3 +62,43 @@ func validateBetaOneReplicaSpecs(specs map[mxv1beta1.MXReplicaType]*mxv1beta1.MX
 	}
 	return nil
 }
+
+// ValidateV1MXJobSpec checks that the v1.MXJobSpec is valid.
+func ValidateV1MXJobSpec(c *mxv1.MXJobSpec) error {
+	return validateV1ReplicaSpecs(c.MXReplicaSpecs)
+}
+
+func validateV1ReplicaSpecs(specs map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec) error {
+	if specs == nil {
+		return fmt.Errorf("MXJobSpec is not valid")
+	}
+	foundScheduler := 0
+	for rType, value := range specs {
+		if value == nil || len(value.Template.Spec.Containers) == 0 {
+			return fmt.Errorf("MXJobSpec is not valid")
+		}
+		if mxv1.IsScheduler(rType) {
+			foundScheduler++
+		}
+		// Make sure the image is defined in the container.
+ numNamedMXNet := 0 + for _, container := range value.Template.Spec.Containers { + if container.Image == "" { + log.Warn("Image is undefined in the container") + return fmt.Errorf("MXJobSpec is not valid") + } + if container.Name == mxv1beta1.DefaultContainerName { + numNamedMXNet++ + } + } + // Make sure there has at least one container named "mxnet". + if numNamedMXNet == 0 { + log.Warnf("There is no container named mxnet in %v", rType) + return fmt.Errorf("MXJobSpec is not valid") + } + } + if foundScheduler > 1 { + return fmt.Errorf("more than 1 scheduler found") + } + return nil +} diff --git a/pkg/apis/mxnet/validation/validation_test.go b/pkg/apis/mxnet/validation/validation_test.go index e8173ec3..b92f2933 100644 --- a/pkg/apis/mxnet/validation/validation_test.go +++ b/pkg/apis/mxnet/validation/validation_test.go @@ -17,6 +17,7 @@ package validation import ( "testing" + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" mxv1beta1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1beta1" "k8s.io/api/core/v1" @@ -88,3 +89,70 @@ func TestValidateAlphaTwoMXJobSpec(t *testing.T) { } } } + +func TestValidateV1MXJobSpec(t *testing.T) { + testCases := []mxv1.MXJobSpec{ + { + MXReplicaSpecs: nil, + }, + { + MXReplicaSpecs: map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec{ + mxv1.MXReplicaTypeWorker: &mxv1.MXReplicaSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{}, + }, + }, + }, + }, + }, + { + MXReplicaSpecs: map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec{ + mxv1.MXReplicaTypeWorker: &mxv1.MXReplicaSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Image: "", + }, + }, + }, + }, + }, + }, + }, + { + MXReplicaSpecs: map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec{ + mxv1.MXReplicaTypeWorker: &mxv1.MXReplicaSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "", + Image: "mxjob/mxnet:gpu", + }, + }, + }, + }, + }, + }, 
+ }, + { + MXReplicaSpecs: map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec{ + mxv1.MXReplicaTypeScheduler: &mxv1.MXReplicaSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{}, + }, + }, + }, + }, + }, + } + for _, c := range testCases { + err := ValidateV1MXJobSpec(&c) + if err.Error() != "MXJobSpec is not valid" { + t.Error("Failed validate the alpha2.MXJobSpec") + } + } +} diff --git a/pkg/common/util/v1/testutil/const.go b/pkg/common/util/v1/testutil/const.go new file mode 100644 index 00000000..a3dd0e9f --- /dev/null +++ b/pkg/common/util/v1/testutil/const.go @@ -0,0 +1,35 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "time" +) + +const ( + TestImageName = "mxjob/mxnet-operator:v1" + TestMXJobName = "test-mxjob" + + LabelScheduler = "scheduler" + LabelWorker = "worker" + LabelServer = "server" + + SleepInterval = 500 * time.Millisecond + ThreadCount = 1 +) + +var ( + AlwaysReady = func() bool { return true } +) diff --git a/pkg/common/util/v1/testutil/mxjob.go b/pkg/common/util/v1/testutil/mxjob.go new file mode 100644 index 00000000..0f8c9dd8 --- /dev/null +++ b/pkg/common/util/v1/testutil/mxjob.go @@ -0,0 +1,121 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "time" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" +) + +func NewMXJobWithCleanPolicy(scheduler, worker, server int, policy mxv1.CleanPodPolicy) *mxv1.MXJob { + + var mxJob *mxv1.MXJob + + if scheduler > 0 { + mxJob = NewMXJobWithScheduler(worker, server) + } else { + mxJob = NewMXJob(worker, server) + } + + mxJob.Spec.CleanPodPolicy = &policy + return mxJob +} + +func NewMXJobWithCleanupJobDelay(scheduler, worker, server int, ttl *int32) *mxv1.MXJob { + + var mxJob *mxv1.MXJob + + if scheduler > 0 { + mxJob = NewMXJobWithScheduler(worker, server) + } else { + mxJob = NewMXJob(worker, server) + } + + mxJob.Spec.TTLSecondsAfterFinished = ttl + policy := mxv1.CleanPodPolicyNone + mxJob.Spec.CleanPodPolicy = &policy + return mxJob +} + +func NewMXJobWithScheduler(worker, server int) *mxv1.MXJob { + mxJob := NewMXJob(worker, server) + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeScheduler] = &mxv1.MXReplicaSpec{ + Template: NewMXReplicaSpecTemplate(), + } + return mxJob +} + +func NewMXJob(worker, server int) *mxv1.MXJob { + mxJob := &mxv1.MXJob{ + TypeMeta: metav1.TypeMeta{ + Kind: mxv1.Kind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: TestMXJobName, + Namespace: metav1.NamespaceDefault, + }, + Spec: mxv1.MXJobSpec{ + MXReplicaSpecs: make(map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec), + }, + } + + if worker > 0 { + worker := int32(worker) + workerReplicaSpec := &mxv1.MXReplicaSpec{ + Replicas: &worker, + Template: 
NewMXReplicaSpecTemplate(), + } + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker] = workerReplicaSpec + } + + if server > 0 { + server := int32(server) + serverReplicaSpec := &mxv1.MXReplicaSpec{ + Replicas: &server, + Template: NewMXReplicaSpecTemplate(), + } + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeServer] = serverReplicaSpec + } + return mxJob +} + +func NewMXReplicaSpecTemplate() v1.PodTemplateSpec { + return v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: mxv1.DefaultContainerName, + Image: TestImageName, + Args: []string{"Fake", "Fake"}, + Ports: []v1.ContainerPort{ + { + Name: mxv1.DefaultPortName, + ContainerPort: mxv1.DefaultPort, + }, + }, + }, + }, + }, + } +} + +func SetMXJobCompletionTime(mxJob *mxv1.MXJob) { + now := metav1.Time{Time: time.Now()} + mxJob.Status.CompletionTime = &now +} diff --git a/pkg/common/util/v1/testutil/pod.go b/pkg/common/util/v1/testutil/pod.go new file mode 100644 index 00000000..cf8a9cba --- /dev/null +++ b/pkg/common/util/v1/testutil/pod.go @@ -0,0 +1,92 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "fmt" + "testing" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" +) + +const ( + // labels for pods and servers. 
+ mxReplicaTypeLabel = "mxnet-replica-type" + mxReplicaIndexLabel = "mxnet-replica-index" +) + +var ( + controllerKind = mxv1.SchemeGroupVersionKind +) + +func NewBasePod(name string, mxJob *mxv1.MXJob, t *testing.T) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: GenLabels(mxJob.Name), + Namespace: mxJob.Namespace, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(mxJob, controllerKind)}, + }, + } +} + +func NewPod(mxJob *mxv1.MXJob, typ string, index int, t *testing.T) *v1.Pod { + pod := NewBasePod(fmt.Sprintf("%s-%d", typ, index), mxJob, t) + pod.Labels[mxReplicaTypeLabel] = typ + pod.Labels[mxReplicaIndexLabel] = fmt.Sprintf("%d", index) + return pod +} + +// create count pods with the given phase for the given mxJob +func NewPodList(count int32, status v1.PodPhase, mxJob *mxv1.MXJob, typ string, start int32, t *testing.T) []*v1.Pod { + pods := []*v1.Pod{} + for i := int32(0); i < count; i++ { + newPod := NewPod(mxJob, typ, int(start+i), t) + newPod.Status = v1.PodStatus{Phase: status} + pods = append(pods, newPod) + } + return pods +} + +func SetPodsStatuses(podIndexer cache.Indexer, mxJob *mxv1.MXJob, typ string, pendingPods, activePods, succeededPods, failedPods int32, t *testing.T) { + var index int32 + for _, pod := range NewPodList(pendingPods, v1.PodPending, mxJob, typ, index, t) { + if err := podIndexer.Add(pod); err != nil { + t.Errorf("%s: unexpected error when adding pod %v", mxJob.Name, err) + } + } + index += pendingPods + for _, pod := range NewPodList(activePods, v1.PodRunning, mxJob, typ, index, t) { + if err := podIndexer.Add(pod); err != nil { + t.Errorf("%s: unexpected error when adding pod %v", mxJob.Name, err) + } + } + index += activePods + for _, pod := range NewPodList(succeededPods, v1.PodSucceeded, mxJob, typ, index, t) { + if err := podIndexer.Add(pod); err != nil { + t.Errorf("%s: unexpected error when adding pod %v", mxJob.Name, err) + } + } + index += succeededPods + for _, pod 
:= range NewPodList(failedPods, v1.PodFailed, mxJob, typ, index, t) { + if err := podIndexer.Add(pod); err != nil { + t.Errorf("%s: unexpected error when adding pod %v", mxJob.Name, err) + } + } +} diff --git a/pkg/common/util/v1/testutil/service.go b/pkg/common/util/v1/testutil/service.go new file mode 100644 index 00000000..14018e97 --- /dev/null +++ b/pkg/common/util/v1/testutil/service.go @@ -0,0 +1,62 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package testutil + +import ( + "fmt" + "testing" + + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" +) + +func NewBaseService(name string, mxJob *mxv1.MXJob, t *testing.T) *v1.Service { + return &v1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: GenLabels(mxJob.Name), + Namespace: mxJob.Namespace, + OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(mxJob, controllerKind)}, + }, + } +} + +func NewService(mxJob *mxv1.MXJob, typ string, index int, t *testing.T) *v1.Service { + service := NewBaseService(fmt.Sprintf("%s-%d", typ, index), mxJob, t) + service.Labels[mxReplicaTypeLabel] = typ + service.Labels[mxReplicaIndexLabel] = fmt.Sprintf("%d", index) + return service +} + +// NewServiceList creates count pods with the given phase for the given mxJob +func NewServiceList(count int32, mxJob *mxv1.MXJob, typ string, t *testing.T) []*v1.Service { + services := []*v1.Service{} + for i := int32(0); i < count; i++ { + newService := NewService(mxJob, typ, int(i), t) + services = append(services, newService) + } + return services +} + +func SetServices(serviceIndexer cache.Indexer, mxJob *mxv1.MXJob, typ string, activeWorkerServices int32, t *testing.T) { + for _, service := range NewServiceList(activeWorkerServices, mxJob, typ, t) { + if err := serviceIndexer.Add(service); err != nil { + t.Errorf("unexpected error when adding service %v", err) + } + } +} diff --git a/pkg/common/util/v1/testutil/util.go b/pkg/common/util/v1/testutil/util.go new file mode 100644 index 00000000..4abce23c --- /dev/null +++ b/pkg/common/util/v1/testutil/util.go @@ -0,0 +1,93 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "encoding/json" + "strings" + "testing" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/client-go/tools/cache" +) + +const ( + LabelGroupName = "group_name" + LabelMXJobName = "mxnet_job_name" +) + +var ( + // KeyFunc is the short name to DeletionHandlingMetaNamespaceKeyFunc. + // IndexerInformer uses a delta queue, therefore for deletes we have to use this + // key function but it should be just fine for non delete events. + KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc + GroupName = mxv1.GroupName +) + +func GenLabels(jobName string) map[string]string { + return map[string]string{ + LabelGroupName: GroupName, + LabelMXJobName: strings.Replace(jobName, "/", "-", -1), + } +} + +func GenOwnerReference(mxjob *mxv1.MXJob) *metav1.OwnerReference { + boolPtr := func(b bool) *bool { return &b } + controllerRef := &metav1.OwnerReference{ + APIVersion: mxv1.SchemeGroupVersion.String(), + Kind: mxv1.Kind, + Name: mxjob.Name, + UID: mxjob.UID, + BlockOwnerDeletion: boolPtr(true), + Controller: boolPtr(true), + } + + return controllerRef +} + +// ConvertMXJobToUnstructured uses JSON to convert MXJob to Unstructured. 
+func ConvertMXJobToUnstructured(mxJob *mxv1.MXJob) (*unstructured.Unstructured, error) { + var unstructured unstructured.Unstructured + b, err := json.Marshal(mxJob) + if err != nil { + return nil, err + } + + if err := json.Unmarshal(b, &unstructured); err != nil { + return nil, err + } + return &unstructured, nil +} + +func GetKey(mxJob *mxv1.MXJob, t *testing.T) string { + key, err := KeyFunc(mxJob) + if err != nil { + t.Errorf("Unexpected error getting key for job %v: %v", mxJob.Name, err) + return "" + } + return key +} + +func CheckCondition(mxJob *mxv1.MXJob, condition mxv1.MXJobConditionType, reason string) bool { + for _, v := range mxJob.Status.Conditions { + if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason { + return true + } + } + return false +} diff --git a/pkg/common/util/v1/unstructured/informer.go b/pkg/common/util/v1/unstructured/informer.go new file mode 100644 index 00000000..6dc4ccd4 --- /dev/null +++ b/pkg/common/util/v1/unstructured/informer.go @@ -0,0 +1,62 @@ +// Package unstructured is the package for unstructured informer, +// which is from https://github.com/argoproj/argo/blob/master/util/unstructured/unstructured.go +package unstructured + +import ( + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/tools/cache" + + informer "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/mxnet/v1" + lister "github.com/kubeflow/mxnet-operator/pkg/client/listers/mxnet/v1" +) + +type UnstructuredInformer struct { + informer cache.SharedIndexInformer +} + +func NewMXJobInformer(resource schema.GroupVersionResource, client dynamic.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) informer.MXJobInformer { + return &UnstructuredInformer{ + informer: 
newUnstructuredInformer(resource, client, namespace, resyncPeriod, indexers), + } +} + +func (f *UnstructuredInformer) Informer() cache.SharedIndexInformer { + return f.informer +} + +func (f *UnstructuredInformer) Lister() lister.MXJobLister { + return lister.NewMXJobLister(f.Informer().GetIndexer()) +} + +// newUnstructuredInformer constructs a new informer for Unstructured type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func newUnstructuredInformer(resource schema.GroupVersionResource, client dynamic.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return newFilteredUnstructuredInformer(resource, client, namespace, resyncPeriod, indexers) +} + +// newFilteredUnstructuredInformer constructs a new informer for Unstructured type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func newFilteredUnstructuredInformer(resource schema.GroupVersionResource, client dynamic.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { + return client.Resource(resource).List(options) + }, + WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { + return client.Resource(resource).Watch(options) + }, + }, + &unstructured.Unstructured{}, + resyncPeriod, + indexers, + ) +} diff --git a/pkg/controller.v1/mxnet/controller.go b/pkg/controller.v1/mxnet/controller.go new file mode 100644 index 00000000..7279fb52 --- /dev/null +++ b/pkg/controller.v1/mxnet/controller.go @@ -0,0 +1,514 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a MXJob resource. 
+package mxnet + +import ( + "fmt" + "time" + + log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/cache" + + "github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options" + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + mxjobclientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned" + mxjobscheme "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/scheme" + mxjobinformers "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions" + mxjobinformersv1 "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/mxnet/v1" + mxjoblisters "github.com/kubeflow/mxnet-operator/pkg/client/listers/mxnet/v1" + "github.com/kubeflow/tf-operator/pkg/common/jobcontroller" + mxlogger "github.com/kubeflow/tf-operator/pkg/logger" + kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned" + "k8s.io/apimachinery/pkg/runtime/schema" +) + +const ( + controllerName = "mxnet-operator" + + // labels for pods and servers. + mxReplicaTypeLabel = "mxnet-replica-type" + mxReplicaIndexLabel = "mxnet-replica-index" + labelGroupName = "group_name" + labelMXJobName = "mxnet_job_name" + labelMXJobRole = "mxnet-job-role" +) + +var ( + // KeyFunc is the short name to DeletionHandlingMetaNamespaceKeyFunc. + // IndexerInformer uses a delta queue, therefore for deletes we have to use this + // key function but it should be just fine for non delete events. + KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc + + // DefaultMXControllerConfiguration is the suggested mxnet-operator configuration for production. 
+ DefaultMXControllerConfiguration = jobcontroller.JobControllerConfiguration{ + ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 15 * time.Second}, + EnableGangScheduling: false, + } +) + +// MXController is the type for MXJob Controller, which manages +// the lifecycle of MXJobs. +type MXController struct { + jobcontroller.JobController + + // mxJobClientSet is a clientset for CRD MXJob. + mxJobClientSet mxjobclientset.Interface + + // To allow injection of sync functions for testing. + syncHandler func(string) (bool, error) + + // To allow injection of updateStatus for testing. + updateStatusHandler func(mxjob *mxv1.MXJob) error + + // To allow injection of deleteMXJob for testing. + deleteMXJobHandler func(mxjob *mxv1.MXJob) error + + // mxJobInformer is a temporary field for unstructured informer support. + mxJobInformer cache.SharedIndexInformer + + // Listers for MXJob, Pod and Service + // mxJobLister can list/get mxjobs from the shared informer's store. + mxJobLister mxjoblisters.MXJobLister + + // mxJobInformerSynced returns true if the mxjob store has been synced at least once. + mxJobInformerSynced cache.InformerSynced +} + +// NewMXController returns a new MXJob controller. +func NewMXController( + // This variable is for unstructured informer. + mxJobInformer mxjobinformersv1.MXJobInformer, + kubeClientSet kubeclientset.Interface, + mxJobClientSet mxjobclientset.Interface, + kubeBatchClientSet kubebatchclient.Interface, + kubeInformerFactory kubeinformers.SharedInformerFactory, + // This field is not used now but we keep it since it will be used + // after we support CRD validation. + mxJobInformerFactory mxjobinformers.SharedInformerFactory, + option options.ServerOption) *MXController { + + mxjobscheme.AddToScheme(scheme.Scheme) + + log.Info("Creating MXJob controller") + // Create new MXController. 
+ tc := &MXController{ + mxJobClientSet: mxJobClientSet, + } + + // Create base controller + log.Info("Creating Job controller") + jc := jobcontroller.NewJobController(tc, metav1.Duration{Duration: 15 * time.Second}, + option.EnableGangScheduling, kubeClientSet, kubeBatchClientSet, kubeInformerFactory, mxv1.Plural) + tc.JobController = jc + // Set sync handler. + tc.syncHandler = tc.syncMXJob + tc.updateStatusHandler = tc.updateMXJobStatus + // set delete handler. + tc.deleteMXJobHandler = tc.deleteMXJob + // Set up an event handler for when mxjob resources change. + mxJobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: tc.addMXJob, + UpdateFunc: tc.updateMXJob, + // This will enter the sync loop and no-op, + // because the mxjob has been deleted from the store. + DeleteFunc: tc.enqueueMXJob, + }) + + tc.mxJobInformer = mxJobInformer.Informer() + tc.mxJobLister = mxJobInformer.Lister() + tc.mxJobInformerSynced = mxJobInformer.Informer().HasSynced + + // Create pod informer. + podInformer := kubeInformerFactory.Core().V1().Pods() + + // Set up an event handler for when pod resources change + podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: jc.AddPod, + UpdateFunc: jc.UpdatePod, + DeleteFunc: jc.DeletePod, + }) + + tc.PodLister = podInformer.Lister() + tc.PodInformerSynced = podInformer.Informer().HasSynced + + // Create service informer. + serviceInformer := kubeInformerFactory.Core().V1().Services() + + // Set up an event handler for when service resources change. + serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: jc.AddService, + UpdateFunc: jc.UpdateService, + DeleteFunc: jc.DeleteService, + }) + + tc.ServiceLister = serviceInformer.Lister() + tc.ServiceInformerSynced = serviceInformer.Informer().HasSynced + + return tc +} + +// Run will set up the event handlers for types we are interested in, as well +// as syncing informer caches and starting workers. 
It will block until stopCh
// is closed, at which point it will shutdown the workqueue and wait for
// workers to finish processing their current work items.
func (tc *MXController) Run(threadiness int, stopCh <-chan struct{}) error {
	defer utilruntime.HandleCrash()
	defer tc.WorkQueue.ShutDown()

	// Start the informer factories to begin populating the informer caches.
	log.Info("Starting MXJob controller")

	// Wait for all informer caches (mxjob, pod, service) to be synced before
	// starting workers. WaitForCacheSync is variadic, so a single call keeps
	// the startup barrier in one place instead of three sequential checks.
	log.Info("Waiting for informer caches to sync")
	if ok := cache.WaitForCacheSync(stopCh, tc.mxJobInformerSynced, tc.PodInformerSynced, tc.ServiceInformerSynced); !ok {
		return fmt.Errorf("failed to wait for caches to sync")
	}

	log.Infof("Starting %v workers", threadiness)
	// Launch workers to process MXJob resources.
	for i := 0; i < threadiness; i++ {
		go wait.Until(tc.runWorker, time.Second, stopCh)
	}

	log.Info("Started workers")
	<-stopCh
	log.Info("Shutting down workers")

	return nil
}

// runWorker is a long-running function that will continually call the
// processNextWorkItem function in order to read and process a message on the
// workqueue.
func (tc *MXController) runWorker() {
	for tc.processNextWorkItem() {
	}
}

// processNextWorkItem will read a single work item off the workqueue and
// attempt to process it, by calling the syncHandler.
+func (tc *MXController) processNextWorkItem() bool { + key, quit := tc.WorkQueue.Get() + if quit { + return false + } + defer tc.WorkQueue.Done(key) + + logger := mxlogger.LoggerForKey(key.(string)) + + mxJob, err := tc.getMXJobFromKey(key.(string)) + if err != nil { + if err == errNotExists { + logger.Infof("MXJob has been deleted: %v", key) + return true + } + + // Log the failure to conditions. + logger.Errorf("Failed to get MXJob from key %s: %v", key, err) + if err == errFailedMarshal { + errMsg := fmt.Sprintf("Failed to unmarshal the object to MXJob object: %v", err) + mxlogger.LoggerForJob(mxJob).Warn(errMsg) + tc.Recorder.Event(mxJob, v1.EventTypeWarning, failedMarshalMXJobReason, errMsg) + } + + return true + } + + // Verify + err = tc.inspectMXjob(mxJob) + if err != nil { + errMsg := fmt.Sprintf("Inspect Fail: %v", err) + mxlogger.LoggerForJob(mxJob).Warn(errMsg) + tc.Recorder.Event(mxJob, v1.EventTypeWarning, inspectFailMXJobReason, errMsg) + return true + } + + // Sync MXJob to match the actual state to this desired state. + forget, err := tc.syncHandler(key.(string)) + if err == nil { + if forget { + tc.WorkQueue.Forget(key) + } + return true + } + + utilruntime.HandleError(fmt.Errorf("error syncing mxjob: %v", err)) + tc.WorkQueue.AddRateLimited(key) + + return true +} + +func (tc *MXController) enqueueMXJob(mxjob interface{}) { + key, err := KeyFunc(mxjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("couldn't get key for mxjob object %#v: %v", mxjob, err)) + return + } + + // TODO: we may need add backoff here + tc.WorkQueue.Add(key) +} + +// syncMXJob syncs the mxjob with the given key if it has had its expectations fulfilled, meaning +// it did not expect to see any more of its pods/services created or deleted. +// This function is not meant to be invoked concurrently with the same key. 
+func (tc *MXController) syncMXJob(key string) (bool, error) { + startTime := time.Now() + logger := mxlogger.LoggerForKey(key) + defer func() { + logger.Infof("Finished syncing mxjob %q (%v)", key, time.Since(startTime)) + }() + + namespace, name, err := cache.SplitMetaNamespaceKey(key) + if err != nil { + return false, err + } + if len(namespace) == 0 || len(name) == 0 { + return false, fmt.Errorf("invalid mxjob key %q: either namespace or name is missing", key) + } + + sharedMXJob, err := tc.getMXJobFromName(namespace, name) + if err != nil { + if err == errNotExists { + logger.Infof("MXJob has been deleted: %v", key) + // jm.expectations.DeleteExpectations(key) + return true, nil + } + return false, err + } + + mxjob := sharedMXJob.DeepCopy() + mxjobNeedsSync := tc.satisfiedExpectations(mxjob) + + if tc.Config.EnableGangScheduling { + minAvailableReplicas := getTotalReplicas(mxjob) + _, err := tc.SyncPodGroup(mxjob, minAvailableReplicas) + if err != nil { + logger.Warnf("Sync PodGroup %v: %v", mxjob.Name, err) + } + } + + // Set default for the new mxjob. + scheme.Scheme.Default(mxjob) + + var reconcileMXJobsErr error + if mxjobNeedsSync && mxjob.DeletionTimestamp == nil { + reconcileMXJobsErr = tc.reconcileMXJobs(mxjob) + } + + if reconcileMXJobsErr != nil { + return false, reconcileMXJobsErr + } + + return true, err +} + +func getTotalReplicas(mxjob *mxv1.MXJob) int32 { + mxjobReplicas := int32(0) + for _, r := range mxjob.Spec.MXReplicaSpecs { + mxjobReplicas += *r.Replicas + } + return mxjobReplicas +} + +// reconcileMXJobs checks and updates replicas for each given MXReplicaSpec. +// It will requeue the mxjob in case of an error while creating/deleting pods/services. 
+func (tc *MXController) reconcileMXJobs(mxjob *mxv1.MXJob) error { + logger := mxlogger.LoggerForJob(mxjob) + logger.Infof("Reconcile MXJobs %s", mxjob.Name) + + pods, err := tc.GetPodsForJob(mxjob) + + if err != nil { + logger.Warnf("getPodsForMXJob error %v", err) + return err + } + + services, err := tc.GetServicesForJob(mxjob) + + if err != nil { + logger.Warnf("getServicesForMXJob error %v", err) + return err + } + + // If the MXJob is terminated, delete all pods and services. + if isSucceeded(mxjob.Status) || isFailed(mxjob.Status) { + if err := tc.deletePodsAndServices(mxjob, pods); err != nil { + return err + } + + if err := tc.cleanupMXJob(mxjob); err != nil { + return err + } + + if tc.Config.EnableGangScheduling { + tc.Recorder.Event(mxjob, v1.EventTypeNormal, "JobTerminated", "Job is terminated, deleting PodGroup") + if err := tc.DeletePodGroup(mxjob); err != nil { + tc.Recorder.Eventf(mxjob, v1.EventTypeWarning, "FailedDeletePodGroup", "Error deleting: %v", err) + return err + } else { + tc.Recorder.Eventf(mxjob, v1.EventTypeNormal, "SuccessfulDeletePodGroup", "Deleted PodGroup: %v", mxjob.Name) + } + } + + // Initialize the status. + initializeMXReplicaStatuses(mxjob, mxv1.MXReplicaTypeScheduler) + initializeMXReplicaStatuses(mxjob, mxv1.MXReplicaTypeWorker) + initializeMXReplicaStatuses(mxjob, mxv1.MXReplicaTypeServer) + return tc.updateStatusHandler(mxjob) + } + + // Save the current state of the replicas + replicasStatus := make(map[string]v1.PodPhase) + + // Diff current active pods/services with replicas. 
+ for rtype, spec := range mxjob.Spec.MXReplicaSpecs { + err = tc.reconcilePods(mxjob, pods, rtype, spec, replicasStatus) + if err != nil { + logger.Warnf("reconcilePods error %v", err) + return err + } + + err = tc.reconcileServices(mxjob, services, rtype, spec) + + if err != nil { + logger.Warnf("reconcileServices error %v", err) + return err + } + } + + // TODO(CPH): Add check here, no need to update the mxjob if the status hasn't changed since last time. + return tc.updateStatusHandler(mxjob) +} + +// inspectMXjob make sure a MXjob has all the necessary MXReplicaSpecs members for a special jobMode. +// if not it return err +func (tc *MXController) inspectMXjob(mxjob *mxv1.MXJob) error { + + logger := mxlogger.LoggerForJob(mxjob) + + if mxjob.Spec.JobMode == mxv1.MXTrain { + // Must have MXReplicaTypeScheduler, MXReplicaTypeServer, MXReplicaTypeWorker, shouldn't have + // MXReplicaTypeTuner + if _, ok := mxjob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeScheduler]; !ok { + return errWrongJobMode + } + if _, ok := mxjob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeServer]; !ok { + return errWrongJobMode + } + if _, ok := mxjob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker]; !ok { + return errWrongJobMode + } + } else if mxjob.Spec.JobMode == mxv1.MXTune { + // Must have MXReplicaTypeTuner, shouldn't have MXReplicaTypeScheduler, MXReplicaTypeServer, + // MXReplicaTypeWorker + if _, ok := mxjob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeTunerTracker]; !ok { + return errWrongJobMode + } + if s, ok := mxjob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeTunerServer]; !ok { + return errWrongJobMode + } else if s.Label == "" { + logger.Warnf("MXReplicaTypeTunerRPCServer may need label to set tvm rpc-server key") + } + if _, ok := mxjob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeTuner]; !ok { + return errWrongJobMode + } + } + return nil +} + +// satisfiedExpectations returns true if the required adds/dels for the given mxjob have been observed. 
+// Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller +// manager. +func (tc *MXController) satisfiedExpectations(mxjob *mxv1.MXJob) bool { + satisfied := false + mxjobKey, err := KeyFunc(mxjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("couldn't get key for mxjob object %#v: %v", mxjob, err)) + return false + } + + for rtype := range mxjob.Spec.MXReplicaSpecs { + // Check the expectations of the pods. + expectationPodsKey := jobcontroller.GenExpectationPodsKey(mxjobKey, string(rtype)) + satisfied = satisfied || tc.Expectations.SatisfiedExpectations(expectationPodsKey) + + // Check the expectations of the services. + expectationServicesKey := jobcontroller.GenExpectationServicesKey(mxjobKey, string(rtype)) + satisfied = satisfied || tc.Expectations.SatisfiedExpectations(expectationServicesKey) + } + + return satisfied +} + +func (tc *MXController) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) { + return tc.getMXJobFromName(namespace, name) +} + +func (tc *MXController) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) { + return tc.mxJobClientSet.KubeflowV1beta1().MXJobs(namespace).Get(name, metav1.GetOptions{}) +} + +func (tc *MXController) GetAPIGroupVersionKind() schema.GroupVersionKind { + return mxv1.SchemeGroupVersionKind +} + +func (tc *MXController) GetAPIGroupVersion() schema.GroupVersion { + return mxv1.SchemeGroupVersion +} + +func (tc *MXController) GetGroupNameLabelKey() string { + return labelGroupName +} + +func (tc *MXController) GetJobNameLabelKey() string { + return labelMXJobName +} + +func (tc *MXController) GetGroupNameLabelValue() string { + return mxv1.GroupName +} + +func (tc *MXController) GetReplicaTypeLabelKey() string { + return mxReplicaTypeLabel +} + +func (tc *MXController) GetReplicaIndexLabelKey() string { + return mxReplicaIndexLabel +} + +func (tc *MXController) ControllerName() string { + return 
controllerName +} + +func (tc *MXController) GetJobRoleKey() string { + return labelMXJobRole +} diff --git a/pkg/controller.v1/mxnet/controller_test.go b/pkg/controller.v1/mxnet/controller_test.go new file mode 100644 index 00000000..06d5724d --- /dev/null +++ b/pkg/controller.v1/mxnet/controller_test.go @@ -0,0 +1,456 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a MXJob resource. 
+package mxnet + +import ( + "reflect" + "testing" + "time" + + "k8s.io/api/core/v1" + "k8s.io/api/policy/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + kubeinformers "k8s.io/client-go/informers" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/rest" + "k8s.io/kubernetes/pkg/controller" + + "github.com/golang/protobuf/proto" + "github.com/kubeflow/tf-operator/pkg/control" + + "github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options" + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + mxjobclientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned" + mxjobinformers "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions" + "github.com/kubeflow/mxnet-operator/pkg/common/util/v1/testutil" + batchv1alpha1 "github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1" + kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned" +) + +var ( + mxJobRunning = mxv1.MXJobRunning + mxJobSucceeded = mxv1.MXJobSucceeded +) + +func newMXController( + config *rest.Config, + kubeClientSet kubeclientset.Interface, + mxJobClientSet mxjobclientset.Interface, + kubeBatchClientSet kubebatchclient.Interface, + resyncPeriod controller.ResyncPeriodFunc, + option options.ServerOption, +) ( + *MXController, + kubeinformers.SharedInformerFactory, mxjobinformers.SharedInformerFactory, +) { + kubeInformerFactory := kubeinformers.NewSharedInformerFactory(kubeClientSet, resyncPeriod()) + mxJobInformerFactory := mxjobinformers.NewSharedInformerFactory(mxJobClientSet, resyncPeriod()) + + mxJobInformer := NewUnstructuredMXJobInformer(config, metav1.NamespaceAll) + + ctr := NewMXController(mxJobInformer, kubeClientSet, mxJobClientSet, kubeBatchClientSet, kubeInformerFactory, mxJobInformerFactory, option) + ctr.PodControl = &controller.FakePodControl{} + ctr.ServiceControl = 
&control.FakeServiceControl{} + return ctr, kubeInformerFactory, mxJobInformerFactory +} + +func TestNormalPath(t *testing.T) { + testCases := map[string]struct { + scheduler int + worker int + server int + + // pod setup + ControllerError error + jobKeyForget bool + + pendingSchedulerPods int32 + activeSchedulerPods int32 + succeededSchedulerPods int32 + failedSchedulerPods int32 + + pendingWorkerPods int32 + activeWorkerPods int32 + succeededWorkerPods int32 + failedWorkerPods int32 + + pendingServerPods int32 + activeServerPods int32 + succeededServerPods int32 + failedServerPods int32 + + activeSchedulerServices int32 + activeWorkerServices int32 + activeServerServices int32 + + // expectations + expectedPodCreations int32 + expectedPodDeletions int32 + expectedServiceCreations int32 + + expectedActiveSchedulerPods int32 + expectedSucceededSchedulerPods int32 + expectedFailedSchedulerPods int32 + + expectedActiveWorkerPods int32 + expectedSucceededWorkerPods int32 + expectedFailedWorkerPods int32 + + expectedActiveServerPods int32 + expectedSucceededServerPods int32 + expectedFailedServerPods int32 + + expectedCondition *mxv1.MXJobConditionType + expectedConditionReason string + + // There are some cases that should not check start time since the field should be set in the previous sync loop. 
+ needCheckStartTime bool + }{ + "Distributed TFJob (1 scheduler, 4 workers, 2 servers) is created": { + 1, 4, 2, + nil, true, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, + 7, 0, 7, + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + nil, "", + false, + }, + "Distributed MXJob (1 scheduler, 4 workers, 2 servers) is created and all replicas are pending": { + 1, 4, 2, + nil, true, + 1, 0, 0, 0, + 4, 0, 0, 0, + 2, 0, 0, 0, + 1, 4, 2, + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + nil, "", + false, + }, + "Distributed MXJob (1 scheduler, 4 workers, 2 servers) is created and all replicas are running": { + 1, 4, 2, + nil, true, + 0, 1, 0, 0, + 0, 4, 0, 0, + 0, 2, 0, 0, + 1, 4, 2, + 0, 0, 0, + 1, 0, 0, + 4, 0, 0, + 2, 0, 0, + &mxJobRunning, mxJobRunningReason, + false, + }, + "Distributed MXJob (1 scheduler, 4 workers, 2 servers) is created, 1 scheduler, 2 workers, 1 server are pending": { + 1, 4, 2, + nil, true, + 1, 0, 0, 0, + 2, 0, 0, 0, + 1, 0, 0, 0, + 1, 2, 1, + 3, 0, 3, + 0, 0, 0, + 0, 0, 0, + 0, 0, 0, + nil, "", + false, + }, + "Distributed MXJob (1 scheduler, 4 workers, 2 servers) is succeeded": { + 1, 4, 2, + nil, true, + 0, 0, 1, 0, + 0, 0, 4, 0, + 0, 0, 2, 0, + 1, 4, 2, + 0, 0, 0, + 0, 1, 0, + 0, 4, 0, + 0, 2, 0, + &mxJobSucceeded, mxJobSucceededReason, + false, + }, + } + + for name, tc := range testCases { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. 
+ kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + option := options.ServerOption{} + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, kubeInformerFactory, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, option) + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + + var actual *mxv1.MXJob + ctr.updateStatusHandler = func(mxJob *mxv1.MXJob) error { + actual = mxJob + return nil + } + + // Run the test logic. + mxJob := testutil.NewMXJobWithScheduler(tc.worker, tc.server) + unstructured, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + + podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() + testutil.SetPodsStatuses(podIndexer, mxJob, testutil.LabelScheduler, tc.pendingSchedulerPods, tc.activeSchedulerPods, tc.succeededSchedulerPods, tc.failedSchedulerPods, t) + testutil.SetPodsStatuses(podIndexer, mxJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, t) + testutil.SetPodsStatuses(podIndexer, mxJob, testutil.LabelServer, tc.pendingServerPods, tc.activeServerPods, tc.succeededServerPods, tc.failedServerPods, t) + + serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() + testutil.SetServices(serviceIndexer, mxJob, testutil.LabelScheduler, 
tc.activeSchedulerServices, t) + testutil.SetServices(serviceIndexer, mxJob, testutil.LabelWorker, tc.activeWorkerServices, t) + testutil.SetServices(serviceIndexer, mxJob, testutil.LabelServer, tc.activeServerServices, t) + + forget, err := ctr.syncMXJob(testutil.GetKey(mxJob, t)) + // We need requeue syncJob task if podController error + if tc.ControllerError != nil { + if err == nil { + t.Errorf("%s: Syncing jobs would return error when podController exception", name) + } + } else { + if err != nil { + t.Errorf("%s: unexpected error when syncing jobs %v", name, err) + } + } + if forget != tc.jobKeyForget { + t.Errorf("%s: unexpected forget value. Expected %v, saw %v\n", name, tc.jobKeyForget, forget) + } + + fakePodControl := ctr.PodControl.(*controller.FakePodControl) + fakeServiceControl := ctr.ServiceControl.(*control.FakeServiceControl) + if int32(len(fakePodControl.Templates)) != tc.expectedPodCreations { + t.Errorf("%s: unexpected number of pod creates. Expected %d, saw %d\n", name, tc.expectedPodCreations, len(fakePodControl.Templates)) + } + if int32(len(fakeServiceControl.Templates)) != tc.expectedServiceCreations { + t.Errorf("%s: unexpected number of service creates. Expected %d, saw %d\n", name, tc.expectedServiceCreations, len(fakeServiceControl.Templates)) + } + if int32(len(fakePodControl.DeletePodName)) != tc.expectedPodDeletions { + t.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", name, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) + } + // Each create should have an accompanying ControllerRef. + if len(fakePodControl.ControllerRefs) != int(tc.expectedPodCreations) { + t.Errorf("%s: unexpected number of ControllerRefs. Expected %d, saw %d\n", name, tc.expectedPodCreations, len(fakePodControl.ControllerRefs)) + } + // Make sure the ControllerRefs are correct. 
+ for _, controllerRef := range fakePodControl.ControllerRefs { + if got, want := controllerRef.APIVersion, mxv1.SchemeGroupVersion.String(); got != want { + t.Errorf("controllerRef.APIVersion = %q, want %q", got, want) + } + if got, want := controllerRef.Kind, mxv1.Kind; got != want { + t.Errorf("controllerRef.Kind = %q, want %q", got, want) + } + if got, want := controllerRef.Name, mxJob.Name; got != want { + t.Errorf("controllerRef.Name = %q, want %q", got, want) + } + if got, want := controllerRef.UID, mxJob.UID; got != want { + t.Errorf("controllerRef.UID = %q, want %q", got, want) + } + if controllerRef.Controller == nil || !*controllerRef.Controller { + t.Errorf("controllerRef.Controller is not set to true") + } + } + // Validate scheduler status. + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler] != nil { + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler].Active != tc.expectedActiveSchedulerPods { + t.Errorf("%s: unexpected number of active pods. Expected %d, saw %d\n", name, tc.expectedActiveSchedulerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler].Active) + } + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler].Succeeded != tc.expectedSucceededSchedulerPods { + t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", name, tc.expectedSucceededSchedulerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler].Succeeded) + } + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler].Failed != tc.expectedFailedSchedulerPods { + t.Errorf("%s: unexpected number of failed pods. Expected %d, saw %d\n", name, tc.expectedFailedSchedulerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeScheduler].Failed) + } + } + // Validate worker status. + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker] != nil { + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Active != tc.expectedActiveWorkerPods { + t.Errorf("%s: unexpected number of active pods. 
Expected %d, saw %d\n", name, tc.expectedActiveWorkerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Active) + } + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Succeeded != tc.expectedSucceededWorkerPods { + t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", name, tc.expectedSucceededWorkerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Succeeded) + } + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Failed != tc.expectedFailedWorkerPods { + t.Errorf("%s: unexpected number of failed pods. Expected %d, saw %d\n", name, tc.expectedFailedWorkerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Failed) + } + } + // Validate Server status. + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer] != nil { + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer].Active != tc.expectedActiveServerPods { + t.Errorf("%s: unexpected number of active pods. Expected %d, saw %d\n", name, tc.expectedActiveServerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer].Active) + } + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer].Succeeded != tc.expectedSucceededServerPods { + t.Errorf("%s: unexpected number of succeeded pods. Expected %d, saw %d\n", name, tc.expectedSucceededServerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer].Succeeded) + } + if actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer].Failed != tc.expectedFailedServerPods { + t.Errorf("%s: unexpected number of failed pods. Expected %d, saw %d\n", name, tc.expectedFailedServerPods, actual.Status.MXReplicaStatuses[mxv1.MXReplicaTypeServer].Failed) + } + } + // Validate StartTime. + if tc.needCheckStartTime && actual.Status.StartTime == nil { + t.Errorf("%s: StartTime was not set", name) + } + // Validate conditions. 
+ if tc.expectedCondition != nil && !testutil.CheckCondition(actual, *tc.expectedCondition, tc.expectedConditionReason) { + t.Errorf("%s: expected condition %#v, got %#v", name, *tc.expectedCondition, actual.Status.Conditions) + } + } +} + +func TestRun(t *testing.T) { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. + kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, _, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + + stopCh := make(chan struct{}) + go func() { + // It is a hack to let the controller stop to run without errors. + // We can not just send a struct to stopCh because there are multiple + // receivers in controller.Run. + time.Sleep(testutil.SleepInterval) + stopCh <- struct{}{} + }() + err := ctr.Run(testutil.ThreadCount, stopCh) + if err != nil { + t.Errorf("Failed to run: %v", err) + } +} + +func TestSyncPdb(t *testing.T) { + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + kubeClientSet := fake.NewSimpleClientset() + // Prepare the kube-batch clientset and controller for the test. 
+ kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + option := options.ServerOption{ + EnableGangScheduling: true, + } + ctr, _, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, option) + + type testCase struct { + mxJob *mxv1.MXJob + expectPdb *v1beta1.PodDisruptionBudget + } + + minAvailable2 := intstr.FromInt(2) + testCases := []testCase{ + { + mxJob: &mxv1.MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-sync-pdb", + }, + Spec: mxv1.MXJobSpec{ + MXReplicaSpecs: map[mxv1.MXReplicaType]*mxv1.MXReplicaSpec{ + mxv1.MXReplicaTypeWorker: &mxv1.MXReplicaSpec{ + Replicas: proto.Int32(2), + }, + }, + }, + }, + expectPdb: &v1beta1.PodDisruptionBudget{ + Spec: v1beta1.PodDisruptionBudgetSpec{ + MinAvailable: &minAvailable2, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "mxnet_job_name": "test-sync-pdb", + }, + }, + }, + }, + }, + } + for _, c := range testCases { + pdb, _ := ctr.SyncPdb(c.mxJob, getTotalReplicas(c.mxJob)) + if pdb == nil && c.expectPdb != nil { + t.Errorf("Got nil, want %v", c.expectPdb.Spec) + } + + if pdb != nil && !reflect.DeepEqual(c.expectPdb.Spec, pdb.Spec) { + t.Errorf("Got %+v, want %+v", pdb.Spec, c.expectPdb.Spec) + } + } +} diff --git a/pkg/controller.v1/mxnet/informer.go b/pkg/controller.v1/mxnet/informer.go new file mode 100644 index 00000000..d1d3ed51 --- /dev/null +++ b/pkg/controller.v1/mxnet/informer.go @@ -0,0 +1,125 @@ +package mxnet + +import ( + "fmt" + "time" + + log "github.com/sirupsen/logrus" + metav1unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + restclientset "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + 
"github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/validation" + mxjobinformers "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions" + mxjobinformersv1 "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/mxnet/v1" + "github.com/kubeflow/mxnet-operator/pkg/common/util/v1/unstructured" + mxlogger "github.com/kubeflow/tf-operator/pkg/logger" +) + +const ( + resyncPeriod = 30 * time.Second + failedMarshalMsg = "Failed to marshal the object to MXJob: %v" +) + +var ( + errGetFromKey = fmt.Errorf("failed to get MXJob from key") + errNotExists = fmt.Errorf("the object is not found") + errFailedMarshal = fmt.Errorf("failed to marshal the object to MXJob") + errWrongJobMode = fmt.Errorf("failed to inspect jobMode, maybe mxReplicaSpecs has a member which is not belong to this jobMode or misses one") +) + +func NewUnstructuredMXJobInformer(restConfig *restclientset.Config, namespace string) mxjobinformersv1.MXJobInformer { + dclient, err := dynamic.NewForConfig(restConfig) + if err != nil { + panic(err) + } + resource := schema.GroupVersionResource{ + Group: mxv1.GroupName, + Version: mxv1.GroupVersion, + Resource: mxv1.Plural, + } + informer := unstructured.NewMXJobInformer( + resource, + dclient, + namespace, + resyncPeriod, + cache.Indexers{}, + ) + return informer +} + +// NewMXJobInformer returns MXJobInformer from the given factory. +func (tc *MXController) NewMXJobInformer(mxJobInformerFactory mxjobinformers.SharedInformerFactory) mxjobinformersv1.MXJobInformer { + return mxJobInformerFactory.Kubeflow().V1().MXJobs() +} + +func (tc *MXController) getMXJobFromName(namespace, name string) (*mxv1.MXJob, error) { + key := fmt.Sprintf("%s/%s", namespace, name) + return tc.getMXJobFromKey(key) +} + +func (tc *MXController) getMXJobFromKey(key string) (*mxv1.MXJob, error) { + // Check if the key exists. 
+ obj, exists, err := tc.mxJobInformer.GetIndexer().GetByKey(key) + logger := mxlogger.LoggerForKey(key) + if err != nil { + logger.Errorf("Failed to get MXJob '%s' from informer index: %+v", key, err) + return nil, errGetFromKey + } + if !exists { + // This happens after a mxjob was deleted, but the work queue still had an entry for it. + return nil, errNotExists + } + + mxjob, err := mxJobFromUnstructured(obj) + if err != nil { + return nil, err + } + return mxjob, nil +} + +func mxJobFromUnstructured(obj interface{}) (*mxv1.MXJob, error) { + // Check if the spec is valid. + un, ok := obj.(*metav1unstructured.Unstructured) + if !ok { + log.Errorf("The object in index is not an unstructured; %+v", obj) + return nil, errGetFromKey + } + var mxjob mxv1.MXJob + err := runtime.DefaultUnstructuredConverter.FromUnstructured(un.Object, &mxjob) + logger := mxlogger.LoggerForUnstructured(un, mxv1.Kind) + if err != nil { + logger.Errorf(failedMarshalMsg, err) + return nil, errFailedMarshal + } + // This is a simple validation for MXJob to close + // TODO(gaocegege): Add more validation here. 
+ err = validation.ValidateV1MXJobSpec(&mxjob.Spec) + if err != nil { + logger.Errorf(failedMarshalMsg, err) + return nil, errFailedMarshal + } + return &mxjob, nil +} + +func unstructuredFromMXJob(obj interface{}, mxJob *mxv1.MXJob) error { + un, ok := obj.(*metav1unstructured.Unstructured) + logger := mxlogger.LoggerForJob(mxJob) + if !ok { + logger.Warn("The object in index isn't type Unstructured") + return errGetFromKey + } + + var err error + un.Object, err = runtime.DefaultUnstructuredConverter.ToUnstructured(mxJob) + if err != nil { + logger.Error("The MXJob convert failed") + return err + } + return nil + +} diff --git a/pkg/controller.v1/mxnet/job.go b/pkg/controller.v1/mxnet/job.go new file mode 100644 index 00000000..d8830be4 --- /dev/null +++ b/pkg/controller.v1/mxnet/job.go @@ -0,0 +1,163 @@ +package mxnet + +import ( + "fmt" + "time" + + log "github.com/sirupsen/logrus" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + metav1unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/client-go/kubernetes/scheme" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/mxnet-operator/pkg/util/k8sutil" + mxlogger "github.com/kubeflow/tf-operator/pkg/logger" + "k8s.io/apimachinery/pkg/runtime" +) + +const ( + failedMarshalMXJobReason = "FailedInvalidMXJobSpec" + inspectFailMXJobReason = "InspectFailedInvalidMXReplicaSpec" +) + +// When a pod is added, set the defaults and enqueue the current mxjob. +func (tc *MXController) addMXJob(obj interface{}) { + // Convert from unstructured object. + mxJob, err := mxJobFromUnstructured(obj) + if err != nil { + un, ok := obj.(*metav1unstructured.Unstructured) + logger := &log.Entry{} + if ok { + logger = mxlogger.LoggerForUnstructured(un, mxv1.Kind) + } + logger.Errorf("Failed to convert the MXJob: %v", err) + // Log the failure to conditions. 
+ if err == errFailedMarshal { + errMsg := fmt.Sprintf("Failed to marshal the object to MXJob; the spec is invalid: %v", err) + logger.Warn(errMsg) + // TODO(jlewi): v1 doesn't appear to define an error type. + tc.Recorder.Event(un, v1.EventTypeWarning, failedMarshalMXJobReason, errMsg) + + status := mxv1.MXJobStatus{ + Conditions: []mxv1.MXJobCondition{ + { + Type: mxv1.MXJobFailed, + Status: v1.ConditionTrue, + LastUpdateTime: metav1.Now(), + LastTransitionTime: metav1.Now(), + Reason: failedMarshalMXJobReason, + Message: errMsg, + }, + }, + } + statusMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&status) + if err != nil { + logger.Errorf("Could not covert the MXJobStatus to unstructured; %v", err) + return + } + client, err := k8sutil.NewCRDRestClient(&mxv1.SchemeGroupVersion) + if err == nil { + if err1 := metav1unstructured.SetNestedField(un.Object, statusMap, "status"); err1 != nil { + logger.Errorf("Could not set nested field: %v", err1) + } + logger.Infof("Updating the job to; %+v", un.Object) + err = client.UpdateStatus(un, mxv1.Plural) + if err != nil { + logger.Errorf("Could not update the MXJob; %v", err) + } + } else { + logger.Errorf("Could not create a REST client to update the MXJob") + } + } + return + } + + // Set default for the new mxjob. + scheme.Scheme.Default(mxJob) + + msg := fmt.Sprintf("MXJob %s is created.", mxJob.Name) + logger := mxlogger.LoggerForJob(mxJob) + logger.Info(msg) + + // Add a created condition. + err = updateMXJobConditions(mxJob, mxv1.MXJobCreated, mxJobCreatedReason, msg) + if err != nil { + logger.Errorf("Append mxJob condition error: %v", err) + return + } + + // Convert from mxjob object + err = unstructuredFromMXJob(obj, mxJob) + if err != nil { + logger.Errorf("Failed to convert the obj: %v", err) + return + } + tc.enqueueMXJob(obj) +} + +// When a pod is updated, enqueue the current mxjob. 
+func (tc *MXController) updateMXJob(old, cur interface{}) {
+	oldMXJob, err := mxJobFromUnstructured(old)
+	if err != nil {
+		return
+	}
+	log.Infof("Updating mxjob: %s", oldMXJob.Name)
+	tc.enqueueMXJob(cur)
+}
+
+func (tc *MXController) deletePodsAndServices(mxJob *mxv1.MXJob, pods []*v1.Pod) error {
+	if len(pods) == 0 {
+		return nil
+	}
+
+	// Delete nothing when the cleanPodPolicy is None.
+	if *mxJob.Spec.CleanPodPolicy == mxv1.CleanPodPolicyNone {
+		return nil
+	}
+
+	for _, pod := range pods {
+		if *mxJob.Spec.CleanPodPolicy == mxv1.CleanPodPolicyRunning && pod.Status.Phase != v1.PodRunning {
+			continue
+		}
+
+		if err := tc.PodControl.DeletePod(pod.Namespace, pod.Name, mxJob); err != nil {
+			return err
+		}
+		// Pod and service have the same name, thus the service could be deleted using pod's name.
+		if err := tc.ServiceControl.DeleteService(pod.Namespace, pod.Name, mxJob); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (tc *MXController) cleanupMXJob(mxJob *mxv1.MXJob) error {
+	currentTime := time.Now()
+	ttl := mxJob.Spec.TTLSecondsAfterFinished
+	if ttl == nil {
+		// do nothing if the cleanup delay is not set
+		return nil
+	}
+	duration := time.Second * time.Duration(*ttl)
+	if mxJob.Status.CompletionTime != nil && currentTime.After(mxJob.Status.CompletionTime.Add(duration)) {
+		err := tc.deleteMXJobHandler(mxJob)
+		if err != nil {
+			mxlogger.LoggerForJob(mxJob).Warnf("Cleanup MXJob error: %v.", err)
+			return err
+		}
+		return nil
+	}
+	key, err := KeyFunc(mxJob)
+	if err != nil {
+		mxlogger.LoggerForJob(mxJob).Warnf("Couldn't get key for mxjob object: %v", err)
+		return err
+	}
+	tc.WorkQueue.AddRateLimited(key)
+	return nil
+}
+
+// deleteMXJob deletes the given MXJob.
+func (tc *MXController) deleteMXJob(mxJob *mxv1.MXJob) error {
+	return tc.mxJobClientSet.KubeflowV1().MXJobs(mxJob.Namespace).Delete(mxJob.Name, &metav1.DeleteOptions{})
+}
diff --git a/pkg/controller.v1/mxnet/job_test.go b/pkg/controller.v1/mxnet/job_test.go
new file mode 100644
index 00000000..d8ea91e0
--- /dev/null
+++ b/pkg/controller.v1/mxnet/job_test.go
@@ -0,0 +1,596 @@
+// Copyright 2018 The Kubeflow Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package mxnet
+
+import (
+	"testing"
+	"time"
+
+	"k8s.io/api/core/v1"
+	kubeclientset "k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/record"
+	"k8s.io/kubernetes/pkg/controller"
+
+	"github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options"
+	mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1"
+	mxjobclientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned"
+	"github.com/kubeflow/mxnet-operator/pkg/common/util/v1/testutil"
+	"github.com/kubeflow/tf-operator/pkg/control"
+	batchv1alpha1 "github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1"
+	kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
+)
+
+func TestAddMXJob(t *testing.T) {
+	// Prepare the clientset and controller for the test.
+ kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. + kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, _, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + ctr.Run(testutil.ThreadCount, stopCh) + } + go run(stopCh) + + var key string + syncChan := make(chan string) + ctr.syncHandler = func(mxJobKey string) (bool, error) { + key = mxJobKey + <-syncChan + return true, nil + } + ctr.updateStatusHandler = func(mxjob *mxv1.MXJob) error { + return nil + } + ctr.deleteMXJobHandler = func(mxjob *mxv1.MXJob) error { + return nil + } + + mxJob := testutil.NewMXJob(1, 0) + unstructured, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + ctr.addMXJob(unstructured) + + syncChan <- "sync" + if key != testutil.GetKey(mxJob, t) { + t.Errorf("Failed to enqueue the MXJob %s: expected %s, got %s", mxJob.Name, testutil.GetKey(mxJob, t), key) + } + close(stopCh) +} + +func TestCopyLabelsAndAnnotation(t *testing.T) { + // Prepare the 
clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. + kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, _, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + fakePodControl := &controller.FakePodControl{} + ctr.PodControl = fakePodControl + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + ctr.Run(testutil.ThreadCount, stopCh) + } + go run(stopCh) + + ctr.updateStatusHandler = func(mxJob *mxv1.MXJob) error { + return nil + } + + mxJob := testutil.NewMXJob(1, 0) + annotations := map[string]string{ + "annotation1": "1", + } + labels := map[string]string{ + "label1": "1", + } + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].Template.Labels = labels + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].Template.Annotations = annotations + unstructured, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + + _, err = ctr.syncMXJob(testutil.GetKey(mxJob, t)) + if err != nil { + t.Errorf("%s: unexpected error when syncing jobs 
%v", mxJob.Name, err) + } + + if len(fakePodControl.Templates) != 1 { + t.Errorf("Expected to create 1 pod while got %d", len(fakePodControl.Templates)) + } + actual := fakePodControl.Templates[0] + v, exist := actual.Labels["label1"] + if !exist { + t.Errorf("Labels does not exist") + } + if v != "1" { + t.Errorf("Labels value do not equal") + } + + v, exist = actual.Annotations["annotation1"] + if !exist { + t.Errorf("Annotations does not exist") + } + if v != "1" { + t.Errorf("Annotations value does not equal") + } + + close(stopCh) +} + +func TestDeletePodsAndServices(t *testing.T) { + type testCase struct { + description string + mxJob *mxv1.MXJob + + pendingSchedulerPods int32 + activeSchedulerPods int32 + succeededSchedulerPods int32 + failedSchedulerPods int32 + + pendingWorkerPods int32 + activeWorkerPods int32 + succeededWorkerPods int32 + failedWorkerPods int32 + + pendingServerPods int32 + activeServerPods int32 + succeededServerPods int32 + failedServerPods int32 + + activeSchedulerServices int32 + activeWorkerServices int32 + activeServerServices int32 + + expectedPodDeletions int + } + + testCases := []testCase{ + testCase{ + description: "1 scheduler , 4 workers and 2 server is running, policy is all", + mxJob: testutil.NewMXJobWithCleanPolicy(1, 4, 2, mxv1.CleanPodPolicyAll), + + pendingSchedulerPods: 0, + activeSchedulerPods: 1, + succeededSchedulerPods: 0, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 4, + succeededWorkerPods: 0, + failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 2, + succeededServerPods: 0, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedPodDeletions: 7, + }, + testCase{ + description: "1 scheduler, 4 workers and 2 servers is running, policy is running", + mxJob: testutil.NewMXJobWithCleanPolicy(1, 4, 2, mxv1.CleanPodPolicyRunning), + + pendingSchedulerPods: 0, + activeSchedulerPods: 1, + succeededSchedulerPods: 
0, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 4, + succeededWorkerPods: 0, + failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 2, + succeededServerPods: 0, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedPodDeletions: 7, + }, + testCase{ + description: "1 scheduler, 4 workers and 2 servers is succeeded, policy is running", + mxJob: testutil.NewMXJobWithCleanPolicy(1, 4, 2, mxv1.CleanPodPolicyRunning), + + pendingSchedulerPods: 0, + activeSchedulerPods: 0, + succeededSchedulerPods: 1, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 0, + succeededWorkerPods: 4, + failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 0, + succeededServerPods: 2, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedPodDeletions: 0, + }, + testCase{ + description: "1 scheduler, 4 workers and 2 servers is succeeded, policy is None", + mxJob: testutil.NewMXJobWithCleanPolicy(1, 4, 2, mxv1.CleanPodPolicyNone), + + pendingSchedulerPods: 0, + activeSchedulerPods: 0, + succeededSchedulerPods: 1, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 0, + succeededWorkerPods: 4, + failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 0, + succeededServerPods: 2, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedPodDeletions: 0, + }, + } + for _, tc := range testCases { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. 
+ kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, kubeInformerFactory, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + fakePodControl := &controller.FakePodControl{} + ctr.PodControl = fakePodControl + fakeServiceControl := &control.FakeServiceControl{} + ctr.ServiceControl = fakeServiceControl + ctr.Recorder = &record.FakeRecorder{} + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + ctr.updateStatusHandler = func(mxJob *mxv1.MXJob) error { + return nil + } + + // Set succeeded to run the logic about deleting. 
+ err := updateMXJobConditions(tc.mxJob, mxv1.MXJobSucceeded, mxJobSucceededReason, "") + if err != nil { + t.Errorf("Append mxjob condition error: %v", err) + } + + unstructured, err := testutil.ConvertMXJobToUnstructured(tc.mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + + podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() + testutil.SetPodsStatuses(podIndexer, tc.mxJob, testutil.LabelScheduler, tc.pendingSchedulerPods, tc.activeSchedulerPods, tc.succeededSchedulerPods, tc.failedSchedulerPods, t) + testutil.SetPodsStatuses(podIndexer, tc.mxJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, t) + testutil.SetPodsStatuses(podIndexer, tc.mxJob, testutil.LabelServer, tc.pendingServerPods, tc.activeServerPods, tc.succeededServerPods, tc.failedServerPods, t) + + serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() + testutil.SetServices(serviceIndexer, tc.mxJob, testutil.LabelScheduler, tc.activeSchedulerServices, t) + testutil.SetServices(serviceIndexer, tc.mxJob, testutil.LabelWorker, tc.activeWorkerServices, t) + testutil.SetServices(serviceIndexer, tc.mxJob, testutil.LabelServer, tc.activeServerServices, t) + + forget, err := ctr.syncMXJob(testutil.GetKey(tc.mxJob, t)) + if err != nil { + t.Errorf("%s: unexpected error when syncing jobs %v", tc.description, err) + } + if !forget { + t.Errorf("%s: unexpected forget value. Expected true, saw %v\n", tc.description, forget) + } + + if len(fakePodControl.DeletePodName) != tc.expectedPodDeletions { + t.Errorf("%s: unexpected number of pod deletes. 
Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakePodControl.DeletePodName)) + } + if len(fakeServiceControl.DeleteServiceName) != tc.expectedPodDeletions { + t.Errorf("%s: unexpected number of service deletes. Expected %d, saw %d\n", tc.description, tc.expectedPodDeletions, len(fakeServiceControl.DeleteServiceName)) + } + } +} + +func TestCleanupMXJob(t *testing.T) { + type testCase struct { + description string + mxJob *mxv1.MXJob + + pendingSchedulerPods int32 + activeSchedulerPods int32 + succeededSchedulerPods int32 + failedSchedulerPods int32 + + pendingWorkerPods int32 + activeWorkerPods int32 + succeededWorkerPods int32 + failedWorkerPods int32 + + pendingServerPods int32 + activeServerPods int32 + succeededServerPods int32 + failedServerPods int32 + + activeSchedulerServices int32 + activeWorkerServices int32 + activeServerServices int32 + + expectedDeleteFinished bool + } + + ttlaf0 := int32(0) + ttl0 := &ttlaf0 + ttlaf2s := int32(2) + ttl2s := &ttlaf2s + testCases := []testCase{ + testCase{ + description: "1 scheduler , 4 workers and 2 server is running, TTLSecondsAfterFinished unset", + mxJob: testutil.NewMXJobWithCleanupJobDelay(1, 4, 2, nil), + + pendingSchedulerPods: 0, + activeSchedulerPods: 1, + succeededSchedulerPods: 0, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 4, + succeededWorkerPods: 0, + failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 2, + succeededServerPods: 0, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedDeleteFinished: false, + }, + testCase{ + description: "1 scheduler, 4 workers and 2 servers is running, TTLSecondsAfterFinished is 0", + mxJob: testutil.NewMXJobWithCleanupJobDelay(1, 4, 2, ttl0), + + pendingSchedulerPods: 0, + activeSchedulerPods: 1, + succeededSchedulerPods: 0, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 4, + succeededWorkerPods: 0, + 
failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 2, + succeededServerPods: 0, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedDeleteFinished: true, + }, + testCase{ + description: "1 scheduler, 4 workers and 2 servers is succeeded, TTLSecondsAfterFinished is 2", + mxJob: testutil.NewMXJobWithCleanupJobDelay(1, 4, 2, ttl2s), + + pendingSchedulerPods: 0, + activeSchedulerPods: 0, + succeededSchedulerPods: 1, + failedSchedulerPods: 0, + + pendingWorkerPods: 0, + activeWorkerPods: 0, + succeededWorkerPods: 4, + failedWorkerPods: 0, + + pendingServerPods: 0, + activeServerPods: 0, + succeededServerPods: 2, + failedServerPods: 0, + + activeSchedulerServices: 1, + activeWorkerServices: 4, + activeServerServices: 2, + + expectedDeleteFinished: true, + }, + } + for _, tc := range testCases { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. 
+ kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, kubeInformerFactory, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + fakePodControl := &controller.FakePodControl{} + ctr.PodControl = fakePodControl + fakeServiceControl := &control.FakeServiceControl{} + ctr.ServiceControl = fakeServiceControl + ctr.Recorder = &record.FakeRecorder{} + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + ctr.updateStatusHandler = func(mxJob *mxv1.MXJob) error { + return nil + } + deleteFinished := false + ctr.deleteMXJobHandler = func(mxJob *mxv1.MXJob) error { + deleteFinished = true + return nil + } + + // Set succeeded to run the logic about deleting. 
+ testutil.SetMXJobCompletionTime(tc.mxJob) + + err := updateMXJobConditions(tc.mxJob, mxv1.MXJobSucceeded, mxJobSucceededReason, "") + if err != nil { + t.Errorf("Append mxjob condition error: %v", err) + } + + unstructured, err := testutil.ConvertMXJobToUnstructured(tc.mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + + podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() + testutil.SetPodsStatuses(podIndexer, tc.mxJob, testutil.LabelScheduler, tc.pendingSchedulerPods, tc.activeSchedulerPods, tc.succeededSchedulerPods, tc.failedSchedulerPods, t) + testutil.SetPodsStatuses(podIndexer, tc.mxJob, testutil.LabelWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, t) + testutil.SetPodsStatuses(podIndexer, tc.mxJob, testutil.LabelServer, tc.pendingServerPods, tc.activeServerPods, tc.succeededServerPods, tc.failedServerPods, t) + + serviceIndexer := kubeInformerFactory.Core().V1().Services().Informer().GetIndexer() + testutil.SetServices(serviceIndexer, tc.mxJob, testutil.LabelScheduler, tc.activeSchedulerServices, t) + testutil.SetServices(serviceIndexer, tc.mxJob, testutil.LabelWorker, tc.activeWorkerServices, t) + testutil.SetServices(serviceIndexer, tc.mxJob, testutil.LabelServer, tc.activeServerServices, t) + + ttl := tc.mxJob.Spec.TTLSecondsAfterFinished + if ttl != nil { + dur := time.Second * time.Duration(*ttl) + time.Sleep(dur) + } + + forget, err := ctr.syncMXJob(testutil.GetKey(tc.mxJob, t)) + if err != nil { + t.Errorf("%s: unexpected error when syncing jobs %v", tc.description, err) + } + if !forget { + t.Errorf("%s: unexpected forget value. Expected true, saw %v\n", tc.description, forget) + } + + if deleteFinished != tc.expectedDeleteFinished { + t.Errorf("%s: unexpected status. 
Expected %v, saw %v", tc.description, tc.expectedDeleteFinished, deleteFinished) + } + } +} diff --git a/pkg/controller.v1/mxnet/mxnet.go b/pkg/controller.v1/mxnet/mxnet.go new file mode 100644 index 00000000..ecb93d9d --- /dev/null +++ b/pkg/controller.v1/mxnet/mxnet.go @@ -0,0 +1,122 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a MXJob resource. +package mxnet + +import ( + "fmt" + "strconv" + "strings" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/tf-operator/pkg/common/jobcontroller" +) + +// MXConfig is a struct representing the distributed Mxnet config. +// This struct is turned into an environment variable MX_CONFIG +// which is used by Mxnet processes to configure themselves. +type MXConfig struct { + // Cluster represents a Mxnet ClusterSpec. + Cluster ClusterSpec `json:"cluster"` + // Labels include all label of task. + Labels LabelsSpec `json:"labels"` + // Task include information of current node. + Task TaskSpec `json:"task"` +} + +// ClusterSpec represents a cluster Mxnet specification. +type ClusterSpec map[string][]Url_Port + +type Url_Port struct { + Url string `json:"url"` + Port int `json:"port"` +} + +// LabelsSpec represents a label specification. +type LabelsSpec map[string]string + +// TaskSpec is the specification for a task (server or worker ...) of the MXJob. 
+type TaskSpec struct { + Type string `json:"type"` + Index int `json:"index"` +} + +func genMXConfig(mxjob *mxv1.MXJob, rtype, index string) (MXConfig, error) { + // Configure the MXCONFIG environment variable. + i, err := strconv.ParseInt(index, 0, 32) + if err != nil { + return MXConfig{}, err + } + + cluster, err := genClusterSpec(mxjob) + if err != nil { + return MXConfig{}, err + } + + labels, err := genLabelsSpec(mxjob) + if err != nil { + return MXConfig{}, err + } + + mxConfig := MXConfig{ + Cluster: cluster, + Labels: labels, + Task: TaskSpec{ + Type: rtype, + Index: int(i), + }, + } + + return mxConfig, nil +} + +// genClusterSpec will generate ClusterSpec. +func genClusterSpec(mxjob *mxv1.MXJob) (ClusterSpec, error) { + clusterSpec := make(ClusterSpec) + + for rtype, spec := range mxjob.Spec.MXReplicaSpecs { + rt := strings.ToLower(string(rtype)) + replicaNames := make([]Url_Port, 0, *spec.Replicas) + + port, err := GetPortFromMXJob(mxjob, rtype) + if err != nil { + return nil, err + } + for i := int32(0); i < *spec.Replicas; i++ { + host := Url_Port{ + Url: jobcontroller.GenGeneralName(mxjob.Name, rt, fmt.Sprintf("%d", i)), + Port: int(port), + } + replicaNames = append(replicaNames, host) + } + + clusterSpec[rt] = replicaNames + } + + return clusterSpec, nil +} + +// genLabelsSpec will generate LabelsSpec. +func genLabelsSpec(mxjob *mxv1.MXJob) (LabelsSpec, error) { + labelsSpec := make(LabelsSpec) + + for rtype, spec := range mxjob.Spec.MXReplicaSpecs { + rt := strings.ToLower(string(rtype)) + + labelsSpec[rt] = spec.Label + } + + return labelsSpec, nil +} diff --git a/pkg/controller.v1/mxnet/pod.go b/pkg/controller.v1/mxnet/pod.go new file mode 100644 index 00000000..ad7f3407 --- /dev/null +++ b/pkg/controller.v1/mxnet/pod.go @@ -0,0 +1,290 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a MXJob resource. +package mxnet + +import ( + "encoding/json" + "fmt" + "strconv" + "strings" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/tf-operator/pkg/common/jobcontroller" + mxlogger "github.com/kubeflow/tf-operator/pkg/logger" + train_util "github.com/kubeflow/tf-operator/pkg/util/train" +) + +const ( + // gang scheduler name. + gangSchedulerName = "kube-batch" + + // mxConfig is the environment variable name of MXNet cluster spec. + mxConfig = "MX_CONFIG" + + // podTemplateRestartPolicyReason is the warning reason when the restart + // policy is set in pod template. + podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy" + // exitedWithCodeReason is the normal reason when the pod is exited because of the exit code. + exitedWithCodeReason = "ExitedWithCode" + // podTemplateSchedulerNameReason is the warning reason when other scheduler name is set + // in pod templates with gang-scheduling enabled + podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName" +) + +// reconcilePods checks and updates pods for each given MXReplicaSpec. +// It will requeue the mxjob in case of an error while creating/deleting pods. 
+func (tc *MXController) reconcilePods( + mxjob *mxv1.MXJob, + pods []*v1.Pod, + rtype mxv1.MXReplicaType, + spec *mxv1.MXReplicaSpec, rstatus map[string]v1.PodPhase) error { + + // Convert MXReplicaType to lower string. + rt := strings.ToLower(string(rtype)) + logger := mxlogger.LoggerForReplica(mxjob, rt) + // Get all pods for the type rt. + pods, err := tc.FilterPodsForReplicaType(pods, rt) + if err != nil { + return err + } + replicas := int(*spec.Replicas) + restart := false + schedulerCompleted := false + + initializeMXReplicaStatuses(mxjob, rtype) + + podSlices := tc.GetPodSlices(pods, replicas, logger) + for index, podSlice := range podSlices { + if len(podSlice) > 1 { + logger.Warningf("We have too many pods for %s %d", rt, index) + // TODO(gaocegege): Kill some pods. + } else if len(podSlice) == 0 { + logger.Infof("Need to create new pod: %s-%d", rt, index) + err = tc.createNewPod(mxjob, rt, strconv.Itoa(index), spec) + if err != nil { + return err + } + } else { + // Check the status of the current pod. + pod := podSlice[0] + // Get the exit code of the mxnet container. + var exitCode int32 = 0xbeef // magic number + for _, status := range pod.Status.ContainerStatuses { + state := status.State + if status.Name == mxv1.DefaultContainerName && state.Terminated != nil { + exitCode = state.Terminated.ExitCode + logger.Infof("Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode) + tc.Recorder.Eventf(mxjob, v1.EventTypeNormal, exitedWithCodeReason, "Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode) + } + } + // Check if the pod is retryable. + if spec.RestartPolicy == mxv1.RestartPolicyExitCode { + if pod.Status.Phase == v1.PodFailed && train_util.IsRetryableExitCode(exitCode) { + logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name) + if err := tc.PodControl.DeletePod(pod.Namespace, pod.Name, mxjob); err != nil { + return err + } + restart = true + } + } + + // Check whether scheduler is exited without error. 
+ if rtype == mxv1.MXReplicaTypeScheduler && exitCode == 0 { + schedulerCompleted = true + } + updateMXJobReplicaStatuses(mxjob, rtype, pod) + } + } + + return updateStatusSingle(mxjob, rtype, replicas, restart, schedulerCompleted) +} + +// createNewPod creates a new pod for the given index and type. +func (tc *MXController) createNewPod(mxjob *mxv1.MXJob, rt, index string, spec *mxv1.MXReplicaSpec) error { + mxjobKey, err := KeyFunc(mxjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("couldn't get key for mxjob object %#v: %v", mxjob, err)) + return err + } + expectationPodsKey := jobcontroller.GenExpectationPodsKey(mxjobKey, rt) + err = tc.Expectations.ExpectCreations(expectationPodsKey, 1) + if err != nil { + return err + } + logger := mxlogger.LoggerForReplica(mxjob, rt) + // Create OwnerReference. + controllerRef := tc.GenOwnerReference(mxjob) + + // Set type and index for the worker. + labels := tc.GenLabels(mxjob.Name) + labels[mxReplicaTypeLabel] = rt + labels[mxReplicaIndexLabel] = index + + podTemplate := spec.Template.DeepCopy() + + // Set name for the template. + podTemplate.Name = jobcontroller.GenGeneralName(mxjob.Name, rt, index) + + if podTemplate.Labels == nil { + podTemplate.Labels = make(map[string]string) + } + + for key, value := range labels { + podTemplate.Labels[key] = value + } + + if err := setClusterSpec(podTemplate, mxjob, rt, index); err != nil { + return err + } + + // Submit a warning event if the user specifies restart policy for + // the pod template. We recommend to set it from the replica level. + if podTemplate.Spec.RestartPolicy != v1.RestartPolicy("") { + errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec" + logger.Warning(errMsg) + tc.Recorder.Event(mxjob, v1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg) + } + setRestartPolicy(podTemplate, spec) + + // if gang-scheduling is enabled: + // 1. 
if user has specified other scheduler, we report a warning without overriding any fields. + // 2. if no SchedulerName is set for pods, then we set the SchedulerName to "kube-batch". + if tc.Config.EnableGangScheduling { + if isNonGangSchedulerSet(mxjob) { + errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten" + logger.Warning(errMsg) + tc.Recorder.Event(mxjob, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg) + } else { + podTemplate.Spec.SchedulerName = gangSchedulerName + } + } + + err = tc.PodControl.CreatePodsWithControllerRef(mxjob.Namespace, podTemplate, mxjob, controllerRef) + if err != nil && errors.IsTimeout(err) { + // Pod is created but its initialization has timed out. + // If the initialization is successful eventually, the + // controller will observe the creation via the informer. + // If the initialization fails, or if the pod keeps + // uninitialized for a long time, the informer will not + // receive any update, and the controller will create a new + // pod when the expectation expires. + return nil + } else if err != nil { + return err + } + return nil +} + +func setClusterSpec(podTemplateSpec *v1.PodTemplateSpec, mxjob *mxv1.MXJob, rt, index string) error { + + // Generate MX_CONFIG JSON. + mxConfigData, err := genMXConfig(mxjob, rt, index) + if err != nil { + return err + } + + // Generate MX_CONFIG JSON Str. + mxConfigJson, err := json.Marshal(mxConfigData) + if err != nil { + return err + } + + // Add MX_CONFIG environment variable. 
+	for i := range podTemplateSpec.Spec.Containers {
+
+		c := &podTemplateSpec.Spec.Containers[i]
+
+		// Set environment variable MX_CONFIG
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  mxConfig,
+			Value: string(mxConfigJson),
+		})
+
+		// Set MXNet Distributed Training environment variable
+		// We get these envs from MX_CONFIG to make them stay identical
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  "DMLC_PS_ROOT_PORT",
+			Value: strconv.Itoa(getConfigAddr(&mxConfigData, mxv1.MXReplicaTypeScheduler, 0).Port),
+		})
+
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  "DMLC_PS_ROOT_URI",
+			Value: getConfigAddr(&mxConfigData, mxv1.MXReplicaTypeScheduler, 0).Url,
+		})
+
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  "DMLC_NUM_SERVER",
+			Value: strconv.Itoa(getConfigReplica(&mxConfigData, mxv1.MXReplicaTypeServer)),
+		})
+
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  "DMLC_NUM_WORKER",
+			Value: strconv.Itoa(getConfigReplica(&mxConfigData, mxv1.MXReplicaTypeWorker)),
+		})
+
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  "DMLC_ROLE",
+			Value: mxConfigData.Task.Type,
+		})
+
+		c.Env = append(c.Env, v1.EnvVar{
+			Name:  "DMLC_USE_KUBERNETES",
+			Value: strconv.Itoa(1),
+		})
+	}
+	return nil
+}
+
+func setRestartPolicy(podTemplateSpec *v1.PodTemplateSpec, spec *mxv1.MXReplicaSpec) {
+	if spec.RestartPolicy == mxv1.RestartPolicyExitCode {
+		podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicyNever
+	} else {
+		podTemplateSpec.Spec.RestartPolicy = v1.RestartPolicy(spec.RestartPolicy)
+	}
+}
+
+func getConfigAddr(mxConfigData *MXConfig, rtype mxv1.MXReplicaType, index int) Url_Port {
+	rt := strings.ToLower(string(rtype))
+	var url_port Url_Port
+	if len(mxConfigData.Cluster[rt]) <= index {
+		// index out of range, maybe this URL doesn't exist
+		url_port = Url_Port{
+			Url:  "",
+			Port: 0,
+		}
+	} else {
+		url_port = mxConfigData.Cluster[rt][index]
+	}
+	return url_port
+}
+
+func getConfigReplica(mxConfigData *MXConfig, rtype mxv1.MXReplicaType) int {
+	rt := strings.ToLower(string(rtype))
+	return len(mxConfigData.Cluster[rt])
+}
+
+func isNonGangSchedulerSet(job *mxv1.MXJob) bool {
+	for _, spec := range job.Spec.MXReplicaSpecs {
+		if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName != gangSchedulerName {
+			return true
+		}
+	}
+	return false
+}
diff --git a/pkg/controller.v1/mxnet/pod_test.go b/pkg/controller.v1/mxnet/pod_test.go
new file mode 100644
index 00000000..d596ae9d
--- /dev/null
+++ b/pkg/controller.v1/mxnet/pod_test.go
@@ -0,0 +1,238 @@
+// Copyright 2018 The Kubeflow Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package controller provides a Kubernetes controller for a MXJob resource.
+package mxnet
+
+import (
+	"testing"
+
+	v1 "k8s.io/api/core/v1"
+	kubeclientset "k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/kubernetes/pkg/controller"
+
+	"github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options"
+	mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1"
+	mxjobclientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned"
+	"github.com/kubeflow/mxnet-operator/pkg/common/util/v1/testutil"
+	batchv1alpha1 "github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1"
+	kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
+)
+
+func TestAddPod(t *testing.T) {
+	// Prepare the clientset and controller for the test.
+ kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. + kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, _, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + ctr.Run(testutil.ThreadCount, stopCh) + } + go run(stopCh) + + var key string + syncChan := make(chan string) + ctr.syncHandler = func(mxJobKey string) (bool, error) { + key = mxJobKey + <-syncChan + return true, nil + } + + mxJob := testutil.NewMXJob(1, 0) + unstructured, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + pod := testutil.NewPod(mxJob, testutil.LabelWorker, 0, t) + ctr.AddPod(pod) + + syncChan <- "sync" + if key != testutil.GetKey(mxJob, t) { + t.Errorf("Failed to enqueue the MXJob %s: expected %s, got %s", mxJob.Name, testutil.GetKey(mxJob, t), key) + } + close(stopCh) +} + +func TestRestartPolicy(t *testing.T) { + type tc struct { + mxJob *mxv1.MXJob + expectedRestartPolicy v1.RestartPolicy + expectedType mxv1.MXReplicaType + } + testCase 
:= []tc{ + func() tc { + mxJob := testutil.NewMXJob(1, 0) + specRestartPolicy := mxv1.RestartPolicyExitCode + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].RestartPolicy = specRestartPolicy + return tc{ + mxJob: mxJob, + expectedRestartPolicy: v1.RestartPolicyNever, + expectedType: mxv1.MXReplicaTypeWorker, + } + }(), + func() tc { + mxJob := testutil.NewMXJob(1, 0) + specRestartPolicy := mxv1.RestartPolicyNever + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].RestartPolicy = specRestartPolicy + return tc{ + mxJob: mxJob, + expectedRestartPolicy: v1.RestartPolicyNever, + expectedType: mxv1.MXReplicaTypeWorker, + } + }(), + func() tc { + mxJob := testutil.NewMXJob(1, 0) + specRestartPolicy := mxv1.RestartPolicyAlways + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].RestartPolicy = specRestartPolicy + return tc{ + mxJob: mxJob, + expectedRestartPolicy: v1.RestartPolicyAlways, + expectedType: mxv1.MXReplicaTypeWorker, + } + }(), + func() tc { + mxJob := testutil.NewMXJob(1, 0) + specRestartPolicy := mxv1.RestartPolicyOnFailure + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].RestartPolicy = specRestartPolicy + return tc{ + mxJob: mxJob, + expectedRestartPolicy: v1.RestartPolicyOnFailure, + expectedType: mxv1.MXReplicaTypeWorker, + } + }(), + } + for _, c := range testCase { + spec := c.mxJob.Spec.MXReplicaSpecs[c.expectedType] + podTemplate := spec.Template + setRestartPolicy(&podTemplate, spec) + if podTemplate.Spec.RestartPolicy != c.expectedRestartPolicy { + t.Errorf("Expected %s, got %s", c.expectedRestartPolicy, podTemplate.Spec.RestartPolicy) + } + } +} + +func TestExitCode(t *testing.T) { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. 
+ kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, kubeInformerFactory, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + fakePodControl := &controller.FakePodControl{} + ctr.PodControl = fakePodControl + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + podIndexer := kubeInformerFactory.Core().V1().Pods().Informer().GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + ctr.Run(testutil.ThreadCount, stopCh) + } + go run(stopCh) + + ctr.updateStatusHandler = func(mxJob *mxv1.MXJob) error { + return nil + } + + mxJob := testutil.NewMXJob(1, 0) + mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].RestartPolicy = mxv1.RestartPolicyExitCode + unstructured, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + pod := testutil.NewPod(mxJob, testutil.LabelWorker, 0, t) + pod.Status.Phase = v1.PodFailed + pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{}) + pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, v1.ContainerStatus{ + Name: mxv1.DefaultContainerName, + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 130, + }, + }, + }) + + if err := podIndexer.Add(pod); err != nil { + t.Errorf("%s: unexpected error when adding pod 
%v", mxJob.Name, err) + } + _, err = ctr.syncMXJob(testutil.GetKey(mxJob, t)) + if err != nil { + t.Errorf("%s: unexpected error when syncing jobs %v", mxJob.Name, err) + } + + found := false + for _, deletedPodName := range fakePodControl.DeletePodName { + if deletedPodName == pod.Name { + found = true + } + } + if !found { + t.Errorf("Failed to delete pod %s", pod.Name) + } + close(stopCh) +} diff --git a/pkg/controller.v1/mxnet/service.go b/pkg/controller.v1/mxnet/service.go new file mode 100644 index 00000000..3db74c3d --- /dev/null +++ b/pkg/controller.v1/mxnet/service.go @@ -0,0 +1,127 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a MXJob resource. +package mxnet + +import ( + "fmt" + "strconv" + "strings" + + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/tf-operator/pkg/common/jobcontroller" + mxlogger "github.com/kubeflow/tf-operator/pkg/logger" +) + +// reconcileServices checks and updates services for each given MXReplicaSpec. +// It will requeue the mxjob in case of an error while creating/deleting services. 
+func (tc *MXController) reconcileServices( + mxjob *mxv1.MXJob, + services []*v1.Service, + rtype mxv1.MXReplicaType, + spec *mxv1.MXReplicaSpec) error { + + // Convert MXReplicaType to lower string. + rt := strings.ToLower(string(rtype)) + + replicas := int(*spec.Replicas) + // Get all services for the type rt. + services, err := tc.FilterServicesForReplicaType(services, rt) + if err != nil { + return err + } + + serviceSlices := tc.GetServiceSlices(services, replicas, mxlogger.LoggerForReplica(mxjob, rt)) + + for index, serviceSlice := range serviceSlices { + if len(serviceSlice) > 1 { + mxlogger.LoggerForReplica(mxjob, rt).Warningf("We have too many services for %s %d", rt, index) + // TODO(gaocegege): Kill some services. + } else if len(serviceSlice) == 0 { + mxlogger.LoggerForReplica(mxjob, rt).Infof("need to create new service: %s-%d", rt, index) + err = tc.createNewService(mxjob, rtype, strconv.Itoa(index), spec) + if err != nil { + return err + } + } + } + + return nil +} + +// createNewService creates a new service for the given index and type. +func (tc *MXController) createNewService(mxjob *mxv1.MXJob, rtype mxv1.MXReplicaType, index string, spec *mxv1.MXReplicaSpec) error { + mxjobKey, err := KeyFunc(mxjob) + if err != nil { + utilruntime.HandleError(fmt.Errorf("couldn't get key for mxjob object %#v: %v", mxjob, err)) + return err + } + + // Convert MXReplicaType to lower string. + rt := strings.ToLower(string(rtype)) + expectationServicesKey := jobcontroller.GenExpectationServicesKey(mxjobKey, rt) + err = tc.Expectations.ExpectCreations(expectationServicesKey, 1) + if err != nil { + return err + } + + // Create OwnerReference. + controllerRef := tc.GenOwnerReference(mxjob) + + // Append mxReplicaTypeLabel and mxReplicaIndexLabel labels. 
+ labels := tc.GenLabels(mxjob.Name) + labels[mxReplicaTypeLabel] = rt + labels[mxReplicaIndexLabel] = index + + port, err := GetPortFromMXJob(mxjob, rtype) + if err != nil { + return err + } + + service := &v1.Service{ + Spec: v1.ServiceSpec{ + ClusterIP: "None", + Selector: labels, + Ports: []v1.ServicePort{ + { + Name: mxv1.DefaultPortName, + Port: port, + }, + }, + }, + } + + service.Name = jobcontroller.GenGeneralName(mxjob.Name, rt, index) + service.Labels = labels + + err = tc.ServiceControl.CreateServicesWithControllerRef(mxjob.Namespace, service, mxjob, controllerRef) + if err != nil && errors.IsTimeout(err) { + // Service is created but its initialization has timed out. + // If the initialization is successful eventually, the + // controller will observe the creation via the informer. + // If the initialization fails, or if the service keeps + // uninitialized for a long time, the informer will not + // receive any update, and the controller will create a new + // service when the expectation expires. + return nil + } else if err != nil { + return err + } + return nil +} diff --git a/pkg/controller.v1/mxnet/service_test.go b/pkg/controller.v1/mxnet/service_test.go new file mode 100644 index 00000000..dfe0345d --- /dev/null +++ b/pkg/controller.v1/mxnet/service_test.go @@ -0,0 +1,95 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package controller provides a Kubernetes controller for a MXJob resource. +package mxnet + +import ( + "testing" + + "k8s.io/api/core/v1" + kubeclientset "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/kubernetes/pkg/controller" + + "github.com/kubeflow/mxnet-operator/cmd/mxnet-operator.v1/app/options" + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + mxjobclientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned" + "github.com/kubeflow/mxnet-operator/pkg/common/util/v1/testutil" + batchv1alpha1 "github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1" + kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned" +) + +func TestAddService(t *testing.T) { + // Prepare the clientset and controller for the test. + kubeClientSet := kubeclientset.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &v1.SchemeGroupVersion, + }, + }, + ) + // Prepare the kube-batch clientset and controller for the test. 
+ kubeBatchClientSet := kubebatchclient.NewForConfigOrDie(&rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &batchv1alpha1.SchemeGroupVersion, + }, + }, + ) + config := &rest.Config{ + Host: "", + ContentConfig: rest.ContentConfig{ + GroupVersion: &mxv1.SchemeGroupVersion, + }, + } + mxJobClientSet := mxjobclientset.NewForConfigOrDie(config) + ctr, _, _ := newMXController(config, kubeClientSet, mxJobClientSet, kubeBatchClientSet, controller.NoResyncPeriodFunc, options.ServerOption{}) + ctr.mxJobInformerSynced = testutil.AlwaysReady + ctr.PodInformerSynced = testutil.AlwaysReady + ctr.ServiceInformerSynced = testutil.AlwaysReady + mxJobIndexer := ctr.mxJobInformer.GetIndexer() + + stopCh := make(chan struct{}) + run := func(<-chan struct{}) { + ctr.Run(testutil.ThreadCount, stopCh) + } + go run(stopCh) + + var key string + syncChan := make(chan string) + ctr.syncHandler = func(mxJobKey string) (bool, error) { + key = mxJobKey + <-syncChan + return true, nil + } + + mxJob := testutil.NewMXJob(1, 0) + unstructured, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Failed to convert the MXJob to Unstructured: %v", err) + } + + if err := mxJobIndexer.Add(unstructured); err != nil { + t.Errorf("Failed to add mxjob to mxJobIndexer: %v", err) + } + service := testutil.NewService(mxJob, testutil.LabelWorker, 0, t) + ctr.AddService(service) + + syncChan <- "sync" + if key != testutil.GetKey(mxJob, t) { + t.Errorf("Failed to enqueue the MXJob %s: expected %s, got %s", mxJob.Name, testutil.GetKey(mxJob, t), key) + } + close(stopCh) +} diff --git a/pkg/controller.v1/mxnet/status.go b/pkg/controller.v1/mxnet/status.go new file mode 100644 index 00000000..61aa3aec --- /dev/null +++ b/pkg/controller.v1/mxnet/status.go @@ -0,0 +1,249 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package controller provides a Kubernetes controller for a MXJob resource.
+package mxnet
+
+import (
+	"fmt"
+
+	"k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1"
+	mxlogger "github.com/kubeflow/tf-operator/pkg/logger"
+)
+
+const (
+	// mxJobCreatedReason is added in a mxjob when it is created.
+	mxJobCreatedReason = "MXJobCreated"
+	// mxJobSucceededReason is added in a mxjob when it is succeeded.
+	mxJobSucceededReason = "MXJobSucceeded"
+	// mxJobRunningReason is added in a mxjob when it is running.
+	mxJobRunningReason = "MXJobRunning"
+	// mxJobFailedReason is added in a mxjob when it is failed.
+	mxJobFailedReason = "MXJobFailed"
+	// mxJobRestartingReason is added in a mxjob when it is restarting.
+	mxJobRestartingReason = "MXJobRestarting"
+)
+
+// updateStatusSingle updates the status of the mxjob for a single replica type.
+func updateStatusSingle(mxjob *mxv1.MXJob, rtype mxv1.MXReplicaType, replicas int, restart, schedulerCompleted bool) error {
+	// Expect to have `replicas - succeeded` pods alive.
+	expected := replicas - int(mxjob.Status.MXReplicaStatuses[rtype].Succeeded)
+	running := int(mxjob.Status.MXReplicaStatuses[rtype].Active)
+	failed := int(mxjob.Status.MXReplicaStatuses[rtype].Failed)
+
+	mxlogger.LoggerForJob(mxjob).Infof("MXJob=%s, ReplicaType=%s expected=%d, running=%d, failed=%d",
+		mxjob.Name, rtype, expected, running, failed)
+	// All workers are running, set StartTime.
+ if running == replicas && mxjob.Status.StartTime == nil { + now := metav1.Now() + mxjob.Status.StartTime = &now + } + + if ContainSchedulerSpec(mxjob) { + if rtype == mxv1.MXReplicaTypeScheduler { + if running > 0 { + msg := fmt.Sprintf("MXJob %s is running.", mxjob.Name) + err := updateMXJobConditions(mxjob, mxv1.MXJobRunning, mxJobRunningReason, msg) + if err != nil { + mxlogger.LoggerForJob(mxjob).Infof("Append mxjob condition error: %v", err) + return err + } + } + if expected == 0 { + msg := fmt.Sprintf("MXJob %s is successfully completed.", mxjob.Name) + if mxjob.Status.CompletionTime == nil { + now := metav1.Now() + mxjob.Status.CompletionTime = &now + } + err := updateMXJobConditions(mxjob, mxv1.MXJobSucceeded, mxJobSucceededReason, msg) + if err != nil { + mxlogger.LoggerForJob(mxjob).Infof("Append mxjob condition error: %v", err) + return err + } + } + } + } else { + if rtype == mxv1.MXReplicaTypeWorker || rtype == mxv1.MXReplicaTypeTuner { + // All workers are succeeded or scheduler completed, leave a succeeded condition. + if expected == 0 || schedulerCompleted { + msg := fmt.Sprintf("MXJob %s is successfully completed.", mxjob.Name) + if mxjob.Status.CompletionTime == nil { + now := metav1.Now() + mxjob.Status.CompletionTime = &now + } + err := updateMXJobConditions(mxjob, mxv1.MXJobSucceeded, mxJobSucceededReason, msg) + if err != nil { + mxlogger.LoggerForJob(mxjob).Infof("Append mxjob condition error: %v", err) + return err + } + } else if running > 0 { + // Some workers are still running, leave a running condition. 
+ msg := fmt.Sprintf("MXJob %s is running.", mxjob.Name) + err := updateMXJobConditions(mxjob, mxv1.MXJobRunning, mxJobRunningReason, msg) + if err != nil { + mxlogger.LoggerForJob(mxjob).Infof("Append mxjob condition error: %v", err) + return err + } + } + } + } + + if failed > 0 { + if restart { + msg := fmt.Sprintf("MXJob %s is restarting.", mxjob.Name) + err := updateMXJobConditions(mxjob, mxv1.MXJobRestarting, mxJobRestartingReason, msg) + if err != nil { + mxlogger.LoggerForJob(mxjob).Infof("Append mxjob condition error: %v", err) + return err + } + } else { + msg := fmt.Sprintf("MXJob %s is failed.", mxjob.Name) + if mxjob.Status.CompletionTime == nil { + now := metav1.Now() + mxjob.Status.CompletionTime = &now + } + err := updateMXJobConditions(mxjob, mxv1.MXJobFailed, mxJobFailedReason, msg) + if err != nil { + mxlogger.LoggerForJob(mxjob).Infof("Append mxjob condition error: %v", err) + return err + } + } + } + return nil +} + +// updateMXJobStatus updates the status of the given MXJob. +func (tc *MXController) updateMXJobStatus(mxjob *mxv1.MXJob) error { + _, err := tc.mxJobClientSet.KubeflowV1().MXJobs(mxjob.Namespace).UpdateStatus(mxjob) + return err +} + +// updateMXJobConditions updates the conditions of the given mxjob. +func updateMXJobConditions(mxjob *mxv1.MXJob, conditionType mxv1.MXJobConditionType, reason, message string) error { + condition := newCondition(conditionType, reason, message) + setCondition(&mxjob.Status, condition) + return nil +} + +// initializeMXReplicaStatuses initializes the MXReplicaStatuses for replica. +func initializeMXReplicaStatuses(mxjob *mxv1.MXJob, rtype mxv1.MXReplicaType) { + if mxjob.Status.MXReplicaStatuses == nil { + mxjob.Status.MXReplicaStatuses = make(map[mxv1.MXReplicaType]*mxv1.MXReplicaStatus) + } + + mxjob.Status.MXReplicaStatuses[rtype] = &mxv1.MXReplicaStatus{} +} + +// updateMXJobReplicaStatuses updates the MXJobReplicaStatuses according to the pod. 
+func updateMXJobReplicaStatuses(mxjob *mxv1.MXJob, rtype mxv1.MXReplicaType, pod *v1.Pod) {
+	switch pod.Status.Phase {
+	case v1.PodRunning:
+		mxjob.Status.MXReplicaStatuses[rtype].Active++
+	case v1.PodSucceeded:
+		mxjob.Status.MXReplicaStatuses[rtype].Succeeded++
+	case v1.PodFailed:
+		mxjob.Status.MXReplicaStatuses[rtype].Failed++
+	}
+}
+
+// newCondition creates a new mxjob condition.
+func newCondition(conditionType mxv1.MXJobConditionType, reason, message string) mxv1.MXJobCondition {
+	return mxv1.MXJobCondition{
+		Type:               conditionType,
+		Status:             v1.ConditionTrue,
+		LastUpdateTime:     metav1.Now(),
+		LastTransitionTime: metav1.Now(),
+		Reason:             reason,
+		Message:            message,
+	}
+}
+
+// getCondition returns the most recently added condition; condType is currently unused.
+func getCondition(status mxv1.MXJobStatus, condType mxv1.MXJobConditionType) *mxv1.MXJobCondition {
+	if len(status.Conditions) > 0 {
+		return &status.Conditions[len(status.Conditions)-1]
+	}
+	return nil
+}
+
+func hasCondition(status mxv1.MXJobStatus, condType mxv1.MXJobConditionType) bool {
+	for _, condition := range status.Conditions {
+		if condition.Type == condType && condition.Status == v1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
+func isSucceeded(status mxv1.MXJobStatus) bool {
+	return hasCondition(status, mxv1.MXJobSucceeded)
+}
+
+func isFailed(status mxv1.MXJobStatus) bool {
+	return hasCondition(status, mxv1.MXJobFailed)
+}
+
+// setCondition updates the mxjob to include the provided condition.
+// If the condition that we are about to add already exists
+// and has the same status and reason then we are not going to update.
+func setCondition(status *mxv1.MXJobStatus, condition mxv1.MXJobCondition) { + // Do nothing if MXJobStatus have failed condition + if isFailed(*status) { + return + } + + currentCond := getCondition(*status, condition.Type) + + // Do nothing if condition doesn't change + if currentCond != nil && currentCond.Status == condition.Status && currentCond.Reason == condition.Reason { + return + } + + // Do not update lastTransitionTime if the status of the condition doesn't change. + if currentCond != nil && currentCond.Status == condition.Status { + condition.LastTransitionTime = currentCond.LastTransitionTime + } + + // Append the updated condition to the + newConditions := filterOutCondition(status.Conditions, condition.Type) + status.Conditions = append(newConditions, condition) +} + +// filterOutCondition returns a new slice of mxjob conditions without conditions with the provided type. +func filterOutCondition(conditions []mxv1.MXJobCondition, condType mxv1.MXJobConditionType) []mxv1.MXJobCondition { + var newConditions []mxv1.MXJobCondition + for _, c := range conditions { + if condType == mxv1.MXJobRestarting && c.Type == mxv1.MXJobRunning { + continue + } + if condType == mxv1.MXJobRunning && c.Type == mxv1.MXJobRestarting { + continue + } + + if c.Type == condType { + continue + } + + // Set the running condition status to be false when current condition failed or succeeded + if (condType == mxv1.MXJobFailed || condType == mxv1.MXJobSucceeded) && c.Type == mxv1.MXJobRunning { + c.Status = v1.ConditionFalse + } + + newConditions = append(newConditions, c) + } + return newConditions +} diff --git a/pkg/controller.v1/mxnet/status_test.go b/pkg/controller.v1/mxnet/status_test.go new file mode 100644 index 00000000..8d80d4b0 --- /dev/null +++ b/pkg/controller.v1/mxnet/status_test.go @@ -0,0 +1,257 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package controller provides a Kubernetes controller for a MXJob resource. +package mxnet + +import ( + "testing" + + v1 "k8s.io/api/core/v1" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/mxnet-operator/pkg/common/util/v1/testutil" +) + +func TestFailed(t *testing.T) { + mxJob := testutil.NewMXJob(3, 0) + initializeMXReplicaStatuses(mxJob, mxv1.MXReplicaTypeWorker) + pod := testutil.NewBasePod("pod", mxJob, t) + pod.Status.Phase = v1.PodFailed + updateMXJobReplicaStatuses(mxJob, mxv1.MXReplicaTypeWorker, pod) + if mxJob.Status.MXReplicaStatuses[mxv1.MXReplicaTypeWorker].Failed != 1 { + t.Errorf("Failed to set the failed to 1") + } + err := updateStatusSingle(mxJob, mxv1.MXReplicaTypeWorker, 3, false, false) + if err != nil { + t.Errorf("Expected error %v to be nil", err) + } + found := false + for _, condition := range mxJob.Status.Conditions { + if condition.Type == mxv1.MXJobFailed { + found = true + } + } + if !found { + t.Errorf("Failed condition is not found") + } +} + +func TestStatus(t *testing.T) { + type testCase struct { + description string + mxJob *mxv1.MXJob + + expectedFailedScheduler int32 + expectedSucceededScheduler int32 + expectedActiveScheduler int32 + + expectedFailedWorker int32 + expectedSucceededWorker int32 + expectedActiveWorker int32 + + expectedFailedServer int32 + expectedSucceededServer int32 + expectedActiveServer int32 + + restart bool + schedulerCompleted bool + + expectedType mxv1.MXJobConditionType + } + + testCases := []testCase{ + { + description: 
"Worker is failed", + mxJob: testutil.NewMXJob(1, 0), + expectedFailedScheduler: 0, + expectedSucceededScheduler: 0, + expectedActiveScheduler: 0, + expectedFailedWorker: 1, + expectedSucceededWorker: 0, + expectedActiveWorker: 0, + expectedFailedServer: 0, + expectedSucceededServer: 0, + expectedActiveServer: 0, + restart: false, + schedulerCompleted: false, + expectedType: mxv1.MXJobFailed, + }, + { + description: "Worker is succeeded", + mxJob: testutil.NewMXJobWithScheduler(1, 0), + expectedFailedScheduler: 0, + expectedSucceededScheduler: 1, + expectedActiveScheduler: 0, + expectedFailedWorker: 0, + expectedSucceededWorker: 1, + expectedActiveWorker: 0, + expectedFailedServer: 0, + expectedSucceededServer: 0, + expectedActiveServer: 0, + restart: false, + schedulerCompleted: true, + expectedType: mxv1.MXJobSucceeded, + }, + { + description: " Worker is running", + mxJob: testutil.NewMXJobWithScheduler(1, 0), + expectedFailedScheduler: 0, + expectedSucceededScheduler: 0, + expectedActiveScheduler: 1, + expectedFailedWorker: 0, + expectedSucceededWorker: 0, + expectedActiveWorker: 1, + expectedFailedServer: 0, + expectedSucceededServer: 0, + expectedActiveServer: 0, + restart: false, + schedulerCompleted: false, + expectedType: mxv1.MXJobRunning, + }, + { + description: " 2 workers are succeeded, 2 workers are active", + mxJob: testutil.NewMXJobWithScheduler(4, 0), + expectedFailedScheduler: 0, + expectedSucceededScheduler: 0, + expectedActiveScheduler: 1, + expectedFailedWorker: 0, + expectedSucceededWorker: 2, + expectedActiveWorker: 2, + expectedFailedServer: 0, + expectedSucceededServer: 0, + expectedActiveServer: 0, + restart: false, + schedulerCompleted: false, + expectedType: mxv1.MXJobRunning, + }, + { + description: " 2 workers are running, 2 workers are failed", + mxJob: testutil.NewMXJobWithScheduler(4, 0), + expectedFailedScheduler: 0, + expectedSucceededScheduler: 0, + expectedActiveScheduler: 1, + expectedFailedWorker: 2, + expectedSucceededWorker: 
0, + expectedActiveWorker: 2, + expectedFailedServer: 0, + expectedSucceededServer: 0, + expectedActiveServer: 0, + restart: false, + schedulerCompleted: false, + expectedType: mxv1.MXJobFailed, + }, + { + description: " 2 workers are succeeded, 2 workers are failed", + mxJob: testutil.NewMXJobWithScheduler(4, 0), + expectedFailedScheduler: 0, + expectedSucceededScheduler: 0, + expectedActiveScheduler: 1, + expectedFailedWorker: 2, + expectedSucceededWorker: 2, + expectedActiveWorker: 0, + expectedFailedServer: 0, + expectedSucceededServer: 0, + expectedActiveServer: 0, + restart: false, + schedulerCompleted: false, + expectedType: mxv1.MXJobFailed, + }, + } + + for i, c := range testCases { + initializeMXReplicaStatuses(c.mxJob, mxv1.MXReplicaTypeScheduler) + initializeMXReplicaStatuses(c.mxJob, mxv1.MXReplicaTypeServer) + initializeMXReplicaStatuses(c.mxJob, mxv1.MXReplicaTypeWorker) + + setStatusForTest(c.mxJob, mxv1.MXReplicaTypeScheduler, c.expectedFailedScheduler, c.expectedSucceededScheduler, c.expectedActiveScheduler, t) + setStatusForTest(c.mxJob, mxv1.MXReplicaTypeServer, c.expectedFailedServer, c.expectedSucceededServer, c.expectedActiveServer, t) + setStatusForTest(c.mxJob, mxv1.MXReplicaTypeWorker, c.expectedFailedWorker, c.expectedSucceededWorker, c.expectedActiveWorker, t) + + if _, ok := c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeScheduler]; ok { + err := updateStatusSingle(c.mxJob, mxv1.MXReplicaTypeScheduler, 1, c.restart, c.schedulerCompleted) + if err != nil { + t.Errorf("%s: Expected error %v to be nil", c.description, err) + } + if c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker] != nil { + replicas := c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].Replicas + err := updateStatusSingle(c.mxJob, mxv1.MXReplicaTypeWorker, int(*replicas), c.restart, c.schedulerCompleted) + if err != nil { + t.Errorf("%s: Expected error %v to be nil", c.description, err) + } + } + if c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeServer] != nil { + 
replicas := c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeServer].Replicas + err := updateStatusSingle(c.mxJob, mxv1.MXReplicaTypeServer, int(*replicas), c.restart, c.schedulerCompleted) + if err != nil { + t.Errorf("%s: Expected error %v to be nil", c.description, err) + } + } + } else { + if c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker] != nil { + replicas := c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeWorker].Replicas + err := updateStatusSingle(c.mxJob, mxv1.MXReplicaTypeWorker, int(*replicas), c.restart, c.schedulerCompleted) + if err != nil { + t.Errorf("%s: Expected error %v to be nil", c.description, err) + } + } + if c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeServer] != nil { + replicas := c.mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeServer].Replicas + err := updateStatusSingle(c.mxJob, mxv1.MXReplicaTypeServer, int(*replicas), c.restart, c.schedulerCompleted) + if err != nil { + t.Errorf("%s: Expected error %v to be nil", c.description, err) + } + } + } + + // Test filterOutCondition + filterOutConditionTest(c.mxJob.Status, t) + + found := false + for _, condition := range c.mxJob.Status.Conditions { + if condition.Type == c.expectedType { + found = true + } + } + if !found { + t.Errorf("Case[%d]%s: Condition %s is not found", i, c.description, c.expectedType) + } + } +} + +func setStatusForTest(mxJob *mxv1.MXJob, typ mxv1.MXReplicaType, failed, succeeded, active int32, t *testing.T) { + pod := testutil.NewBasePod("pod", mxJob, t) + var i int32 + for i = 0; i < failed; i++ { + pod.Status.Phase = v1.PodFailed + updateMXJobReplicaStatuses(mxJob, typ, pod) + } + for i = 0; i < succeeded; i++ { + pod.Status.Phase = v1.PodSucceeded + updateMXJobReplicaStatuses(mxJob, typ, pod) + } + for i = 0; i < active; i++ { + pod.Status.Phase = v1.PodRunning + updateMXJobReplicaStatuses(mxJob, typ, pod) + } +} + +func filterOutConditionTest(status mxv1.MXJobStatus, t *testing.T) { + flag := isFailed(status) || isSucceeded(status) + for _, condition := range 
status.Conditions { + if flag && condition.Type == mxv1.MXJobRunning && condition.Status == v1.ConditionTrue { + t.Error("Error condition status when succeeded or failed") + } + } +} diff --git a/pkg/controller.v1/mxnet/util.go b/pkg/controller.v1/mxnet/util.go new file mode 100644 index 00000000..64479c63 --- /dev/null +++ b/pkg/controller.v1/mxnet/util.go @@ -0,0 +1,48 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mxnet + +import ( + "fmt" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" +) + +var ( + errPortNotFound = fmt.Errorf("failed to found the port") +) + +// GetPortFromMXJob gets the port of mxnet container. 
+func GetPortFromMXJob(mxJob *mxv1.MXJob, rtype mxv1.MXReplicaType) (int32, error) { + containers := mxJob.Spec.MXReplicaSpecs[rtype].Template.Spec.Containers + for _, container := range containers { + if container.Name == mxv1.DefaultContainerName { + ports := container.Ports + for _, port := range ports { + if port.Name == mxv1.DefaultPortName { + return port.ContainerPort, nil + } + } + } + } + return -1, errPortNotFound +} + +func ContainSchedulerSpec(mxJob *mxv1.MXJob) bool { + if _, ok := mxJob.Spec.MXReplicaSpecs[mxv1.MXReplicaTypeScheduler]; ok { + return true + } + return false +} diff --git a/pkg/controller.v1/mxnet/util_test.go b/pkg/controller.v1/mxnet/util_test.go new file mode 100644 index 00000000..c17c6a7e --- /dev/null +++ b/pkg/controller.v1/mxnet/util_test.go @@ -0,0 +1,80 @@ +// Copyright 2018 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mxnet + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + mxv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/mxnet-operator/pkg/common/util/v1/testutil" +) + +func TestGenOwnerReference(t *testing.T) { + testName := "test-mxjob" + testUID := types.UID("test-UID") + mxJob := &mxv1.MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: testName, + UID: testUID, + }, + } + + ref := testutil.GenOwnerReference(mxJob) + if ref.UID != testUID { + t.Errorf("Expected UID %s, got %s", testUID, ref.UID) + } + if ref.Name != testName { + t.Errorf("Expected Name %s, got %s", testName, ref.Name) + } + if ref.APIVersion != mxv1.SchemeGroupVersion.String() { + t.Errorf("Expected APIVersion %s, got %s", mxv1.SchemeGroupVersion.String(), ref.APIVersion) + } +} + +func TestGenLabels(t *testing.T) { + testKey := "test/key" + expctedKey := "test-key" + + labels := testutil.GenLabels(testKey) + + if labels[labelMXJobName] != expctedKey { + t.Errorf("Expected %s %s, got %s", labelMXJobName, expctedKey, labels[labelMXJobName]) + } + if labels[labelGroupName] != mxv1.GroupName { + t.Errorf("Expected %s %s, got %s", labelGroupName, mxv1.GroupName, labels[labelGroupName]) + } +} + +func TestConvertMXJobToUnstructured(t *testing.T) { + testName := "test-mxjob" + testUID := types.UID("test-UID") + mxJob := &mxv1.MXJob{ + TypeMeta: metav1.TypeMeta{ + Kind: mxv1.Kind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: testName, + UID: testUID, + }, + } + + _, err := testutil.ConvertMXJobToUnstructured(mxJob) + if err != nil { + t.Errorf("Expected error to be nil while got %v", err) + } +} diff --git a/pkg/util/k8sutil/client.go b/pkg/util/k8sutil/client.go index 05e46db2..da866f0f 100644 --- a/pkg/util/k8sutil/client.go +++ b/pkg/util/k8sutil/client.go @@ -80,3 +80,17 @@ func (c *CRDRestClient) Update(obj *metav1unstructured.Unstructured, plural stri } return err } + +func (c *CRDRestClient) 
UpdateStatus(obj *metav1unstructured.Unstructured, plural string) error { + logger := mxlogger.LoggerForUnstructured(obj, obj.GetKind()) + if plural == "" { + logger.Errorf("Could not issue update because plural not set.") + return fmt.Errorf("plural must be set") + } + r := c.restcli.Put().Resource(plural).Namespace(obj.GetNamespace()).Name(obj.GetName()).SubResource("status").Body(obj) + _, err := r.DoRaw() + if err != nil { + logger.Errorf("Could not issue update using URL: %v; error; %v", r.URL().String(), err) + } + return err +} From 91b420118a60b51071521e55dcda73a67f03e4d3 Mon Sep 17 00:00:00 2001 From: wackxu Date: Tue, 28 May 2019 15:15:45 +0800 Subject: [PATCH 2/3] add generate file --- pkg/client/clientset/versioned/clientset.go | 20 +- .../versioned/fake/clientset_generated.go | 13 +- .../clientset/versioned/fake/register.go | 2 + .../clientset/versioned/scheme/register.go | 2 + .../clientset/versioned/typed/mxnet/v1/doc.go | 18 ++ .../versioned/typed/mxnet/v1/fake/doc.go | 18 ++ .../typed/mxnet/v1/fake/fake_mxjob.go | 138 ++++++++++++++ .../typed/mxnet/v1/fake/fake_mxnet_client.go | 38 ++++ .../typed/mxnet/v1/generated_expansion.go | 19 ++ .../versioned/typed/mxnet/v1/mxjob.go | 172 ++++++++++++++++++ .../versioned/typed/mxnet/v1/mxnet_client.go | 88 +++++++++ .../informers/externalversions/generic.go | 7 +- .../externalversions/mxnet/interface.go | 8 + .../externalversions/mxnet/v1/interface.go | 43 +++++ .../externalversions/mxnet/v1/mxjob.go | 87 +++++++++ .../listers/mxnet/v1/expansion_generated.go | 25 +++ pkg/client/listers/mxnet/v1/mxjob.go | 92 ++++++++++ 17 files changed, 783 insertions(+), 7 deletions(-) create mode 100644 pkg/client/clientset/versioned/typed/mxnet/v1/doc.go create mode 100644 pkg/client/clientset/versioned/typed/mxnet/v1/fake/doc.go create mode 100644 pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxjob.go create mode 100644 pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxnet_client.go create mode 100644 
pkg/client/clientset/versioned/typed/mxnet/v1/generated_expansion.go create mode 100644 pkg/client/clientset/versioned/typed/mxnet/v1/mxjob.go create mode 100644 pkg/client/clientset/versioned/typed/mxnet/v1/mxnet_client.go create mode 100644 pkg/client/informers/externalversions/mxnet/v1/interface.go create mode 100644 pkg/client/informers/externalversions/mxnet/v1/mxjob.go create mode 100644 pkg/client/listers/mxnet/v1/expansion_generated.go create mode 100644 pkg/client/listers/mxnet/v1/mxjob.go diff --git a/pkg/client/clientset/versioned/clientset.go b/pkg/client/clientset/versioned/clientset.go index 1ec3f730..25f5be2e 100644 --- a/pkg/client/clientset/versioned/clientset.go +++ b/pkg/client/clientset/versioned/clientset.go @@ -17,6 +17,7 @@ package versioned import ( + kubeflowv1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1" kubeflowv1beta1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1beta1" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" @@ -26,8 +27,9 @@ import ( type Interface interface { Discovery() discovery.DiscoveryInterface KubeflowV1beta1() kubeflowv1beta1.KubeflowV1beta1Interface + KubeflowV1() kubeflowv1.KubeflowV1Interface // Deprecated: please explicitly pick a version if possible. - Kubeflow() kubeflowv1beta1.KubeflowV1beta1Interface + Kubeflow() kubeflowv1.KubeflowV1Interface } // Clientset contains the clients for groups. 
Each group has exactly one @@ -35,6 +37,7 @@ type Interface interface { type Clientset struct { *discovery.DiscoveryClient kubeflowV1beta1 *kubeflowv1beta1.KubeflowV1beta1Client + kubeflowV1 *kubeflowv1.KubeflowV1Client } // KubeflowV1beta1 retrieves the KubeflowV1beta1Client @@ -42,10 +45,15 @@ func (c *Clientset) KubeflowV1beta1() kubeflowv1beta1.KubeflowV1beta1Interface { return c.kubeflowV1beta1 } +// KubeflowV1 retrieves the KubeflowV1Client +func (c *Clientset) KubeflowV1() kubeflowv1.KubeflowV1Interface { + return c.kubeflowV1 +} + // Deprecated: Kubeflow retrieves the default version of KubeflowClient. // Please explicitly pick a version. -func (c *Clientset) Kubeflow() kubeflowv1beta1.KubeflowV1beta1Interface { - return c.kubeflowV1beta1 +func (c *Clientset) Kubeflow() kubeflowv1.KubeflowV1Interface { + return c.kubeflowV1 } // Discovery retrieves the DiscoveryClient @@ -68,6 +76,10 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { if err != nil { return nil, err } + cs.kubeflowV1, err = kubeflowv1.NewForConfig(&configShallowCopy) + if err != nil { + return nil, err + } cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfig(&configShallowCopy) if err != nil { @@ -81,6 +93,7 @@ func NewForConfig(c *rest.Config) (*Clientset, error) { func NewForConfigOrDie(c *rest.Config) *Clientset { var cs Clientset cs.kubeflowV1beta1 = kubeflowv1beta1.NewForConfigOrDie(c) + cs.kubeflowV1 = kubeflowv1.NewForConfigOrDie(c) cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) return &cs @@ -90,6 +103,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { func New(c rest.Interface) *Clientset { var cs Clientset cs.kubeflowV1beta1 = kubeflowv1beta1.New(c) + cs.kubeflowV1 = kubeflowv1.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs diff --git a/pkg/client/clientset/versioned/fake/clientset_generated.go b/pkg/client/clientset/versioned/fake/clientset_generated.go index 202604a4..fe3450e1 100644 --- 
a/pkg/client/clientset/versioned/fake/clientset_generated.go +++ b/pkg/client/clientset/versioned/fake/clientset_generated.go @@ -18,6 +18,8 @@ package fake import ( clientset "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned" + kubeflowv1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1" + fakekubeflowv1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1/fake" kubeflowv1beta1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1beta1" fakekubeflowv1beta1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1beta1/fake" "k8s.io/apimachinery/pkg/runtime" @@ -74,7 +76,12 @@ func (c *Clientset) KubeflowV1beta1() kubeflowv1beta1.KubeflowV1beta1Interface { return &fakekubeflowv1beta1.FakeKubeflowV1beta1{Fake: &c.Fake} } -// Kubeflow retrieves the KubeflowV1beta1Client -func (c *Clientset) Kubeflow() kubeflowv1beta1.KubeflowV1beta1Interface { - return &fakekubeflowv1beta1.FakeKubeflowV1beta1{Fake: &c.Fake} +// KubeflowV1 retrieves the KubeflowV1Client +func (c *Clientset) KubeflowV1() kubeflowv1.KubeflowV1Interface { + return &fakekubeflowv1.FakeKubeflowV1{Fake: &c.Fake} +} + +// Kubeflow retrieves the KubeflowV1Client +func (c *Clientset) Kubeflow() kubeflowv1.KubeflowV1Interface { + return &fakekubeflowv1.FakeKubeflowV1{Fake: &c.Fake} } diff --git a/pkg/client/clientset/versioned/fake/register.go b/pkg/client/clientset/versioned/fake/register.go index 47339eb1..2938fb81 100644 --- a/pkg/client/clientset/versioned/fake/register.go +++ b/pkg/client/clientset/versioned/fake/register.go @@ -17,6 +17,7 @@ package fake import ( + kubeflowv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" kubeflowv1beta1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1beta1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -49,4 +50,5 @@ func init() { // correctly. 
func AddToScheme(scheme *runtime.Scheme) { kubeflowv1beta1.AddToScheme(scheme) + kubeflowv1.AddToScheme(scheme) } diff --git a/pkg/client/clientset/versioned/scheme/register.go b/pkg/client/clientset/versioned/scheme/register.go index 8e15377c..c08bb5d5 100644 --- a/pkg/client/clientset/versioned/scheme/register.go +++ b/pkg/client/clientset/versioned/scheme/register.go @@ -17,6 +17,7 @@ package scheme import ( + kubeflowv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" kubeflowv1beta1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1beta1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -49,4 +50,5 @@ func init() { // correctly. func AddToScheme(scheme *runtime.Scheme) { kubeflowv1beta1.AddToScheme(scheme) + kubeflowv1.AddToScheme(scheme) } diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/doc.go b/pkg/client/clientset/versioned/typed/mxnet/v1/doc.go new file mode 100644 index 00000000..5c8101df --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/doc.go @@ -0,0 +1,18 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated typed clients. 
+package v1 diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/fake/doc.go b/pkg/client/clientset/versioned/typed/mxnet/v1/fake/doc.go new file mode 100644 index 00000000..ce2fa8a9 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/fake/doc.go @@ -0,0 +1,18 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +// Package fake has the automatically generated clients. +package fake diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxjob.go b/pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxjob.go new file mode 100644 index 00000000..55b76cb4 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxjob.go @@ -0,0 +1,138 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. 
+ +package fake + +import ( + mxnetv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + labels "k8s.io/apimachinery/pkg/labels" + schema "k8s.io/apimachinery/pkg/runtime/schema" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + testing "k8s.io/client-go/testing" +) + +// FakeMXJobs implements MXJobInterface +type FakeMXJobs struct { + Fake *FakeKubeflowV1 + ns string +} + +var mxjobsResource = schema.GroupVersionResource{Group: "kubeflow.org", Version: "v1", Resource: "mxjobs"} + +var mxjobsKind = schema.GroupVersionKind{Group: "kubeflow.org", Version: "v1", Kind: "MXJob"} + +// Get takes name of the mXJob, and returns the corresponding mXJob object, and an error if there is any. +func (c *FakeMXJobs) Get(name string, options v1.GetOptions) (result *mxnetv1.MXJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewGetAction(mxjobsResource, c.ns, name), &mxnetv1.MXJob{}) + + if obj == nil { + return nil, err + } + return obj.(*mxnetv1.MXJob), err +} + +// List takes label and field selectors, and returns the list of MXJobs that match those selectors. +func (c *FakeMXJobs) List(opts v1.ListOptions) (result *mxnetv1.MXJobList, err error) { + obj, err := c.Fake. + Invokes(testing.NewListAction(mxjobsResource, mxjobsKind, c.ns, opts), &mxnetv1.MXJobList{}) + + if obj == nil { + return nil, err + } + + label, _, _ := testing.ExtractFromListOptions(opts) + if label == nil { + label = labels.Everything() + } + list := &mxnetv1.MXJobList{ListMeta: obj.(*mxnetv1.MXJobList).ListMeta} + for _, item := range obj.(*mxnetv1.MXJobList).Items { + if label.Matches(labels.Set(item.Labels)) { + list.Items = append(list.Items, item) + } + } + return list, err +} + +// Watch returns a watch.Interface that watches the requested mXJobs. +func (c *FakeMXJobs) Watch(opts v1.ListOptions) (watch.Interface, error) { + return c.Fake. 
+ InvokesWatch(testing.NewWatchAction(mxjobsResource, c.ns, opts)) + +} + +// Create takes the representation of a mXJob and creates it. Returns the server's representation of the mXJob, and an error, if there is any. +func (c *FakeMXJobs) Create(mXJob *mxnetv1.MXJob) (result *mxnetv1.MXJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewCreateAction(mxjobsResource, c.ns, mXJob), &mxnetv1.MXJob{}) + + if obj == nil { + return nil, err + } + return obj.(*mxnetv1.MXJob), err +} + +// Update takes the representation of a mXJob and updates it. Returns the server's representation of the mXJob, and an error, if there is any. +func (c *FakeMXJobs) Update(mXJob *mxnetv1.MXJob) (result *mxnetv1.MXJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateAction(mxjobsResource, c.ns, mXJob), &mxnetv1.MXJob{}) + + if obj == nil { + return nil, err + } + return obj.(*mxnetv1.MXJob), err +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). +func (c *FakeMXJobs) UpdateStatus(mXJob *mxnetv1.MXJob) (*mxnetv1.MXJob, error) { + obj, err := c.Fake. + Invokes(testing.NewUpdateSubresourceAction(mxjobsResource, "status", c.ns, mXJob), &mxnetv1.MXJob{}) + + if obj == nil { + return nil, err + } + return obj.(*mxnetv1.MXJob), err +} + +// Delete takes name of the mXJob and deletes it. Returns an error if one occurs. +func (c *FakeMXJobs) Delete(name string, options *v1.DeleteOptions) error { + _, err := c.Fake. + Invokes(testing.NewDeleteAction(mxjobsResource, c.ns, name), &mxnetv1.MXJob{}) + + return err +} + +// DeleteCollection deletes a collection of objects. 
+func (c *FakeMXJobs) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { + action := testing.NewDeleteCollectionAction(mxjobsResource, c.ns, listOptions) + + _, err := c.Fake.Invokes(action, &mxnetv1.MXJobList{}) + return err +} + +// Patch applies the patch and returns the patched mXJob. +func (c *FakeMXJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *mxnetv1.MXJob, err error) { + obj, err := c.Fake. + Invokes(testing.NewPatchSubresourceAction(mxjobsResource, c.ns, name, data, subresources...), &mxnetv1.MXJob{}) + + if obj == nil { + return nil, err + } + return obj.(*mxnetv1.MXJob), err +} diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxnet_client.go b/pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxnet_client.go new file mode 100644 index 00000000..337ecdfa --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/fake/fake_mxnet_client.go @@ -0,0 +1,38 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. 
+ +package fake + +import ( + v1 "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/typed/mxnet/v1" + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" +) + +type FakeKubeflowV1 struct { + *testing.Fake +} + +func (c *FakeKubeflowV1) MXJobs(namespace string) v1.MXJobInterface { + return &FakeMXJobs{c, namespace} +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *FakeKubeflowV1) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/generated_expansion.go b/pkg/client/clientset/versioned/typed/mxnet/v1/generated_expansion.go new file mode 100644 index 00000000..e1fc0a68 --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/generated_expansion.go @@ -0,0 +1,19 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. 
+ +package v1 + +type MXJobExpansion interface{} diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/mxjob.go b/pkg/client/clientset/versioned/typed/mxnet/v1/mxjob.go new file mode 100644 index 00000000..4149c61a --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/mxjob.go @@ -0,0 +1,172 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package v1 + +import ( + v1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + scheme "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/scheme" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + rest "k8s.io/client-go/rest" +) + +// MXJobsGetter has a method to return a MXJobInterface. +// A group's client should implement this interface. +type MXJobsGetter interface { + MXJobs(namespace string) MXJobInterface +} + +// MXJobInterface has methods to work with MXJob resources. 
+type MXJobInterface interface { + Create(*v1.MXJob) (*v1.MXJob, error) + Update(*v1.MXJob) (*v1.MXJob, error) + UpdateStatus(*v1.MXJob) (*v1.MXJob, error) + Delete(name string, options *metav1.DeleteOptions) error + DeleteCollection(options *metav1.DeleteOptions, listOptions metav1.ListOptions) error + Get(name string, options metav1.GetOptions) (*v1.MXJob, error) + List(opts metav1.ListOptions) (*v1.MXJobList, error) + Watch(opts metav1.ListOptions) (watch.Interface, error) + Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1.MXJob, err error) + MXJobExpansion +} + +// mXJobs implements MXJobInterface +type mXJobs struct { + client rest.Interface + ns string +} + +// newMXJobs returns a MXJobs +func newMXJobs(c *KubeflowV1Client, namespace string) *mXJobs { + return &mXJobs{ + client: c.RESTClient(), + ns: namespace, + } +} + +// Get takes name of the mXJob, and returns the corresponding mXJob object, and an error if there is any. +func (c *mXJobs) Get(name string, options metav1.GetOptions) (result *v1.MXJob, err error) { + result = &v1.MXJob{} + err = c.client.Get(). + Namespace(c.ns). + Resource("mxjobs"). + Name(name). + VersionedParams(&options, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// List takes label and field selectors, and returns the list of MXJobs that match those selectors. +func (c *mXJobs) List(opts metav1.ListOptions) (result *v1.MXJobList, err error) { + result = &v1.MXJobList{} + err = c.client.Get(). + Namespace(c.ns). + Resource("mxjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Do(). + Into(result) + return +} + +// Watch returns a watch.Interface that watches the requested mXJobs. +func (c *mXJobs) Watch(opts metav1.ListOptions) (watch.Interface, error) { + opts.Watch = true + return c.client.Get(). + Namespace(c.ns). + Resource("mxjobs"). + VersionedParams(&opts, scheme.ParameterCodec). + Watch() +} + +// Create takes the representation of a mXJob and creates it. 
Returns the server's representation of the mXJob, and an error, if there is any. +func (c *mXJobs) Create(mXJob *v1.MXJob) (result *v1.MXJob, err error) { + result = &v1.MXJob{} + err = c.client.Post(). + Namespace(c.ns). + Resource("mxjobs"). + Body(mXJob). + Do(). + Into(result) + return +} + +// Update takes the representation of a mXJob and updates it. Returns the server's representation of the mXJob, and an error, if there is any. +func (c *mXJobs) Update(mXJob *v1.MXJob) (result *v1.MXJob, err error) { + result = &v1.MXJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("mxjobs"). + Name(mXJob.Name). + Body(mXJob). + Do(). + Into(result) + return +} + +// UpdateStatus was generated because the type contains a Status member. +// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). + +func (c *mXJobs) UpdateStatus(mXJob *v1.MXJob) (result *v1.MXJob, err error) { + result = &v1.MXJob{} + err = c.client.Put(). + Namespace(c.ns). + Resource("mxjobs"). + Name(mXJob.Name). + SubResource("status"). + Body(mXJob). + Do(). + Into(result) + return +} + +// Delete takes name of the mXJob and deletes it. Returns an error if one occurs. +func (c *mXJobs) Delete(name string, options *metav1.DeleteOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("mxjobs"). + Name(name). + Body(options). + Do(). + Error() +} + +// DeleteCollection deletes a collection of objects. +func (c *mXJobs) DeleteCollection(options *metav1.DeleteOptions, listOptions metav1.ListOptions) error { + return c.client.Delete(). + Namespace(c.ns). + Resource("mxjobs"). + VersionedParams(&listOptions, scheme.ParameterCodec). + Body(options). + Do(). + Error() +} + +// Patch applies the patch and returns the patched mXJob. +func (c *mXJobs) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1.MXJob, err error) { + result = &v1.MXJob{} + err = c.client.Patch(pt). + Namespace(c.ns). + Resource("mxjobs"). 
+ SubResource(subresources...). + Name(name). + Body(data). + Do(). + Into(result) + return +} diff --git a/pkg/client/clientset/versioned/typed/mxnet/v1/mxnet_client.go b/pkg/client/clientset/versioned/typed/mxnet/v1/mxnet_client.go new file mode 100644 index 00000000..8e47590a --- /dev/null +++ b/pkg/client/clientset/versioned/typed/mxnet/v1/mxnet_client.go @@ -0,0 +1,88 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by client-gen. DO NOT EDIT. + +package v1 + +import ( + v1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned/scheme" + serializer "k8s.io/apimachinery/pkg/runtime/serializer" + rest "k8s.io/client-go/rest" +) + +type KubeflowV1Interface interface { + RESTClient() rest.Interface + MXJobsGetter +} + +// KubeflowV1Client is used to interact with features provided by the kubeflow.org group. +type KubeflowV1Client struct { + restClient rest.Interface +} + +func (c *KubeflowV1Client) MXJobs(namespace string) MXJobInterface { + return newMXJobs(c, namespace) +} + +// NewForConfig creates a new KubeflowV1Client for the given config. 
+func NewForConfig(c *rest.Config) (*KubeflowV1Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientFor(&config) + if err != nil { + return nil, err + } + return &KubeflowV1Client{client}, nil +} + +// NewForConfigOrDie creates a new KubeflowV1Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *KubeflowV1Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new KubeflowV1Client for the given RESTClient. +func New(c rest.Interface) *KubeflowV1Client { + return &KubeflowV1Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := v1.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = serializer.DirectCodecFactory{CodecFactory: scheme.Codecs} + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. 
+func (c *KubeflowV1Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/pkg/client/informers/externalversions/generic.go b/pkg/client/informers/externalversions/generic.go index afb538e1..cace5438 100644 --- a/pkg/client/informers/externalversions/generic.go +++ b/pkg/client/informers/externalversions/generic.go @@ -19,6 +19,7 @@ package externalversions import ( "fmt" + v1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" v1beta1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1beta1" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" @@ -50,7 +51,11 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=kubeflow.org, Version=v1beta1 + // Group=kubeflow.org, Version=v1 + case v1.SchemeGroupVersion.WithResource("mxjobs"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1().MXJobs().Informer()}, nil + + // Group=kubeflow.org, Version=v1beta1 case v1beta1.SchemeGroupVersion.WithResource("mxjobs"): return &genericInformer{resource: resource.GroupResource(), informer: f.Kubeflow().V1beta1().MXJobs().Informer()}, nil diff --git a/pkg/client/informers/externalversions/mxnet/interface.go b/pkg/client/informers/externalversions/mxnet/interface.go index 57ffc7bb..7a60fa9d 100644 --- a/pkg/client/informers/externalversions/mxnet/interface.go +++ b/pkg/client/informers/externalversions/mxnet/interface.go @@ -18,6 +18,7 @@ package kubeflow import ( internalinterfaces "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/internalinterfaces" + v1 "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/mxnet/v1" v1beta1 "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/mxnet/v1beta1" 
) @@ -25,6 +26,8 @@ import ( type Interface interface { // V1beta1 provides access to shared informers for resources in V1beta1. V1beta1() v1beta1.Interface + // V1 provides access to shared informers for resources in V1. + V1() v1.Interface } type group struct { @@ -42,3 +45,8 @@ func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakList func (g *group) V1beta1() v1beta1.Interface { return v1beta1.New(g.factory, g.namespace, g.tweakListOptions) } + +// V1 returns a new v1.Interface. +func (g *group) V1() v1.Interface { + return v1.New(g.factory, g.namespace, g.tweakListOptions) +} diff --git a/pkg/client/informers/externalversions/mxnet/v1/interface.go b/pkg/client/informers/externalversions/mxnet/v1/interface.go new file mode 100644 index 00000000..f4dd4c76 --- /dev/null +++ b/pkg/client/informers/externalversions/mxnet/v1/interface.go @@ -0,0 +1,43 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by informer-gen. DO NOT EDIT. + +package v1 + +import ( + internalinterfaces "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // MXJobs returns a MXJobInformer. 
+ MXJobs() MXJobInformer +} + +type version struct { + factory internalinterfaces.SharedInformerFactory + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { + return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} +} + +// MXJobs returns a MXJobInformer. +func (v *version) MXJobs() MXJobInformer { + return &mXJobInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} +} diff --git a/pkg/client/informers/externalversions/mxnet/v1/mxjob.go b/pkg/client/informers/externalversions/mxnet/v1/mxjob.go new file mode 100644 index 00000000..c394a811 --- /dev/null +++ b/pkg/client/informers/externalversions/mxnet/v1/mxjob.go @@ -0,0 +1,87 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by informer-gen. DO NOT EDIT. 
+ +package v1 + +import ( + time "time" + + mxnetv1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + versioned "github.com/kubeflow/mxnet-operator/pkg/client/clientset/versioned" + internalinterfaces "github.com/kubeflow/mxnet-operator/pkg/client/informers/externalversions/internalinterfaces" + v1 "github.com/kubeflow/mxnet-operator/pkg/client/listers/mxnet/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" +) + +// MXJobInformer provides access to a shared informer and lister for +// MXJobs. +type MXJobInformer interface { + Informer() cache.SharedIndexInformer + Lister() v1.MXJobLister +} + +type mXJobInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc + namespace string +} + +// NewMXJobInformer constructs a new informer for MXJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewMXJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredMXJobInformer(client, namespace, resyncPeriod, indexers, nil) +} + +// NewFilteredMXJobInformer constructs a new informer for MXJob type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewFilteredMXJobInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.KubeflowV1().MXJobs(namespace).List(options) + }, + WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.KubeflowV1().MXJobs(namespace).Watch(options) + }, + }, + &mxnetv1.MXJob{}, + resyncPeriod, + indexers, + ) +} + +func (f *mXJobInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredMXJobInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *mXJobInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&mxnetv1.MXJob{}, f.defaultInformer) +} + +func (f *mXJobInformer) Lister() v1.MXJobLister { + return v1.NewMXJobLister(f.Informer().GetIndexer()) +} diff --git a/pkg/client/listers/mxnet/v1/expansion_generated.go b/pkg/client/listers/mxnet/v1/expansion_generated.go new file mode 100644 index 00000000..eb7a4b9a --- /dev/null +++ b/pkg/client/listers/mxnet/v1/expansion_generated.go @@ -0,0 +1,25 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by lister-gen. DO NOT EDIT. + +package v1 + +// MXJobListerExpansion allows custom methods to be added to +// MXJobLister. +type MXJobListerExpansion interface{} + +// MXJobNamespaceListerExpansion allows custom methods to be added to +// MXJobNamespaceLister. +type MXJobNamespaceListerExpansion interface{} diff --git a/pkg/client/listers/mxnet/v1/mxjob.go b/pkg/client/listers/mxnet/v1/mxjob.go new file mode 100644 index 00000000..8397f376 --- /dev/null +++ b/pkg/client/listers/mxnet/v1/mxjob.go @@ -0,0 +1,92 @@ +// Copyright 2019 The Kubeflow Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by lister-gen. DO NOT EDIT. + +package v1 + +import ( + v1 "github.com/kubeflow/mxnet-operator/pkg/apis/mxnet/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/tools/cache" +) + +// MXJobLister helps list MXJobs. +type MXJobLister interface { + // List lists all MXJobs in the indexer. 
+ List(selector labels.Selector) (ret []*v1.MXJob, err error) + // MXJobs returns an object that can list and get MXJobs. + MXJobs(namespace string) MXJobNamespaceLister + MXJobListerExpansion +} + +// mXJobLister implements the MXJobLister interface. +type mXJobLister struct { + indexer cache.Indexer +} + +// NewMXJobLister returns a new MXJobLister. +func NewMXJobLister(indexer cache.Indexer) MXJobLister { + return &mXJobLister{indexer: indexer} +} + +// List lists all MXJobs in the indexer. +func (s *mXJobLister) List(selector labels.Selector) (ret []*v1.MXJob, err error) { + err = cache.ListAll(s.indexer, selector, func(m interface{}) { + ret = append(ret, m.(*v1.MXJob)) + }) + return ret, err +} + +// MXJobs returns an object that can list and get MXJobs. +func (s *mXJobLister) MXJobs(namespace string) MXJobNamespaceLister { + return mXJobNamespaceLister{indexer: s.indexer, namespace: namespace} +} + +// MXJobNamespaceLister helps list and get MXJobs. +type MXJobNamespaceLister interface { + // List lists all MXJobs in the indexer for a given namespace. + List(selector labels.Selector) (ret []*v1.MXJob, err error) + // Get retrieves the MXJob from the indexer for a given namespace and name. + Get(name string) (*v1.MXJob, error) + MXJobNamespaceListerExpansion +} + +// mXJobNamespaceLister implements the MXJobNamespaceLister +// interface. +type mXJobNamespaceLister struct { + indexer cache.Indexer + namespace string +} + +// List lists all MXJobs in the indexer for a given namespace. +func (s mXJobNamespaceLister) List(selector labels.Selector) (ret []*v1.MXJob, err error) { + err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { + ret = append(ret, m.(*v1.MXJob)) + }) + return ret, err +} + +// Get retrieves the MXJob from the indexer for a given namespace and name. 
+func (s mXJobNamespaceLister) Get(name string) (*v1.MXJob, error) { + obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) + if err != nil { + return nil, err + } + if !exists { + return nil, errors.NewNotFound(v1.Resource("mxjob"), name) + } + return obj.(*v1.MXJob), nil +} From c4c4042ea423194b765f561967e8d3364eaf4aa4 Mon Sep 17 00:00:00 2001 From: xushiwei 00425595 Date: Tue, 28 May 2019 16:32:41 +0800 Subject: [PATCH 3/3] Remove usage of crd client for checking CRD existence --- cmd/mxnet-operator.v1/app/server.go | 28 +++++++++++---------- pkg/common/util/v1/unstructured/informer.go | 4 +-- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cmd/mxnet-operator.v1/app/server.go b/cmd/mxnet-operator.v1/app/server.go index e27d76e8..083ee777 100644 --- a/cmd/mxnet-operator.v1/app/server.go +++ b/cmd/mxnet-operator.v1/app/server.go @@ -22,7 +22,7 @@ import ( log "github.com/sirupsen/logrus" "k8s.io/api/core/v1" - crdclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeinformers "k8s.io/client-go/informers" kubeclientset "k8s.io/client-go/kubernetes" @@ -102,6 +102,11 @@ func Run(opt *options.ServerOption) error { return err } + if !checkCRDExists(mxJobClientSet, opt.Namespace) { + log.Info("CRD doesn't exist. Exiting") + os.Exit(1) + } + // Create informer factory. 
kubeInformerFactory := kubeinformers.NewFilteredSharedInformerFactory(kubeClientSet, resyncPeriod, opt.Namespace, nil) mxJobInformerFactory := mxjobinformers.NewSharedInformerFactory(mxJobClientSet, resyncPeriod) @@ -167,15 +172,6 @@ func Run(opt *options.ServerOption) error { } func createClientSets(config *restclientset.Config) (kubeclientset.Interface, kubeclientset.Interface, mxjobclientset.Interface, kubebatchclient.Interface, error) { - - crdClient, err := crdclient.NewForConfig(config) - - if err != nil { - return nil, nil, nil, nil, err - } - - checkCRDExists(crdClient, mxnetv1.MXCRD) - kubeClientSet, err := kubeclientset.NewForConfig(restclientset.AddUserAgent(config, "mxnet-operator")) if err != nil { return nil, nil, nil, nil, err @@ -199,10 +195,16 @@ func createClientSets(config *restclientset.Config) (kubeclientset.Interface, ku return kubeClientSet, leaderElectionClientSet, mxJobClientSet, kubeBatchClientSet, nil } -func checkCRDExists(clientset crdclient.Interface, crdName string) { - _, err := clientset.ApiextensionsV1beta1().CustomResourceDefinitions().Get(crdName, metav1.GetOptions{}) +func checkCRDExists(clientset mxjobclientset.Interface, namespace string) bool { + _, err := clientset.KubeflowV1().MXJobs(namespace).List(metav1.ListOptions{}) + if err != nil { log.Error(err) - os.Exit(1) + if _, ok := err.(*errors.StatusError); ok { + if errors.IsNotFound(err) { + return false + } + } } + return true } diff --git a/pkg/common/util/v1/unstructured/informer.go b/pkg/common/util/v1/unstructured/informer.go index 6dc4ccd4..6d31c57e 100644 --- a/pkg/common/util/v1/unstructured/informer.go +++ b/pkg/common/util/v1/unstructured/informer.go @@ -49,10 +49,10 @@ func newFilteredUnstructuredInformer(resource schema.GroupVersionResource, clien return cache.NewSharedIndexInformer( &cache.ListWatch{ ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { - return client.Resource(resource).List(options) + return 
client.Resource(resource).Namespace(namespace).List(options) }, WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { - return client.Resource(resource).Watch(options) + return client.Resource(resource).Namespace(namespace).Watch(options) }, }, &unstructured.Unstructured{},