Skip to content

Commit

Permalink
Remove Kubeflow Training dependencies from Katib (#1599)
Browse files Browse the repository at this point in the history
* Remove Kubeflow Training dependencies from Katib

* Add code-generator to go.mod
  • Loading branch information
andreyvelich authored Aug 1, 2021
1 parent 44875b8 commit ddf064a
Show file tree
Hide file tree
Showing 14 changed files with 271 additions and 838 deletions.
33 changes: 3 additions & 30 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@ go 1.15

require (
github.com/DATA-DOG/go-sqlmock v1.5.0
github.com/StackExchange/wmi v0.0.0-20210224194228-fe8f1750fd46 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/awalterschulze/gographviz v2.0.3+incompatible
github.com/c-bata/goptuna v0.8.0
github.com/ghodss/yaml v1.0.1-0.20190212211648-25d852aebe32
github.com/go-ole/go-ole v1.2.5 // indirect
github.com/go-openapi/spec v0.19.3
github.com/go-sql-driver/mysql v1.5.0
github.com/golang/mock v1.4.4
Expand All @@ -17,8 +16,6 @@ require (
github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20210224013640-6928f6d356ab
github.com/grpc-ecosystem/go-grpc-middleware v1.2.0
github.com/hpcloud/tail v1.0.1-0.20180514194441-a1dbeea552b7
github.com/kubeflow/common v0.3.3-0.20210201092343-3fbe0ce98269
github.com/kubeflow/tf-operator v1.0.1-rc.5.0.20210224195440-6d9ee3264d9f
github.com/mattbaird/jsonpatch v0.0.0-20171005235357-81af80346b1a
github.com/onsi/gomega v1.10.3
github.com/prometheus/client_golang v1.9.0
Expand All @@ -30,34 +27,10 @@ require (
gopkg.in/fsnotify/fsnotify.v1 v1.4.7 // indirect
k8s.io/api v0.20.4
k8s.io/apimachinery v0.20.4
k8s.io/client-go v11.0.0+incompatible
k8s.io/client-go v0.20.4
k8s.io/klog v1.0.0
k8s.io/kube-openapi v0.0.0-20210216185858-15cd8face8d6
k8s.io/utils v0.0.0-20210111153108-fddb29f9d009
sigs.k8s.io/controller-runtime v0.8.2
)

replace (
k8s.io/api => k8s.io/api v0.20.4
k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.4
k8s.io/apimachinery => k8s.io/apimachinery v0.20.4
k8s.io/apiserver => k8s.io/apiserver v0.20.4
k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.4
k8s.io/client-go => k8s.io/client-go v0.20.4
k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.4
k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.4
k8s.io/code-generator => k8s.io/code-generator v0.20.4
k8s.io/component-base => k8s.io/component-base v0.20.4
k8s.io/cri-api => k8s.io/cri-api v0.20.4
k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.4
k8s.io/klog => k8s.io/klog v1.0.0
k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.4
k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.4
k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.4
k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.4
k8s.io/kubectl => k8s.io/kubectl v0.20.4
k8s.io/kubelet => k8s.io/kubelet v0.20.4
k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.4
k8s.io/metrics => k8s.io/metrics v0.20.4
k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.4
)
replace k8s.io/code-generator => k8s.io/code-generator v0.20.4
285 changes: 32 additions & 253 deletions go.sum

Large diffs are not rendered by default.

27 changes: 0 additions & 27 deletions pkg/apis/controller/addtoscheme_pytorchjob_v1.go

This file was deleted.

26 changes: 0 additions & 26 deletions pkg/apis/controller/addtoscheme_tfjob_v1.go

This file was deleted.

87 changes: 36 additions & 51 deletions pkg/controller.v1beta1/experiment/experiment_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@ import (
"time"

"github.com/golang/mock/gomock"
commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1"
"github.com/onsi/gomega"
"github.com/prometheus/client_golang/prometheus"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
Expand All @@ -54,7 +52,7 @@ const (
experimentName = "test-experiment"
trialName = "test-trial"
namespace = "default"
primaryContainer = "tensorflow"
primaryContainer = "training-container"

timeout = time.Second * 40
)
Expand Down Expand Up @@ -146,9 +144,9 @@ func TestReconcile(t *testing.T) {
g.Expect(mgr.Start(context.TODO())).NotTo(gomega.HaveOccurred())
}()

returnedTFJob := newFakeTFJob()
returnedBatchJob := newFakeBatchJob()

returnedUnstructured, err := util.ConvertObjectToUnstructured(returnedTFJob)
returnedUnstructured, err := util.ConvertObjectToUnstructured(returnedBatchJob)
if err != nil {
t.Errorf("ConvertObjectToUnstructured failed: %v", err)
}
Expand Down Expand Up @@ -378,33 +376,27 @@ func newFakeInstance() *experimentsv1beta1.Experiment {
var parallelCount int32 = 2
var goal float64 = 99.9

trialTemplateJob := &tfv1.TFJob{
trialTemplateJob := &batchv1.Job{
TypeMeta: metav1.TypeMeta{
APIVersion: "kubeflow.org/v1",
Kind: "TFJob",
APIVersion: "batch/v1",
Kind: "Job",
},
Spec: tfv1.TFJobSpec{
TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
tfv1.TFReplicaTypePS: {
Replicas: func() *int32 { i := int32(1); return &i }(),
RestartPolicy: commonv1.RestartPolicyOnFailure,
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: primaryContainer,
Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
Command: []string{
"python",
"/var/tf_mnist/mnist_with_summaries.py",
"--log_dir=/train/metrics",
"--lr=${trialParameters.learningRate}",
"--num-layers=${trialParameters.numberLayers}",
},
},
Spec: batchv1.JobSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: primaryContainer,
Image: "docker.io/kubeflowkatib/mxnet-mnist",
Command: []string{
"python3",
"/opt/mxnet-mnist/mnist.py",
"--lr=${trialParameters.learningRate}",
"--num-layers=${trialParameters.numberLayers}",
},
},
},
RestartPolicy: corev1.RestartPolicyNever,
},
},
},
Expand Down Expand Up @@ -514,35 +506,28 @@ func newFakeSuggestion() *suggestionsv1beta1.Suggestion {
}
}

func newFakeTFJob() *tfv1.TFJob {
return &tfv1.TFJob{
func newFakeBatchJob() *batchv1.Job {
return &batchv1.Job{
TypeMeta: metav1.TypeMeta{
APIVersion: "kubeflow.org/v1",
Kind: "TFJob",
APIVersion: "batch/v1",
Kind: "Job",
},
ObjectMeta: metav1.ObjectMeta{
Name: "trial-name",
Namespace: "trial-namespace",
},
Spec: tfv1.TFJobSpec{
TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
tfv1.TFReplicaTypePS: {
Replicas: func() *int32 { i := int32(1); return &i }(),
RestartPolicy: commonv1.RestartPolicyOnFailure,
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: primaryContainer,
Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
Command: []string{
"python",
"/var/tf_mnist/mnist_with_summaries.py",
"--log_dir=/train/metrics",
"--lr=0.01",
"--num-layers=5",
},
},
Spec: batchv1.JobSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: primaryContainer,
Image: "docker.io/kubeflowkatib/mxnet-mnist",
Command: []string{
"python3",
"/opt/mxnet-mnist/mnist.py",
"--lr=0.01",
"--num-layers=5",
},
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ limitations under the License.
package suggestion

import (
"context"
"encoding/json"
"sync"
"testing"
"time"

"github.com/golang/mock/gomock"
"github.com/onsi/gomega"
"golang.org/x/net/context"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
Expand Down
6 changes: 4 additions & 2 deletions pkg/controller.v1beta1/trial/trial_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ const (
var (
log = logf.Log.WithName(ControllerName)
// errMetricsNotReported is the error used when the Trial job has succeeded but metrics are not reported yet
errMetricsNotReported = fmt.Errorf("Metrics are not reported yet")
errMetricsNotReported = fmt.Errorf("metrics are not reported yet")
)

// Add creates a new Trial Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
Expand Down Expand Up @@ -230,7 +230,9 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error {

// Job already exists.
// If Trial is EarlyStopped we need to verify/update observation logs.
if deployedJob != nil && (!instance.IsCompleted() || instance.IsEarlyStopped()) {
// TODO (andreyvelich): We could include the "MetricsUnavailable" condition in "Complete".
// In that case, the Trial's job would be deleted even if metrics are not available.
if deployedJob != nil && ((!instance.IsCompleted() && !instance.IsMetricsUnavailable()) || instance.IsEarlyStopped()) {
jobStatus, err := trialutil.GetDeployedJobStatus(instance, deployedJob)
if err != nil {
logger.Error(err, "GetDeployedJobStatus error")
Expand Down
Loading

0 comments on commit ddf064a

Please sign in to comment.