Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to AWS CI/CD #1356

Merged
merged 51 commits into from
Oct 26, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
a1bbaa1
Add changes for AWS test infra
andreyvelich Oct 16, 2020
faa7f4c
Remove comment from resume e2e
andreyvelich Oct 16, 2020
1058b66
Change worker image
andreyvelich Oct 16, 2020
0e38b84
Refactor e2e test script
andreyvelich Oct 16, 2020
e7b2237
Replace create and delete cluster with testing scripts
andreyvelich Oct 16, 2020
aa7b47b
Fix cluster name
andreyvelich Oct 16, 2020
8c9cb49
Fix delete cluster
andreyvelich Oct 16, 2020
5257b90
Test without folder for GOPATH
andreyvelich Oct 17, 2020
28eaab8
Add AWS creds to env
andreyvelich Oct 19, 2020
e8444c3
Comment creds
andreyvelich Oct 19, 2020
55701eb
Delete v1alpha3 workflow from prow
andreyvelich Oct 19, 2020
b595b43
Add AWS cred
andreyvelich Oct 19, 2020
6b13d51
Get other build for all images
andreyvelich Oct 19, 2020
b78c5b1
Attach volume to create and delete cluster
andreyvelich Oct 19, 2020
e9db92f
Fix path for NAS suggestions
andreyvelich Oct 19, 2020
83cbecd
Change make deploy
andreyvelich Oct 19, 2020
ce7a413
Fix path
andreyvelich Oct 19, 2020
a0bf0bf
Change deploy
andreyvelich Oct 19, 2020
ae827b5
Move create cluster to build
andreyvelich Oct 19, 2020
01618b4
Fix path to valid exp
andreyvelich Oct 19, 2020
9928bce
Remove
andreyvelich Oct 19, 2020
2196cc0
Change command
andreyvelich Oct 19, 2020
137f49b
Fix region
andreyvelich Oct 19, 2020
04a11b6
Fix run e2e go path
andreyvelich Oct 19, 2020
f2c78e6
Add backoff
andreyvelich Oct 19, 2020
8c544c2
Add github.com to folder
andreyvelich Oct 19, 2020
ebd7e0d
Add github to src folder
andreyvelich Oct 19, 2020
a848c4f
Fix Katib path
andreyvelich Oct 19, 2020
346084c
Print CRDs in e2e Experiment
andreyvelich Oct 19, 2020
0b1ef6f
Show known types
andreyvelich Oct 19, 2020
ced7095
Trigger CI
andreyvelich Oct 19, 2020
16ef99d
Set TypeMeta for experiment
andreyvelich Oct 19, 2020
89be584
Print known types
andreyvelich Oct 19, 2020
fb68b6d
Build binary e2e
andreyvelich Oct 19, 2020
bf9f744
manually create experiment
andreyvelich Oct 19, 2020
f084630
Print ns list
andreyvelich Oct 19, 2020
f128950
Remove GCP auth
andreyvelich Oct 19, 2020
7c89cbf
Set kube config
andreyvelich Oct 20, 2020
13c178c
Add other e2e tests
andreyvelich Oct 20, 2020
6289e52
Remove bin
andreyvelich Oct 20, 2020
61b3995
Fix template name
andreyvelich Oct 20, 2020
9898db2
Return exp in case of error
andreyvelich Oct 20, 2020
e407aa3
Deploy TF and PyTorch controllers
andreyvelich Oct 20, 2020
fea3383
Create Kubeflow namespace
andreyvelich Oct 20, 2020
729f6da
Remove v1alpha3 tests
andreyvelich Oct 20, 2020
b7a46ed
Remove Katib client changes
andreyvelich Oct 20, 2020
e80d7c9
Add ttl seconds after finished
andreyvelich Oct 21, 2020
5bb4249
Change to 5 hours
andreyvelich Oct 21, 2020
dac8a2e
Increase activeDeadlineSeconds
andreyvelich Oct 21, 2020
ce654af
Add comments
andreyvelich Oct 22, 2020
de3c82b
Add release workflow
andreyvelich Oct 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 139 additions & 139 deletions prow_config.yaml
Original file line number Diff line number Diff line change
@@ -1,98 +1,98 @@
# This file configures the workflows to trigger in our Prow jobs.
# see kubeflow/testing/py/run_e2e_workflow.py
workflows:
- app_dir: kubeflow/katib/test/workflows
component: workflows-v1alpha3
name: e2e-v1alpha3
job_types:
- presubmit
include_dirs:
- pkg/apis/controller/common/v1alpha3/*
- pkg/apis/controller/experiments/v1alpha3/*
- pkg/apis/controller/trials/v1alpha3/*
- pkg/apis/controller/suggestions/v1alpha3/*
- pkg/apis/controller/a*.go
- pkg/apis/manager/health/*
- pkg/apis/manager/v1alpha3/*
- pkg/apis/v1alpha3/*
- pkg/common/v1alpha3/*
- pkg/controller.v1alpha3/*
- pkg/db/v1alpha3/*
- pkg/job/v1alpha3/*
- pkg/metricscollector/v1alpha3/*
- pkg/suggestion/v1alpha3/*
- pkg/ui/v1alpha3/*
- pkg/util/v1alpha3/*
- pkg/webhook/v1alpha3/*
- cmd/db-manager/v1alpha3/*
- cmd/katib-controller/v1alpha3/*
- cmd/metricscollector/v1alpha3/*
- cmd/suggestion/chocolate/v1alpha3/*
- cmd/suggestion/hyperband/v1alpha3/*
- cmd/suggestion/hyperopt/v1alpha3/*
- cmd/suggestion/nas/enas/v1alpha3/*
- cmd/suggestion/nas/darts/v1alpha3/*
- cmd/suggestion/skopt/v1alpha3/*
- cmd/suggestion/goptuna/v1alpha3/*
- cmd/ui/v1alpha3/*
- examples/v1alpha3/*
- test/e2e/v1alpha3/*
- test/scripts/v1alpha3/*
- test/suggestion/v1alpha3/*
- test/unit/v1alpha3/*
- test/workflows/*
- manifests/v1alpha3/*
- scripts/v1alpha3/*
- vendor/*
- prow_config.yaml
params:
registry: "gcr.io/automl-ci"
- app_dir: kubeflow/katib/test/workflows
component: workflows-v1alpha3
name: e2e-v1alpha3
job_types:
- postsubmit
include_dirs:
- pkg/apis/controller/common/v1alpha3/*
- pkg/apis/controller/experiments/v1alpha3/*
- pkg/apis/controller/trials/v1alpha3/*
- pkg/apis/controller/suggestions/v1alpha3/*
- pkg/apis/controller/a*.go
- pkg/apis/manager/health/*
- pkg/apis/manager/v1alpha3/*
- pkg/apis/v1alpha3/*
- pkg/common/v1alpha3/*
- pkg/controller.v1alpha3/*
- pkg/db/v1alpha3/*
- pkg/job/v1alpha3/*
- pkg/metricscollector/v1alpha3/*
- pkg/suggestion/v1alpha3/*
- pkg/ui/v1alpha3/*
- pkg/util/v1alpha3/*
- pkg/webhook/v1alpha3/*
- cmd/db-manager/v1alpha3/*
- cmd/katib-controller/v1alpha3/*
- cmd/metricscollector/v1alpha3/*
- cmd/suggestion/chocolate/v1alpha3/*
- cmd/suggestion/hyperband/v1alpha3/*
- cmd/suggestion/hyperopt/v1alpha3/*
- cmd/suggestion/nas/enas/v1alpha3/*
- cmd/suggestion/nas/darts/v1alpha3/*
- cmd/suggestion/skopt/v1alpha3/*
- cmd/suggestion/goptuna/v1alpha3/*
- cmd/ui/v1alpha3/*
- examples/v1alpha3/*
- test/e2e/v1alpha3/*
- test/scripts/v1alpha3/*
- test/suggestion/v1alpha3/*
- test/unit/v1alpha3/*
- test/workflows/*
- manifests/v1alpha3/*
- scripts/v1alpha3/*
- vendor/*
- prow_config.yaml
params:
registry: "gcr.io/kubeflow-images-public"
# - app_dir: kubeflow/katib/test/workflows
# component: workflows-v1alpha3
# name: e2e-v1alpha3
# job_types:
# - presubmit
# include_dirs:
# - pkg/apis/controller/common/v1alpha3/*
# - pkg/apis/controller/experiments/v1alpha3/*
# - pkg/apis/controller/trials/v1alpha3/*
# - pkg/apis/controller/suggestions/v1alpha3/*
# - pkg/apis/controller/a*.go
# - pkg/apis/manager/health/*
# - pkg/apis/manager/v1alpha3/*
# - pkg/apis/v1alpha3/*
# - pkg/common/v1alpha3/*
# - pkg/controller.v1alpha3/*
# - pkg/db/v1alpha3/*
# - pkg/job/v1alpha3/*
# - pkg/metricscollector/v1alpha3/*
# - pkg/suggestion/v1alpha3/*
# - pkg/ui/v1alpha3/*
# - pkg/util/v1alpha3/*
# - pkg/webhook/v1alpha3/*
# - cmd/db-manager/v1alpha3/*
# - cmd/katib-controller/v1alpha3/*
# - cmd/metricscollector/v1alpha3/*
# - cmd/suggestion/chocolate/v1alpha3/*
# - cmd/suggestion/hyperband/v1alpha3/*
# - cmd/suggestion/hyperopt/v1alpha3/*
# - cmd/suggestion/nas/enas/v1alpha3/*
# - cmd/suggestion/nas/darts/v1alpha3/*
# - cmd/suggestion/skopt/v1alpha3/*
# - cmd/suggestion/goptuna/v1alpha3/*
# - cmd/ui/v1alpha3/*
# - examples/v1alpha3/*
# - test/e2e/v1alpha3/*
# - test/scripts/v1alpha3/*
# - test/suggestion/v1alpha3/*
# - test/unit/v1alpha3/*
# - test/workflows/*
# - manifests/v1alpha3/*
# - scripts/v1alpha3/*
# - vendor/*
# - prow_config.yaml
# params:
# registry: "gcr.io/automl-ci"
# - app_dir: kubeflow/katib/test/workflows
# component: workflows-v1alpha3
# name: e2e-v1alpha3
# job_types:
# - postsubmit
# include_dirs:
# - pkg/apis/controller/common/v1alpha3/*
# - pkg/apis/controller/experiments/v1alpha3/*
# - pkg/apis/controller/trials/v1alpha3/*
# - pkg/apis/controller/suggestions/v1alpha3/*
# - pkg/apis/controller/a*.go
# - pkg/apis/manager/health/*
# - pkg/apis/manager/v1alpha3/*
# - pkg/apis/v1alpha3/*
# - pkg/common/v1alpha3/*
# - pkg/controller.v1alpha3/*
# - pkg/db/v1alpha3/*
# - pkg/job/v1alpha3/*
# - pkg/metricscollector/v1alpha3/*
# - pkg/suggestion/v1alpha3/*
# - pkg/ui/v1alpha3/*
# - pkg/util/v1alpha3/*
# - pkg/webhook/v1alpha3/*
# - cmd/db-manager/v1alpha3/*
# - cmd/katib-controller/v1alpha3/*
# - cmd/metricscollector/v1alpha3/*
# - cmd/suggestion/chocolate/v1alpha3/*
# - cmd/suggestion/hyperband/v1alpha3/*
# - cmd/suggestion/hyperopt/v1alpha3/*
# - cmd/suggestion/nas/enas/v1alpha3/*
# - cmd/suggestion/nas/darts/v1alpha3/*
# - cmd/suggestion/skopt/v1alpha3/*
# - cmd/suggestion/goptuna/v1alpha3/*
# - cmd/ui/v1alpha3/*
# - examples/v1alpha3/*
# - test/e2e/v1alpha3/*
# - test/scripts/v1alpha3/*
# - test/suggestion/v1alpha3/*
# - test/unit/v1alpha3/*
# - test/workflows/*
# - manifests/v1alpha3/*
# - scripts/v1alpha3/*
# - vendor/*
# - prow_config.yaml
# params:
# registry: "gcr.io/kubeflow-images-public"
- app_dir: kubeflow/katib/test/workflows
component: workflows-v1beta1
name: e2e-v1beta1
Expand Down Expand Up @@ -138,50 +138,50 @@ workflows:
- vendor/*
- prow_config.yaml
params:
registry: "gcr.io/automl-ci"
- app_dir: kubeflow/katib/test/workflows
component: workflows-v1beta1
name: e2e-v1beta1
job_types:
- postsubmit
include_dirs:
- pkg/apis/controller/common/v1beta1/*
- pkg/apis/controller/experiments/v1beta1/*
- pkg/apis/controller/trials/v1beta1/*
- pkg/apis/controller/suggestions/v1beta1/*
- pkg/apis/controller/a*.go
- pkg/apis/manager/health/*
- pkg/apis/manager/v1beta1/*
- pkg/apis/v1beta1/*
- pkg/common/v1beta1/*
- pkg/controller.v1beta1/*
- pkg/db/v1beta1/*
- pkg/job/v1beta1/*
- pkg/metricscollector/v1beta1/*
- pkg/suggestion/v1beta1/*
- pkg/ui/v1beta1/*
- pkg/util/v1beta1/*
- pkg/webhook/v1beta1/*
- cmd/db-manager/v1beta1/*
- cmd/katib-controller/v1beta1/*
- cmd/metricscollector/v1beta1/*
- cmd/suggestion/chocolate/v1beta1/*
- cmd/suggestion/hyperband/v1beta1/*
- cmd/suggestion/hyperopt/v1beta1/*
- cmd/suggestion/nas/enas/v1beta1/*
- cmd/suggestion/nas/darts/v1beta1/*
- cmd/suggestion/skopt/v1beta1/*
- cmd/suggestion/goptuna/v1beta1/*
- cmd/ui/v1beta1/*
- examples/v1beta1/*
- test/e2e/v1beta1/*
- test/scripts/v1beta1/*
- test/suggestion/v1beta1/*
- test/unit/v1beta1/*
- test/workflows/*
- manifests/v1beta1/*
- scripts/v1beta1/*
- vendor/*
- prow_config.yaml
params:
registry: "gcr.io/kubeflow-images-public"
registry: http://527798164940.dkr.ecr.us-west-2.amazonaws.com/
# - app_dir: kubeflow/katib/test/workflows
# component: workflows-v1beta1
# name: e2e-v1beta1
# job_types:
# - postsubmit
# include_dirs:
# - pkg/apis/controller/common/v1beta1/*
# - pkg/apis/controller/experiments/v1beta1/*
# - pkg/apis/controller/trials/v1beta1/*
# - pkg/apis/controller/suggestions/v1beta1/*
# - pkg/apis/controller/a*.go
# - pkg/apis/manager/health/*
# - pkg/apis/manager/v1beta1/*
# - pkg/apis/v1beta1/*
# - pkg/common/v1beta1/*
# - pkg/controller.v1beta1/*
# - pkg/db/v1beta1/*
# - pkg/job/v1beta1/*
# - pkg/metricscollector/v1beta1/*
# - pkg/suggestion/v1beta1/*
# - pkg/ui/v1beta1/*
# - pkg/util/v1beta1/*
# - pkg/webhook/v1beta1/*
# - cmd/db-manager/v1beta1/*
# - cmd/katib-controller/v1beta1/*
# - cmd/metricscollector/v1beta1/*
# - cmd/suggestion/chocolate/v1beta1/*
# - cmd/suggestion/hyperband/v1beta1/*
# - cmd/suggestion/hyperopt/v1beta1/*
# - cmd/suggestion/nas/enas/v1beta1/*
# - cmd/suggestion/nas/darts/v1beta1/*
# - cmd/suggestion/skopt/v1beta1/*
# - cmd/suggestion/goptuna/v1beta1/*
# - cmd/ui/v1beta1/*
# - examples/v1beta1/*
# - test/e2e/v1beta1/*
# - test/scripts/v1beta1/*
# - test/suggestion/v1beta1/*
# - test/unit/v1beta1/*
# - test/workflows/*
# - manifests/v1beta1/*
# - scripts/v1beta1/*
# - vendor/*
# - prow_config.yaml
# params:
# registry: "gcr.io/kubeflow-images-public"
35 changes: 31 additions & 4 deletions test/e2e/v1beta1/run-e2e-experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (
"io/ioutil"
"log"
"os"
"os/exec"
"strconv"
"strings"
"time"

"k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -52,19 +54,25 @@ func main() {
log.Fatal("Experiment name is missing")
}
expName := os.Args[1]
b, err := ioutil.ReadFile(expName)
byteExp, err := ioutil.ReadFile(expName)
if err != nil {
log.Fatal("Error in reading file ", err)
}

// Replace the batch size argument with the number of epochs argument for faster execution.
strExp := strings.Replace(string(byteExp), "--batch-size=64", "--num-epochs=2", -1)

exp := &experimentsv1beta1.Experiment{}
buf := bytes.NewBufferString(string(b))
buf := bytes.NewBufferString(strExp)
if err = k8syaml.NewYAMLOrJSONDecoder(buf, 1024).Decode(exp); err != nil {
log.Fatal("Yaml decode error ", err)
}

kclient, err := katibclient.NewClient(client.Options{})
if err != nil {
log.Fatal("NewClient for Katib failed: ", err)
}

if exp.Spec.Algorithm.AlgorithmName != "hyperband" && exp.Spec.Algorithm.AlgorithmName != "darts" {
// Hyperband will validate the parallel trial count,
// thus we should not change it.
Expand All @@ -86,7 +94,7 @@ func main() {
}
log.Printf("Waiting for Experiment %s to finish.", exp.Name)
log.Printf(`Experiment %s's trials: %d trials, %d pending trials,
%d running trials, %d killed trials, %d succeeded trials, %d failed trials.`,
%d running trials, %d killed trials, %d succeeded trials, %d failed trials.`,
exp.Name,
exp.Status.Trials, exp.Status.TrialsPending, exp.Status.TrialsRunning,
exp.Status.TrialsKilled, exp.Status.TrialsSucceeded, exp.Status.TrialsFailed)
Expand Down Expand Up @@ -187,5 +195,24 @@ func main() {
}
}

log.Printf("Experiment has recorded best current Optimal Trial %v", exp.Status.CurrentOptimalTrial)
log.Printf("Experiment has recorded best current Optimal Trial %v\n", exp.Status.CurrentOptimalTrial)

out, err := exec.Command("kubectl", "describe", "suggestion", exp.Name, "-n", exp.Namespace).Output()
if err != nil {
log.Fatalf("Execute \"kubectl describe suggestion\" failed: %v", err)
}
fmt.Println(string(out))

out, err = exec.Command("kubectl", "describe", "experiment", exp.Name, "-n", exp.Namespace).Output()
if err != nil {
log.Fatalf("Execute \"kubectl describe experiment\" failed: %v", err)
}
fmt.Println(string(out))

log.Printf("Deleting Experiment: %v\n", exp.Name)
err = kclient.DeleteRuntimeObject(exp)
if err != nil {
log.Fatalf("Unable to delete Experiment: %v, error: %v", exp.Name, err)
}

}
Loading