Merge branch 'feature-py-sdk-create-exp-generateName' of github.com:bharathk005/katib into feature-py-sdk-create-exp-generateName
kubeflow · Apr 2, 2024 · a4e4b62 · a4e4b62
2 parents c583c68 + 9e24312
commit a4e4b62
Show file tree

Hide file tree

Showing 37 changed files with 377 additions and 128 deletions.
diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml
@@ -21,7 +21,27 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: 3.11
 
       - name: Run Python test
         run: make pytest
+
+  # The skopt service doesn't work properly with Python 3.11,
+  # so we need to run its test with Python 3.9.
+  # TODO (tenzen-y): Once we stop supporting skopt, we can remove this test.
+  # REF: https://github.com/kubeflow/katib/issues/2280
+  test-skopt:
+    name: Test Skopt
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+
+      - name: Run Python test
+        run: make pytest-skopt
diff --git a/Makefile b/Makefile
@@ -157,7 +157,6 @@ update-boilerplate:
 prepare-pytest:
 	pip install --prefer-binary -r test/unit/v1beta1/requirements.txt
 	pip install --prefer-binary -r cmd/suggestion/hyperopt/v1beta1/requirements.txt
-	pip install --prefer-binary -r cmd/suggestion/skopt/v1beta1/requirements.txt
 	pip install --prefer-binary -r cmd/suggestion/optuna/v1beta1/requirements.txt
 	pip install --prefer-binary -r cmd/suggestion/hyperband/v1beta1/requirements.txt
 	pip install --prefer-binary -r cmd/suggestion/nas/enas/v1beta1/requirements.txt
@@ -176,6 +175,16 @@ ifeq ("$(wildcard $(TEST_TENSORFLOW_EVENT_FILE_PATH))", "")
 endif
 
 pytest: prepare-pytest prepare-pytest-testdata
-	PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/suggestion
+	PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/suggestion --ignore=./test/unit/v1beta1/suggestion/test_skopt_service.py
 	PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/earlystopping
 	PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/metricscollector
+
+# The skopt service doesn't work properly with Python 3.11,
+# so we need to run its test with Python 3.9.
+# TODO (tenzen-y): Once we stop supporting skopt, we can remove this test.
+# REF: https://github.com/kubeflow/katib/issues/2280
+pytest-skopt:
+	pip install six
+	pip install --prefer-binary -r test/unit/v1beta1/requirements.txt
+	pip install --prefer-binary -r cmd/suggestion/skopt/v1beta1/requirements.txt
+	PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/suggestion/test_skopt_service.py
diff --git a/cmd/earlystopping/medianstop/v1beta1/Dockerfile b/cmd/earlystopping/medianstop/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/hyperband/v1beta1/Dockerfile b/cmd/suggestion/hyperband/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/hyperband/v1beta1/requirements.txt b/cmd/suggestion/hyperband/v1beta1/requirements.txt
@@ -1,6 +1,6 @@
 grpcio>=1.41.1
 cloudpickle==0.5.6
-numpy>=1.20.0
+numpy>=1.25.2
 scikit-learn>=0.24.0
 scipy>=1.5.4
 forestci==0.3

diff --git a/cmd/suggestion/hyperopt/v1beta1/Dockerfile b/cmd/suggestion/hyperopt/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/hyperopt/v1beta1/requirements.txt b/cmd/suggestion/hyperopt/v1beta1/requirements.txt
@@ -1,6 +1,6 @@
 grpcio>=1.41.1
 cloudpickle==0.5.6
-numpy>=1.20.0
+numpy>=1.25.2
 scikit-learn>=0.24.0
 scipy>=1.5.4
 forestci==0.3

diff --git a/cmd/suggestion/nas/darts/v1beta1/Dockerfile b/cmd/suggestion/nas/darts/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/nas/enas/v1beta1/Dockerfile b/cmd/suggestion/nas/enas/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/optuna/v1beta1/Dockerfile b/cmd/suggestion/optuna/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/pbt/v1beta1/Dockerfile b/cmd/suggestion/pbt/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/katib

diff --git a/cmd/suggestion/pbt/v1beta1/requirements.txt b/cmd/suggestion/pbt/v1beta1/requirements.txt
@@ -1,4 +1,4 @@
 grpcio>=1.41.1
 protobuf>=3.19.5, <=3.20.3
 googleapis-common-protos==1.53.0
-numpy==1.22.2
+numpy==1.25.2
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
@@ -16,7 +16,7 @@ see the following user guides:
 - [Docker](https://docs.docker.com/) (20.10 or later)
 - [Docker Buildx](https://docs.docker.com/build/buildx/) (0.8.0 or later)
 - [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later)
-- [Python](https://www.python.org/) (3.10 or later)
+- [Python](https://www.python.org/) (3.11 or later)
 - [kustomize](https://kustomize.io/) (4.0.5 or later)
 
 ## Build from source code

diff --git a/examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.cpu
@@ -1,11 +1,12 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ENV TARGET_DIR /opt/darts-cnn-cifar10
 
 ADD examples/v1beta1/trial-images/darts-cnn-cifar10 ${TARGET_DIR}
 
 WORKDIR  ${TARGET_DIR}
 
+RUN pip install --prefer-binary --no-cache-dir torch==2.2.1 torchvision==0.17.1
 RUN pip install --prefer-binary --no-cache-dir -r requirements.txt
 RUN chgrp -R 0 ${TARGET_DIR} \
   && chmod -R g+rwX ${TARGET_DIR}

diff --git a/examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.gpu b/examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.gpu
@@ -1,7 +1,7 @@
 # We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux_arm64 platforms.
-# PyTorch=1.13.0, cuda=11.8.0
-# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html#rel-22-11
-FROM nvcr.io/nvidia/pytorch:22.11-py3
+# PyTorch=2.2.0, cuda=12.3.2
+# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
+FROM nvcr.io/nvidia/pytorch:24.01-py3
 
 ENV TARGET_DIR /opt/darts-cnn-cifar10
 

diff --git a/examples/v1beta1/trial-images/darts-cnn-cifar10/architect.py b/examples/v1beta1/trial-images/darts-cnn-cifar10/architect.py
@@ -20,11 +20,12 @@ class Architect():
     """" Architect controls architecture of cell by computing gradients of alphas
     """
 
-    def __init__(self, model, w_momentum, w_weight_decay):
+    def __init__(self, model, w_momentum, w_weight_decay, device):
         self.model = model
         self.v_model = copy.deepcopy(model)
         self.w_momentum = w_momentum
         self.w_weight_decay = w_weight_decay
+        self.device = device
 
     def virtual_step(self, train_x, train_y, xi, w_optim):
         """
@@ -43,17 +44,21 @@ def virtual_step(self, train_x, train_y, xi, w_optim):
         # Forward and calculate loss
         # Loss for train with w. L_train(w)
         loss = self.model.loss(train_x, train_y)
+
         # Compute gradient
         gradients = torch.autograd.grad(loss, self.model.getWeights())
-
+        
         # Do virtual step (Update gradient)
         # Below operations do not need gradient tracking
         with torch.no_grad():
             # dict key is not the value, but the pointer. So original network weight have to
             # be iterated also.
             for w, vw, g in zip(self.model.getWeights(), self.v_model.getWeights(), gradients):
                 m = w_optim.state[w].get("momentum_buffer", 0.) * self.w_momentum
-                vw.copy_(w - torch.FloatTensor(xi) * (m + g + self.w_weight_decay * w))
+                if(self.device == 'cuda'):
+                    vw.copy_(w - torch.cuda.FloatTensor(xi) * (m + g + self.w_weight_decay * w))
+                elif(self.device == 'cpu'):
+                    vw.copy_(w - torch.FloatTensor(xi) * (m + g + self.w_weight_decay * w))
 
             # Sync alphas
             for a, va in zip(self.model.getAlphas(), self.v_model.getAlphas()):
@@ -71,7 +76,7 @@ def unrolled_backward(self, train_x, train_y, valid_x, valid_y, xi, w_optim):
         # Calculate unrolled loss
         # Loss for validation with w'. L_valid(w')
         loss = self.v_model.loss(valid_x, valid_y)
-
+        
         # Calculate gradient
         v_alphas = tuple(self.v_model.getAlphas())
         v_weights = tuple(self.v_model.getWeights())
@@ -85,7 +90,10 @@ def unrolled_backward(self, train_x, train_y, valid_x, valid_y, xi, w_optim):
         # Update final gradient = dalpha - xi * hessian
         with torch.no_grad():
             for alpha, da, h in zip(self.model.getAlphas(), dalpha, hessian):
-                alpha.grad = da - torch.FloatTensor(xi) * h
+                if(self.device == 'cuda'):
+                    alpha.grad = da - torch.cuda.FloatTensor(xi) * h
+                elif(self.device == 'cpu'):
+                    alpha.grad = da - torch.FloatTensor(xi) * h  # torch.cpu has no FloatTensor
 
     def compute_hessian(self, dws, train_x, train_y):
         """

diff --git a/examples/v1beta1/trial-images/darts-cnn-cifar10/requirements.txt b/examples/v1beta1/trial-images/darts-cnn-cifar10/requirements.txt
@@ -1,3 +1 @@
-torch==1.13.1
-torchvision==0.14.1
 Pillow>=9.1.1
diff --git a/examples/v1beta1/trial-images/darts-cnn-cifar10/run_trial.py b/examples/v1beta1/trial-images/darts-cnn-cifar10/run_trial.py
@@ -140,7 +140,7 @@ def main():
         num_epochs,
         eta_min=w_lr_min)
 
-    architect = Architect(model, w_momentum, w_weight_decay)
+    architect = Architect(model, w_momentum, w_weight_decay, device)
 
     # Start training
     best_top1 = 0.

diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 ENV TARGET_DIR /opt/enas-cnn-cifar10

diff --git a/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
@@ -1,11 +1,12 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ADD examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist
 
 WORKDIR /opt/pytorch-mnist
 
 # Add folder for the logs.
 RUN mkdir /katib
+RUN pip install --prefer-binary --no-cache-dir torch==2.2.1 torchvision==0.17.1
 RUN pip install --prefer-binary --no-cache-dir -r requirements.txt
 
 RUN chgrp -R 0 /opt/pytorch-mnist \

diff --git a/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
@@ -1,7 +1,7 @@
 # We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux_arm64 platforms.
-# PyTorch=1.13.0, cuda=11.8.0
-# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html#rel-22-11
-FROM nvcr.io/nvidia/pytorch:22.11-py3
+# PyTorch=2.2.0, cuda=12.3.2
+# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
+FROM nvcr.io/nvidia/pytorch:24.01-py3
 
 ADD examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist
 

diff --git a/examples/v1beta1/trial-images/pytorch-mnist/requirements.txt b/examples/v1beta1/trial-images/pytorch-mnist/requirements.txt
@@ -1,4 +1,2 @@
 cloudml-hypertune==0.1.0.dev6
-torch==1.13.1
-torchvision==0.14.1
 Pillow>=9.1.1
diff --git a/examples/v1beta1/trial-images/simple-pbt/Dockerfile b/examples/v1beta1/trial-images/simple-pbt/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ADD examples/v1beta1/trial-images/simple-pbt /opt/pbt
 

diff --git a/examples/v1beta1/trial-images/simple-pbt/requirements.txt b/examples/v1beta1/trial-images/simple-pbt/requirements.txt
@@ -1 +1 @@
-numpy==1.22.2
+numpy==1.25.2
diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 
 ARG TARGETARCH
 

diff --git a/go.mod b/go.mod
@@ -21,8 +21,8 @@ require (
 	github.com/shirou/gopsutil/v3 v3.22.5
 	github.com/spf13/viper v1.9.0
 	github.com/tidwall/gjson v1.14.1
-	golang.org/x/net v0.10.0
-	google.golang.org/grpc v1.55.0
+	golang.org/x/net v0.17.0
+	google.golang.org/grpc v1.56.3
 	k8s.io/api v0.27.4
 	k8s.io/apimachinery v0.27.4
 	k8s.io/client-go v0.27.4
@@ -69,7 +69,7 @@ require (
 	github.com/dimchansky/utfbom v1.1.1 // indirect
 	github.com/docker/cli v24.0.0+incompatible // indirect
 	github.com/docker/distribution v2.8.2+incompatible // indirect
-	github.com/docker/docker v24.0.0+incompatible // indirect
+	github.com/docker/docker v24.0.9+incompatible // indirect
 	github.com/docker/docker-credential-helpers v0.7.0 // indirect
 	github.com/emicklei/go-restful/v3 v3.10.2 // indirect
 	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
@@ -126,20 +126,20 @@ require (
 	go.uber.org/atomic v1.7.0 // indirect
 	go.uber.org/multierr v1.6.0 // indirect
 	go.uber.org/zap v1.24.0 // indirect
-	golang.org/x/crypto v0.9.0 // indirect
+	golang.org/x/crypto v0.17.0 // indirect
 	golang.org/x/mod v0.10.0 // indirect
 	golang.org/x/oauth2 v0.8.0 // indirect
 	golang.org/x/sync v0.2.0 // indirect
-	golang.org/x/sys v0.8.0 // indirect
-	golang.org/x/term v0.8.0 // indirect
-	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/sys v0.15.0 // indirect
+	golang.org/x/term v0.15.0 // indirect
+	golang.org/x/text v0.14.0 // indirect
 	golang.org/x/time v0.3.0 // indirect
 	golang.org/x/tools v0.9.1 // indirect
 	gomodules.xyz/jsonpatch/v2 v2.3.0 // indirect
 	gonum.org/v1/gonum v0.8.2 // indirect
 	google.golang.org/appengine v1.6.7 // indirect
 	google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
-	google.golang.org/protobuf v1.30.0 // indirect
+	google.golang.org/protobuf v1.33.0 // indirect
 	gopkg.in/fsnotify/fsnotify.v1 v1.4.7 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
 	gopkg.in/ini.v1 v1.63.2 // indirect