Merged
Changes from 11 commits
1 change: 1 addition & 0 deletions .github/workflows/check-pr-title.yaml
Original file line number Diff line number Diff line change
@@ -33,6 +33,7 @@ jobs:
ci
docs
examples
optimizer
scripts
test
trainer
16 changes: 9 additions & 7 deletions Makefile
@@ -37,8 +37,6 @@ VENV_DIR := $(PROJECT_DIR)/.venv
help: ## Display this help.
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)

#UV := $(shell which uv)

.PHONY: uv
uv: ## Install UV
@command -v uv &> /dev/null || { \
@@ -57,7 +55,7 @@ verify: install-dev ## install all required tools
@uv run ruff format --check kubeflow

.PHONY: uv-venv
uv-venv:
uv-venv: ## Create uv virtual environment
@if [ ! -d "$(VENV_DIR)" ]; then \
echo "Creating uv virtual environment in $(VENV_DIR)..."; \
uv venv; \
@@ -75,10 +73,14 @@ release: install-dev

# make test-python will produce html coverage by default. Run with `make test-python report=xml` to produce xml report.
.PHONY: test-python
test-python: uv-venv
test-python: uv-venv ## Run Python unit tests
@uv sync
@uv run coverage run --source=kubeflow.trainer.backends.kubernetes.backend,kubeflow.trainer.utils.utils -m pytest ./kubeflow/trainer/backends/kubernetes/backend_test.py ./kubeflow/trainer/utils/utils_test.py
@uv run coverage report -m kubeflow/trainer/backends/kubernetes/backend.py kubeflow/trainer/utils/utils.py
@uv run coverage run --source=kubeflow.trainer.backends.kubernetes.backend,kubeflow.trainer.utils.utils -m pytest \
./kubeflow/trainer/backends/kubernetes/backend_test.py \
./kubeflow/trainer/backends/kubernetes/utils_test.py
@uv run coverage report -m \
kubeflow/trainer/backends/kubernetes/backend.py \
kubeflow/trainer/backends/kubernetes/utils.py
ifeq ($(report),xml)
@uv run coverage xml
else
@@ -87,7 +89,7 @@ endif


.PHONY: install-dev
install-dev: uv uv-venv ruff ## Install uv, create .venv, sync deps; DEV=1 to include dev group; EXTRAS=comma,list for extras
install-dev: uv uv-venv ruff ## Install uv, create .venv, sync deps.
@echo "Using virtual environment at: $(VENV_DIR)"
@echo "Syncing dependencies with uv..."
@uv sync
12 changes: 11 additions & 1 deletion docs/proposals/2-trainer-local-execution/README.md
@@ -14,13 +14,15 @@ AI Practitioners often want to experiment locally before scaling their models to
The proposed local execution mode will allow engineers to quickly test their models in isolated containers or virtualenvs via subprocess, facilitating a faster and more efficient workflow.

### Goals

- Allow users to run training jobs on their local machines using container runtimes or subprocess.
- Rework the current Kubeflow Trainer SDK to implement Execution Backends, with the Kubernetes Backend as the default.
- Implement Local Execution Backends that integrate seamlessly with the Kubeflow SDK, supporting both single-node and multi-node training processes.
- Provide an implementation that supports PyTorch, with the potential to extend to other ML frameworks or runtimes.
- Ensure compatibility with existing Kubeflow Trainer SDK features and user interfaces.

### Non-Goals

- Full support for distributed training in the first phase of implementation.
- Support for all ML frameworks or runtime environments in the initial proof-of-concept.
- Major changes to the Kubeflow Trainer SDK architecture.
@@ -34,18 +36,22 @@ The local execution mode will allow users to run training jobs in container runt
### User Stories (Optional)

#### Story 1

As an AI Practitioner, I want to run my model locally using Podman/Docker containers so that I can test my training job without incurring the costs of running a Kubernetes cluster.

#### Story 2

As an AI Practitioner, I want to initialize datasets and models within Podman/Docker containers, so that I can streamline my local training environment.

### Notes/Constraints/Caveats

- Local execution mode will first support Subprocess, with future plans to explore Podman, Docker, and Apple Container.
- The subprocess implementation will be restricted to a single node.
- The local execution mode will initially support only the PyTorch runtime.
- Resource limits on memory, CPU, and GPU are not fully supported locally and may remain unsupported if the execution backend doesn't expose APIs for them.

### Risks and Mitigations

- **Risk**: Compatibility issues with non-Docker container runtimes.
- **Mitigation**: Initially restrict support to Podman/Docker and evaluate alternatives for future phases.
- **Risk**: Potential conflicts between local and Kubernetes execution modes.
@@ -55,7 +61,7 @@ As an AI Practitioner, I want to initialize datasets and models within Podman/Do

The local execution mode will be implemented using new `LocalProcessBackend`, `PodmanBackend`, and `DockerBackend` classes, which will allow users to execute training jobs using containers and virtual environment isolation. The client will utilize container runtime capabilities to create isolated environments, including volumes and networks, to manage the training lifecycle. It will also allow for easy dataset and model initialization.

- Different execution backends will need to implement the same interface from the `ExecutionBackend` abstract class so `TrainerClient` can initialize and load the backend.
- Different execution backends will need to implement the same interface from the `RuntimeBackend` abstract class so `TrainerClient` can initialize and load the backend.
- The Podman/Docker client will connect to a local container environment, create shared volumes, and initialize datasets and models as needed.
- The **DockerBackend** will manage Docker containers, networks, and volumes using runtime definitions specified by the user.
- The **PodmanBackend** will manage Podman containers, networks, and volumes using runtime definitions specified by the user.
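To make the shared interface concrete, here is a minimal sketch of what the `RuntimeBackend` abstract class and one backend might look like. This is an illustration, not the actual SDK code: the method names, signatures, and the toy `LocalProcessBackend` below are assumptions for demonstration.

```python
from abc import ABC, abstractmethod


class RuntimeBackend(ABC):
    """Common interface every execution backend implements (hypothetical sketch)."""

    @abstractmethod
    def train(self, trainer=None) -> str:
        """Create a training job and return its generated name."""

    @abstractmethod
    def get_job(self, name: str):
        """Return the job identified by name."""

    @abstractmethod
    def delete_job(self, name: str) -> None:
        """Delete the job identified by name."""


class LocalProcessBackend(RuntimeBackend):
    """Toy backend that tracks jobs in a dict instead of spawning subprocesses."""

    def __init__(self):
        self._jobs = {}

    def train(self, trainer=None) -> str:
        name = f"job-{len(self._jobs)}"
        self._jobs[name] = {"trainer": trainer, "status": "Created"}
        return name

    def get_job(self, name: str):
        return self._jobs[name]

    def delete_job(self, name: str) -> None:
        del self._jobs[name]
```

Because every backend satisfies the same interface, `TrainerClient` can hold a `RuntimeBackend` reference and dispatch to Kubernetes, subprocess, or container backends without branching on the concrete type.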
@@ -70,16 +76,20 @@ The local execution mode will be implemented using a new `LocalProcessBackend`,
- **E2E Tests**: Conduct end-to-end tests to validate the local execution mode, ensuring that jobs can be initialized, executed, and tracked correctly within Podman/Docker containers.

### Graduation Criteria

- The feature will move to the `beta` stage once it supports multi-node training with the PyTorch framework as the default runtime and works seamlessly with local environments.
- Full support for multi-worker configurations and additional ML frameworks will be considered for the `stable` release.

## Implementation History

- **KEP Creation**: April 2025
- **Implementation Start**: April 2025

## Drawbacks

- The initial implementation will be limited to single-worker training jobs, which may restrict users who need multi-node support.
- The local execution mode will initially only support Subprocess and may require additional configurations for Podman/Docker container runtimes in the future.

## Alternatives

- **Full Kubernetes Execution**: Enable users to always run jobs on Kubernetes clusters, though this comes with higher costs and longer development cycles for ML engineers.
Empty file added kubeflow/common/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions kubeflow/common/constants.py
@@ -0,0 +1,22 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The default Kubernetes namespace.
DEFAULT_NAMESPACE = "default"

# How long to wait in seconds for requests to the Kubernetes API Server.
DEFAULT_TIMEOUT = 120

# Unknown indicates that the value can't be identified.
UNKNOWN = "Unknown"
File renamed without changes.
40 changes: 40 additions & 0 deletions kubeflow/common/utils.py
@@ -0,0 +1,40 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Optional

from kubernetes import config

from kubeflow.common import constants


def is_running_in_k8s() -> bool:
return os.path.isdir("/var/run/secrets/kubernetes.io/")


def get_default_target_namespace(context: Optional[str] = None) -> str:
if not is_running_in_k8s():
try:
all_contexts, current_context = config.list_kube_config_contexts()
# If context is set, we should get namespace from it.
if context:
for c in all_contexts:
if isinstance(c, dict) and c.get("name") == context:
return c["context"]["namespace"]
# Otherwise, try to get namespace from the current context.
return current_context["context"]["namespace"]
except Exception:
return constants.DEFAULT_NAMESPACE
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
        # Strip the trailing newline so callers get a clean namespace string.
        return f.readline().strip()
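The context-lookup logic in `get_default_target_namespace` can be illustrated with a hard-coded contexts list standing in for a real kubeconfig. The context names and namespaces below are hypothetical; `resolve_namespace` mirrors the function's control flow, including the fallback to the current context when a requested context name is not found.

```python
# Hypothetical contexts, shaped like the output of
# kubernetes.config.list_kube_config_contexts().
all_contexts = [
    {"name": "dev", "context": {"cluster": "dev-cluster", "namespace": "team-a"}},
    {"name": "prod", "context": {"cluster": "prod-cluster", "namespace": "team-b"}},
]
current_context = all_contexts[1]


def resolve_namespace(context=None, default="default"):
    """Re-implements the lookup for illustration only."""
    try:
        # If a context name is given, search for it by name.
        if context:
            for c in all_contexts:
                if isinstance(c, dict) and c.get("name") == context:
                    return c["context"]["namespace"]
        # Otherwise (or if the name was not found), use the current context.
        return current_context["context"]["namespace"]
    except Exception:
        return default


print(resolve_namespace("dev"))      # team-a
print(resolve_namespace())           # team-b (current context)
print(resolve_namespace("missing"))  # team-b (falls through to current context)
```

Note that an unknown context name silently falls through to the current context's namespace rather than raising, which matches the original function's behavior.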
38 changes: 38 additions & 0 deletions kubeflow/optimizer/__init__.py
@@ -0,0 +1,38 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import common types.
from kubeflow.common.types import KubernetesBackendConfig

# Import the Kubeflow Optimizer client.
from kubeflow.optimizer.api.optimizer_client import OptimizerClient

# Import the Kubeflow Optimizer types.
from kubeflow.optimizer.types.algorithm_types import RandomSearch
from kubeflow.optimizer.types.optimization_types import Objective, OptimizationJob, TrialConfig
from kubeflow.optimizer.types.search_types import Search

# Import the Kubeflow Trainer types.
from kubeflow.trainer.types.types import TrainJobTemplate

__all__ = [
Contributor: Shall we add GridSearch here?

Member Author: Good catch!

"KubernetesBackendConfig",
"Objective",
"OptimizationJob",
"OptimizerClient",
"RandomSearch",
"Search",
"TrainJobTemplate",
"TrialConfig",
]
Empty file.
126 changes: 126 additions & 0 deletions kubeflow/optimizer/api/optimizer_client.py
@@ -0,0 +1,126 @@
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Any, Optional

from kubeflow.common.types import KubernetesBackendConfig
from kubeflow.optimizer.backends.kubernetes.backend import KubernetesBackend
from kubeflow.optimizer.types.algorithm_types import RandomSearch
from kubeflow.optimizer.types.optimization_types import Objective, OptimizationJob, TrialConfig
from kubeflow.trainer.types.types import TrainJobTemplate

logger = logging.getLogger(__name__)


class OptimizerClient:
def __init__(
self,
backend_config: Optional[KubernetesBackendConfig] = None,
):
"""Initialize a Kubeflow Optimizer client.

Args:
backend_config: Backend configuration. Either KubernetesBackendConfig or None to use
default config class. Defaults to KubernetesBackendConfig.

Raises:
ValueError: Invalid backend configuration.

"""
# Set the default backend config.
if not backend_config:
backend_config = KubernetesBackendConfig()
Contributor: nit, just for consistency shall we match trainer and use the same import style:

    if not backend_config:
        backend_config = common_types.KubernetesBackendConfig()

Member Author: Let me go the other way around, though.


if isinstance(backend_config, KubernetesBackendConfig):
self.backend = KubernetesBackend(backend_config)
else:
raise ValueError(f"Invalid backend config '{backend_config}'")

def optimize(
self,
trial_template: TrainJobTemplate,
*,
trial_config: Optional[TrialConfig] = None,
search_space: dict[str, Any],
objectives: Optional[list[Objective]] = None,
algorithm: Optional[RandomSearch] = None,
Contributor: Should we consider adding options already?

Member Author: Let's add it in the follow-up PR, since we want to limit the number of APIs users can configure initially for the Experiment CR.

Contributor: Sounds good!

Contributor: I wonder whether we should accept a base type instead so any algorithm works without changing the API in the future?

Suggested change:
-    algorithm: Optional[RandomSearch] = None,
+    algorithm: Optional[BaseAlgorithm] = None,

Member Author: Good point!
) -> str:
"""Create an OptimizationJob for hyperparameter tuning.

Args:
trial_template: The TrainJob template defining the training script.
trial_config: Optional configuration to run Trials.
objectives: List of objectives to optimize.
search_space: Dictionary mapping parameter names to Search specifications using
Search.uniform(), Search.loguniform(), Search.choice(), etc.
algorithm: The optimization algorithm to use. Defaults to RandomSearch.

Returns:
The unique name of the Experiment that has been generated.

Raises:
ValueError: Input arguments are invalid.
TimeoutError: Timeout to create Experiment.
RuntimeError: Failed to create Experiment.
"""
return self.backend.optimize(
trial_template=trial_template,
trial_config=trial_config,
objectives=objectives,
search_space=search_space,
algorithm=algorithm,
)
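To make the `search_space` shape concrete, here is a minimal stand-in for the `Search` helper. The real class lives in `kubeflow.optimizer.types.search_types`; the return format below (plain dicts describing each distribution) is an assumption for illustration only.

```python
class Search:
    """Toy stand-in for the SDK's Search helper (hypothetical return format)."""

    @staticmethod
    def uniform(low: float, high: float) -> dict:
        # Sample uniformly between low and high.
        return {"distribution": "uniform", "min": low, "max": high}

    @staticmethod
    def loguniform(low: float, high: float) -> dict:
        # Sample on a log scale, useful for learning rates.
        return {"distribution": "loguniform", "min": low, "max": high}

    @staticmethod
    def choice(options: list) -> dict:
        # Sample from a discrete set of values.
        return {"distribution": "choice", "values": options}


# A search space of the shape optimize() expects: parameter name -> Search spec.
search_space = {
    "lr": Search.loguniform(1e-5, 1e-1),
    "batch_size": Search.choice([16, 32, 64]),
    "momentum": Search.uniform(0.5, 0.99),
}
```

In a real call, this mapping would be passed as the `search_space` argument of `OptimizerClient.optimize()`, with each key matching a hyperparameter consumed by the trial template's training function.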

def list_jobs(self) -> list[OptimizationJob]:
"""List of the created OptimizationJobs

Returns:
List of created OptimizationJobs. If no OptimizationJob exist,
an empty list is returned.

Raises:
TimeoutError: Timeout to list OptimizationJobs.
RuntimeError: Failed to list OptimizationJobs.
"""

return self.backend.list_jobs()

def get_job(self, name: str) -> OptimizationJob:
"""Get the OptimizationJob object

Args:
name: Name of the OptimizationJob.

Returns:
            An OptimizationJob object.

Raises:
            TimeoutError: Timeout to get an OptimizationJob.
            RuntimeError: Failed to get an OptimizationJob.
"""

return self.backend.get_job(name=name)

def delete_job(self, name: str):
"""Delete the OptimizationJob.

Args:
name: Name of the OptimizationJob.

Raises:
TimeoutError: Timeout to delete OptimizationJob.
RuntimeError: Failed to delete OptimizationJob.
"""
return self.backend.delete_job(name=name)
@@ -1,4 +1,4 @@
# Copyright 2024 The Kubeflow Authors.
# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.