From 6108300220d6264d3093ccedc5e274312a6ec109 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 13 May 2024 23:52:16 +0300 Subject: [PATCH 01/64] Initial version. Signed-off-by: Revital Sur --- .make.versions | 2 +- kfp/requirements.env | 2 +- transforms/.make.transforms_workflows | 13 ++++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.make.versions b/.make.versions index 039c1a502..5933c5b2b 100644 --- a/.make.versions +++ b/.make.versions @@ -27,5 +27,5 @@ CODE_QUALITY_VERSION=0.4.0 DOC_QUALITY_VERSION=0.4.0 INGEST_TO_PARQUET_VERSION=0.4.0 - +KFP_DOCKER_VERSION_v2=0.1.0-v2 KFP_DOCKER_VERSION=0.2.0 diff --git a/kfp/requirements.env b/kfp/requirements.env index ef5110bcc..7a9c3f360 100644 --- a/kfp/requirements.env +++ b/kfp/requirements.env @@ -1,2 +1,2 @@ RAY=2.9.3 -KFP=1.8.22 +KFP=2.7.0 diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index 19370568d..0e1ba3540 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -52,7 +52,18 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen . ${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/; @# Help: Create the virtual environment common to all workflows - + + +#TODO KFPv2 +${VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt + @# Help: Create the virtual environment common to all workflows + rm -rf ${REPOROOT}/kfp/transform_workflows/venv + $(PYTHON) -m venv ${REPOROOT}/kfp/transform_workflows/venv + . ${VENV_ACTIVATE}; \ + pip install kfp==${KFP} --extra-index-url https://pypi.org/simple; \ + pip install kfp-kubernetes --extra-index-url https://pypi.org/simple; \ + pip install -e $(REPOROOT)/kfp/kfp_support_lib/ + .PHONY: .transforms_workflows.upload-pipeline .transforms_workflows.upload-pipeline: $(call set_env_var, CLUSTER_EXISTS, $(shell kind get clusters | grep ${KIND_CLUSTER_NAME})) From 0d4b3f75122795448385dc33c389071ddd0da86a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 15 May 2024 06:55:01 -0500 Subject: [PATCH 02/64] Comment compute_exec_params_op. 
Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 69f9f0d67..23e1c14d8 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -8,6 +8,8 @@ LABEL git-commit=$GIT_COMMIT # install libraries COPY requirements.txt requirements.txt +RUN pip install kfp==2.7.0 --extra-index-url https://pypi.org/simple +RUN pip install kfp-kubernetes --extra-index-url https://pypi.org/simple RUN pip install --no-cache-dir -r requirements.txt # Copy and install data processing libraries From b2ad17f8e8e6cbe0f02ea1d4c79e59f48d3ae853 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Tue, 21 May 2024 15:07:57 +0300 Subject: [PATCH 03/64] separate lib for compile only dependencies Signed-off-by: Alexey Roytman --- .../workflow_support/comp_utils/__init__.py | 3 + .../workflow_support/comp_utils/component.py | 54 ++++++++ .../pipeline_utils/__init__.py | 3 + .../pipeline_utils/pipeline_utils.py | 121 ++++++++++++++++++ .../pipelines_tests_utils.py | 16 ++- 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py create mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/component.py create mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py create mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py rename kfp/kfp_support_lib/src/kfp_support/workflow_support/{utils => pipeline_utils}/pipelines_tests_utils.py (78%) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py new file mode 100644 index 000000000..9bc541af8 --- /dev/null +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py @@ -0,0 +1,3 @@ +from kfp_support.workflow_support.comp_utils.component import ( + CompileComponentUtils +) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/component.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/component.py new file mode 100644 index 000000000..adaa971c1 --- /dev/null +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/component.py @@ -0,0 +1,54 @@ +import kfp.dsl as dsl +from kfp import kubernetes +from typing import Dict + +RUN_NAME = "KFP_RUN_NAME" + +class CompileComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + @staticmethod + def add_settings_to_component( + task: dsl.PipelineTask, + timeout: int, + image_pull_policy: str = "IfNotPresent", + cache_strategy: bool = False, + ) -> None: + """ + Add settings to kfp task + :param task: kfp task + :param timeout: timeout to set to the component in seconds + :param image_pull_policy: pull policy to set to the component + :param cache_strategy: cache strategy + """ + + kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # Set caching + task.set_caching_options(enable_caching=cache_strategy) + # image pull policy + kubernetes.set_image_pull_policy(task, image_pull_policy) + # Set the timeout for the task (in seconds) + kubernetes.set_timeout(task, seconds=timeout) + + @staticmethod + def set_s3_env_vars_to_component( + task: dsl.PipelineTask, + secret: str = '', + env2key: 
Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, + prefix: str = None, + ) -> None: + """ + Set S3 env variables to KFP component + :param task: kfp task + :param secret: secret name with the S3 credentials + :param env2key: dict with mapping each env variable to a key in the secret + :param prefix: prefix to add to env name + """ + + if prefix is not None: + for env_name, _ in env2key.items(): + env2key[prefix + "_" + env_name] = env2key.pop(env_name) + kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py new file mode 100644 index 000000000..654608dc4 --- /dev/null +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py @@ -0,0 +1,3 @@ +from kfp_support.workflow_support.pipeline_utils.pipeline_utils import ( + PipelinesUtils, +) \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py new file mode 100644 index 000000000..47d886209 --- /dev/null +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py @@ -0,0 +1,121 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +import time + +from typing import Any, Optional +import kfp_server_api +from kfp import Client +from data_processing.utils import get_logger + +logger = get_logger(__name__) + +class PipelinesUtils: + """ + Helper class for pipeline management + """ + + def __init__(self, host: str = "http://localhost:8080"): + """ + Initialization + :param host: host to connect to + """ + self.kfp_client = Client(host=host) + + def start_pipeline( + self, + pipeline: kfp_server_api.V2beta1Pipeline, + experiment: kfp_server_api.V2beta1Experiment, + params: Optional[dict[str, Any]], + ) -> str: + """ + Start a specified pipeline. 
+ :param pipeline: pipeline definition + :param experiment: experiment to use + :param params: pipeline parameters + :return: the id of the run object + """ + job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + try: + run_id = self.kfp_client.run_pipeline( + experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params + ) + logger.info("Pipeline submitted") + return run_id.id + except Exception as e: + logger.warning(f"Exception starting pipeline {e}") + return None + + def get_experiment_by_name(self, name: str = "Default") -> kfp_server_api.V2beta1Experiment: + """ + Get experiment by name + :param name: name + :return: experiment + """ + try: + return self.kfp_client.get_experiment(experiment_name=name) + except Exception as e: + logger.warning(f"Exception getting experiment {e}") + return None + + def get_pipeline_by_name(self, name: str, np: int = 100) -> kfp_server_api.V2beta1Pipeline: + """ + Given pipeline name, return the pipeline + :param name: pipeline name + :param np: page size for pipeline query. For large clusters with many pipelines, you might need to + increase this number + :return: pipeline + """ + try: + # Get all pipelines + pipelines = self.kfp_client.list_pipelines(page_size=np).pipelines + required = list(filter(lambda p: name in p.name, pipelines)) + if len(required) != 1: + logger.warning(f"Failure to get pipeline. Number of pipelines with name {name} is {len(required)}") + return None + return required[0] + + except Exception as e: + logger.warning(f"Exception getting pipeline {e}") + return None + + def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 600) -> tuple[str, str]: + """ + Waits for a pipeline run to complete + :param run_id: run id + :param timeout: timeout (sec) (-1 wait forever) + :param wait: internal wait (sec) + :return: Completion status and an error message if such exists + """ + try: + if timeout > 0: + end = time.time() + timeout + else: + end = 2**63 - 1 + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: + time.sleep(wait) + if (end - time.time()) < 0: + return "failed", f"Execution is taking too long" + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + logger.info(f"Got pipeline execution status {status}") + + if status.lower() in ["succeeded", "completed"]: + return status, "" + return status, run_details.run.error + + except Exception as e: + logger.warning(f"Failed waiting pipeline completion {e}") + return "failed", str(e) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipelines_tests_utils.py similarity index 78% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py rename to kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipelines_tests_utils.py index 1e7ff9cf7..a30003018 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -1,9 +1,23 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime + import os import sys from data_processing.utils import get_logger, str2bool -from . import PipelinesUtils +from kfp_support.workflow_support.pipeline_utils import PipelinesUtils logger = get_logger(__name__) From cbe07d2ec92e957134ac713be395e7df50ea48ba Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 29 May 2024 23:56:35 +0300 Subject: [PATCH 04/64] merge with dev Signed-off-by: Alexey Roytman --- .make.defaults | 9 + .make.versions | 3 +- kfp/doc/simple_transform_pipeline.md | 14 +- kfp/kfp_ray_components/Dockerfile | 2 - kfp/kfp_ray_components/Dockerfile_v2 | 25 + kfp/kfp_ray_components/Makefile | 33 +- .../kfp_support_lib_v2/README.md | 68 ++ .../kfp_support_lib_v2/pyproject.toml | 47 ++ .../kfp_support/api_server_client/README.md | 4 + .../kfp_support/api_server_client/__init__.py | 1 + .../api_server_client/kuberay_apis.py | 636 ++++++++++++++++++ .../api_server_client/params/__init__.py | 53 ++ .../api_server_client/params/cluster.py | 475 +++++++++++++ .../params/environmentvariables.py | 158 +++++ .../api_server_client/params/headnode.py | 202 ++++++ .../api_server_client/params/jobsubmission.py | 163 +++++ .../api_server_client/params/templates.py | 224 ++++++ .../api_server_client/params/volumes.py | 449 +++++++++++++ .../api_server_client/params/workernode.py | 206 ++++++ .../kfp_support/workflow_support/README.md | 45 ++ .../compile_utils/__init__.py | 3 + .../compile_utils/component.py | 101 +++ .../runtime_utils/__init__.py | 2 + .../runtime_utils/kfp_utils.py | 113 ++++ .../runtime_utils/remote_jobs_utils.py | 527 +++++++++++++++ .../kfp_support/workflow_support_v2/README.md | 36 + .../workflow_support_v2/__init__.py | 0 .../comp_utils/__init__.py | 3 + .../comp_utils/component.py | 0 .../workflow_support_v2/utils/__init__.py | 8 + .../utils/workflow_utils.py | 557 +++++++++++++++ .../src/create_ray_cluster.py | 2 +- .../src/delete_ray_cluster.py | 2 +- kfp/kfp_ray_components/src/execute_ray_job.py | 2 +- .../src/execute_ray_job_multi_s3.py | 2 +- kfp/kfp_ray_components/src/subworkflow.py | 2 +- .../workflow_support/comp_utils/__init__.py | 3 - .../pipeline_utils/__init__.py | 3 - .../pipeline_utils/pipeline_utils.py | 121 ---- .../pipelines_tests_utils.py | 16 +- .../test/pipeline_utils_test.py | 2 +- kfp/kfp_support_lib_v2/Makefile | 63 ++ kfp/kfp_support_lib_v2/README.md | 68 ++ .../doc/kfp_support_library.md | 10 + kfp/kfp_support_lib_v2/pyproject.toml | 47 ++ .../kfp_support/api_server_client/README.md | 4 + .../kfp_support/api_server_client/__init__.py | 1 + .../api_server_client/kuberay_apis.py | 636 ++++++++++++++++++ .../api_server_client/params/__init__.py | 53 ++ .../api_server_client/params/cluster.py | 475 +++++++++++++ .../params/environmentvariables.py | 158 +++++ .../api_server_client/params/headnode.py | 202 ++++++ .../api_server_client/params/jobsubmission.py | 163 +++++ .../api_server_client/params/templates.py | 224 ++++++ .../api_server_client/params/volumes.py | 449 +++++++++++++ .../api_server_client/params/workernode.py | 206 ++++++ 
.../kfp_support/workflow_support/README.md | 45 ++ .../compile_utils/__init__.py | 3 + .../compile_utils/component.py | 101 +++ .../runtime_utils/__init__.py | 2 + .../runtime_utils/kfp_utils.py | 113 ++++ .../runtime_utils/remote_jobs_utils.py | 527 +++++++++++++++ .../kfp_support/workflow_support_v2/README.md | 36 + .../workflow_support_v2/__init__.py | 0 .../comp_utils/__init__.py | 3 + .../comp_utils/component.py | 54 ++ .../workflow_support_v2/utils/__init__.py | 8 + .../utils/workflow_utils.py | 557 +++++++++++++++ .../test/api_params_test.py | 433 ++++++++++++ kfp/kfp_support_lib_v2/test/configmaps.py | 72 ++ .../test/kuberay_api_test.py | 297 ++++++++ .../test/ray_remote_jobs_test.py | 90 +++ .../kfp_v1/superworkflow_dedups_sample_wf.py | 2 +- transforms/code/code_quality/Makefile | 10 +- .../kfp_ray/v1/code_quality_wf.py | 2 +- .../code/code_quality/kfp_ray/v2/Makefile | 25 + .../kfp_ray/v2/code_quality_wf.py | 174 +++++ transforms/code/malware/Makefile | 11 +- .../code/malware/kfp_ray/v1/malware_wf.py | 2 +- transforms/code/proglang_select/Makefile | 10 +- .../kfp_ray/v1/proglang_select_wf.py | 2 +- .../code/proglang_select/kfp_ray/v2/Makefile | 25 + .../kfp_ray/v2/proglang_select_wf.py | 165 +++++ transforms/universal/doc_id/Makefile | 10 +- .../universal/doc_id/kfp_ray/v1/doc_id_wf.py | 2 +- .../universal/doc_id/kfp_ray/v2/Makefile | 25 + .../universal/doc_id/kfp_ray/v2/doc_id_wf.py | 163 +++++ transforms/universal/ededup/Makefile | 10 +- .../universal/ededup/kfp_ray/v1/ededup_wf.py | 2 +- .../v1/src/ededup_compute_execution_params.py | 2 +- .../universal/ededup/kfp_ray/v2/Makefile | 25 + .../universal/ededup/kfp_ray/v2/ededup_wf.py | 165 +++++ .../v2/src/ededup_compute_execution_params.py | 98 +++ transforms/universal/fdedup/Makefile | 10 +- .../universal/fdedup/kfp_ray/v1/fdedup_wf.py | 2 +- .../v1/src/fdedup_compute_execution_params.py | 2 +- .../universal/fdedup/kfp_ray/v2/Makefile | 25 + .../universal/fdedup/kfp_ray/v2/fdedup_wf.py | 216 ++++++ .../v2/src/fdedup_compute_execution_params.py | 178 +++++ transforms/universal/filter/Makefile | 10 +- .../universal/filter/kfp_ray/v1/filter_wf.py | 2 +- .../universal/filter/kfp_ray/v2/Makefile | 25 + .../universal/filter/kfp_ray/v2/filter_wf.py | 167 +++++ transforms/universal/noop/Makefile | 6 +- .../noop/kfp_ray/v1/noop_multiple_wf.py | 2 +- .../universal/noop/kfp_ray/v1/noop_wf.py | 2 +- transforms/universal/noop/kfp_ray/v2/Makefile | 32 + .../universal/noop/kfp_ray/v2/noop_wf.py | 164 +++++ transforms/universal/tokenization/Makefile | 10 +- .../kfp_ray/v1/tokenization_wf.py | 2 +- .../tokenization/kfp_ray/v2/Makefile | 25 + .../kfp_ray/v2/tokenization_wf.py | 171 +++++ 112 files changed, 11179 insertions(+), 224 deletions(-) create mode 100644 kfp/kfp_ray_components/Dockerfile_v2 create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py create mode 100644 
kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py rename kfp/{kfp_support_lib/src/kfp_support/workflow_support => kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2}/comp_utils/component.py (100%) create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py rename kfp/kfp_support_lib/src/kfp_support/workflow_support/{pipeline_utils => utils}/pipelines_tests_utils.py (78%) create mode 100644 kfp/kfp_support_lib_v2/Makefile create mode 100644 kfp/kfp_support_lib_v2/README.md create mode 100644 kfp/kfp_support_lib_v2/doc/kfp_support_library.md create mode 100644 kfp/kfp_support_lib_v2/pyproject.toml create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py create mode 
100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py create mode 100644 kfp/kfp_support_lib_v2/test/api_params_test.py create mode 100644 kfp/kfp_support_lib_v2/test/configmaps.py create mode 100644 kfp/kfp_support_lib_v2/test/kuberay_api_test.py create mode 100644 kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py create mode 100644 transforms/code/code_quality/kfp_ray/v2/Makefile create mode 100644 transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py create mode 100644 transforms/code/proglang_select/kfp_ray/v2/Makefile create mode 100644 transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py create mode 100644 transforms/universal/doc_id/kfp_ray/v2/Makefile create mode 100644 transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py create mode 100644 transforms/universal/ededup/kfp_ray/v2/Makefile create mode 100644 transforms/universal/ededup/kfp_ray/v2/ededup_wf.py create mode 100644 transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py create mode 100644 transforms/universal/fdedup/kfp_ray/v2/Makefile create mode 100644 transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py create mode 100644 transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py create mode 100644 transforms/universal/filter/kfp_ray/v2/Makefile create mode 100644 transforms/universal/filter/kfp_ray/v2/filter_wf.py create mode 100644 transforms/universal/noop/kfp_ray/v2/Makefile create mode 100644 transforms/universal/noop/kfp_ray/v2/noop_wf.py create mode 100644 transforms/universal/tokenization/kfp_ray/v2/Makefile create mode 100644 transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py diff --git a/.make.defaults b/.make.defaults index 1d9d8d890..d1d065015 100644 --- a/.make.defaults +++ b/.make.defaults @@ -53,6 +53,15 @@ KIND_CLUSTER_NAME=dataprep DPK_PYTHON_LIB_DIR=$(REPOROOT)/data-processing-lib/python DPK_RAY_LIB_DIR=$(REPOROOT)/data-processing-lib/ray DPK_SPARK_LIB_DIR=$(REPOROOT)/data-processing-lib/spark + +KFPv2?=1 + +ifeq ($(KFPv2), 0) + PIPELINE_PATH="kfp_ray/v1" +else + PIPELINE_PATH="kfp_ray/v2" +endif + 
####################################################################################### # Lists all targets and optional help text found in the target. # Adapted from https://stackoverflow.com/a/65243296/45375 diff --git a/.make.versions b/.make.versions index 5933c5b2b..070fc9519 100644 --- a/.make.versions +++ b/.make.versions @@ -7,6 +7,7 @@ # Data prep lab wheel version DPK_LIB_VERSION=0.2.0 DPK_LIB_KFP_VERSION=0.2.0 +DPK_LIB_KFP_VERSION_v2=0.1.1-dev1 # Begin transform versions/tags BLOCKLIST_VERSION=0.4.0 @@ -27,5 +28,5 @@ CODE_QUALITY_VERSION=0.4.0 DOC_QUALITY_VERSION=0.4.0 INGEST_TO_PARQUET_VERSION=0.4.0 -KFP_DOCKER_VERSION_v2=0.1.0-v2 +KFP_DOCKER_VERSION_v2=0.1.1 KFP_DOCKER_VERSION=0.2.0 diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index 220702cbc..539d3cdf5 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -34,16 +34,16 @@ Note: the project and the explanation below are based on [KFPv1](https://www.kub * Pipeline wiring - definition of the sequence of invocation (with parameter passing) of participating components * Additional configuration -### Imports definition +### Imports definition ```python import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, ) from kubernetes import client as k8s_client ``` @@ -73,8 +73,8 @@ Ray cluster. For each step we have to define a component that will execute them: Note: here we are using shared components described in this [document](../kfp_ray_components/README.md) for `create_ray_op`, `execute_ray_jobs_op` and `cleanup_ray_op`, while `compute_exec_params_op` component is built inline, because it might differ significantly. For "simple" pipeline cases we can use the -[default implementation](../kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py), -while, for example for exact dedup, we are using a very [specialized one](../transform_workflows/universal/ededup/src/ededup_compute_execution_params.py). +[default implementation](../kfp_support_lib/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py), +while, for example for exact dedup, we are using a very [specialized one](../../transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py). 
### Input parameters definition diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 23e1c14d8..69f9f0d67 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -8,8 +8,6 @@ LABEL git-commit=$GIT_COMMIT # install libraries COPY requirements.txt requirements.txt -RUN pip install kfp==2.7.0 --extra-index-url https://pypi.org/simple -RUN pip install kfp-kubernetes --extra-index-url https://pypi.org/simple RUN pip install --no-cache-dir -r requirements.txt # Copy and install data processing libraries diff --git a/kfp/kfp_ray_components/Dockerfile_v2 b/kfp/kfp_ray_components/Dockerfile_v2 new file mode 100644 index 000000000..922ac070e --- /dev/null +++ b/kfp/kfp_ray_components/Dockerfile_v2 @@ -0,0 +1,25 @@ +FROM docker.io/rayproject/ray:2.9.3-py310 + +ARG BUILD_DATE +ARG GIT_COMMIT + +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT + +# install libraries +COPY requirements.txt requirements.txt +RUN pip install kfp==2.7.0 --extra-index-url https://pypi.org/simple +RUN pip install kfp-kubernetes --extra-index-url https://pypi.org/simple +RUN pip install --no-cache-dir -r requirements.txt + +# install data processing and kfp support libs +# Copy in the frameworks source/project and install them +# This is expected to be placed in the docker context before this is run (see the make image). +COPY --chown=ray:users data-processing-lib/ data-processing-lib/ +RUN cd data-processing-lib && pip install --no-cache-dir -e . +COPY --chown=ray:users kfp_support_lib_v2/ kfp_support_lib_v2/ +RUN cd kfp_support_lib_v2 && pip install --no-cache-dir -e . +# remove credentials-containing file +RUN rm requirements.txt +# components +COPY ./src /pipelines/component/src diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 129051a3d..30ef36f5a 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -2,16 +2,30 @@ # # know where they are running from. REPOROOT=../.. +# Include the common rules. +# Use "make help" to see them. +include $(REPOROOT)/.make.defaults + IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed 's/=/:=/' | sed 's/^/export /' > makeenv") include makeenv + +ifeq ($(KFPv2), 0) DOCKER_FILE=Dockerfile DOCKER_IMAGE_NAME=kfp-data-processing DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION} +KFP_SUPPORT_LIB=kfp_support_lib +else +DOCKER_FILE=Dockerfile_v2 +DOCKER_IMAGE_NAME=kfp-data-processing_v2 +DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION_v2} +KFP_SUPPORT_LIB=kfp_support_lib_v2 +endif + + +#DOCKER_IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_VERSION} +DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) -# Include the common rules. -# Use "make help" to see them. 
-include $(REPOROOT)/.make.defaults .PHONY: .lib-src-image .lib-src-image:: @@ -24,18 +38,19 @@ include $(REPOROOT)/.make.defaults rm -rf kfp_support_lib .PHONY: image -image: Dockerfile requirements.txt +image: Dockerfile Dockerfile_v2 requirements.txt $(MAKE) reconcile-requirements $(MAKE) .lib-src-image .PHONY: reconcile-requirements reconcile-requirements: @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml - sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml + # TODO remove it for KFPv2 + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml .PHONY: load-image load-image: diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/README.md new file mode 100644 index 000000000..86f3f4360 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/README.md @@ -0,0 +1,68 @@ +# KFP support library + +This provides support for implementing KFP pipelines automating transform's execution. +It comprises 2 main modules + +* [api server client](src/kfp_support/api_server_client/README.md) +* [workflow support](src/kfp_support/workflow_support/README.md) + +## Development + +### Requirements +1. python 3.10 or later +2. git command line tools +3. [pre-commit](https://pre-commit.com/) +4. twine (pip install twine) + * but on Mac you may have to include a dir in your PATH, such as `export PATH=$PATH:/Library/Frameworks/Python.framework/Versions/3.10/bin` + +### Git +Simply clone the repo and set up the pre-commit hooks. +```shell +git clone git@github.com:IBM/data-prep-kit.git +cd kfp/kfp_support_lib +pre-commit install +``` +If you don't have pre-commit, you can install it from [here](https://pre-commit.com/) + +## Library Artifact Build and Publish + +The process of creating a release for the `fm_data_processing_kfp` package involves the following steps: + +cd to the package directory. + +update the version in the [requirements.env](../requirements.env) file. + +run `make build` and `make publish`. + +## Testing + +To run the package tests, perform the following: + +To begin with, establish a Kind cluster and deploy all required components by executing the Makefile command in the main directory of this repository. As an alternative, you can manually execute the instructions provided in the [README.md](../../kind/README.md) file. 
+ +```bash +make setup +``` + +The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. + +```bash +make build +``` + +Lastly, execute the tests: + +```bash +make test +``` + +### Cleanup + +It is advisable to execute the following command prior to running `make test` once more. This will ensure that any +previous test runs' resources are removed before starting new tests. + +```bash +kubectl delete workflows -n kubeflow --all +``` + + diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml b/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml new file mode 100644 index 000000000..f995d60d7 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml @@ -0,0 +1,47 @@ +[project] +name = "data_prep_toolkit_kfp_v2" +version = "0.1.1" +requires-python = ">=3.10" +description = "Data Preparation Kit Library. KFP v2 support" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Alexey Roytman", email = "roytman@il.ibm.com" }, + { name = "Mohammad Nassar", email = "Mohammad.Nassar@ibm.com" }, + { name = "Revital Eres", email = "eres@il.ibm.com" }, +] +dependencies = [ + "kfp==2.7.0", + "kfp-kubernetes==1.2.0", + "requests", + "data-prep-toolkit==0.1.1", +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", +] + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/kfp_support"] + +[tool.pytest.ini_options] +addopts = "--cov --cov-report term-missing --cov-fail-under 10" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md new file mode 100644 index 000000000..423f743a1 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md @@ -0,0 +1,4 @@ +# KubeRay API server APIs + +This is a copy of the [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client). +Because these APIs are not published on PyPI, we added them to this project \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py new file mode 100644 index 000000000..60cbbc2f2 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py @@ -0,0 +1 @@ +from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py new file mode 100644 index 000000000..270815e77 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py @@ -0,0 +1,636 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time + +import requests +from data_processing.utils import get_logger +from kfp_support.api_server_client.params import ( + Cluster, + RayJobInfo, + RayJobRequest, + Template, + cluster_decoder, + clusters_decoder, + template_decoder, + templates_decoder, +) + + +logger = get_logger(__name__) + + +_headers = {"Content-Type": "application/json", "accept": "application/json"} + +CONNECT_TIMEOUT = 50 +READ_TIMEOUT = 50 +TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) + + +class KubeRayAPIs: + """ + This class implements KubeRay APIs based on the API server. + To create a class, the following parameters are required: + base - the URL of the API server (default is set to the standalone API server) + wait interval - the amount of sec to wait between checking for cluster ready + """ + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + token: str = None, + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initializer + :param server_url: API server url - default assuming running it inside the cluster + :param token: token, only used for API server with security enabled + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.server_url = server_url + if token is not None: + _headers["Authorization"] = token + self.wait_interval = wait_interval + self.api_base = "/apis/v1/" + self.http_retries = http_retries + + def list_compute_templates(self) -> tuple[int, str, list[Template]]: + """ + List compute templates across all namespaces of the k8 cluster + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + "compute_templates" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, templates_decoder(response.json()) + else: + logger.warning(f"Failed to list compute templates, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list compute templates, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Template]]: + """ + List compute templates across for a given namespaces of the k8 cluster + :param ns: namespace to query + :return: return tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates" + for i in 
range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, templates_decoder(response.json()) + else: + logger.warning( + f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list compute templates for namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def get_compute_template(self, ns: str, name: str) -> tuple[int, str, Template]: + """ + get a compute template + :param ns: namespace + :param name: template name + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, template_decoder(response.json()) + else: + logger.warning( + f"Failed to get compute template {name} for namespace {ns}, status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get compute template {name} for namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def create_compute_template(self, template: Template) -> tuple[int, str]: + """ + Create a compute template + :param template - definition of a template + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{template.namespace}/compute_templates" + for i in range(self.http_retries): + try: + response = requests.post(url, json=template.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning(f"Failed to create compute template, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to create compute template, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def delete_compute_template(self, ns: str, name: str) -> tuple[int, str]: + """ + delete a compute template + :param ns: namespace + :param name: template name + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + elif response.status_code == 404: + # not found - no need to retry + return response.status_code, response.json()["message"] + else: + logger.warning(f"Failed to delete compute template, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] 
+ except Exception as e: + logger.warning(f"Failed to delete compute template, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def list_clusters(self) -> tuple[int, str, list[Cluster]]: + """ + List clusters across all namespaces of the k8 cluster + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of clusters + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + "clusters" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, clusters_decoder(response.json()) + else: + logger.warning(f"Failed to list cluster, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list cluster, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_clusters_namespace(self, ns: str) -> tuple[int, str, list[Cluster]]: + """ + List clusters across for a given namespaces of the k8 cluster + :param ns: namespace to query + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of clusters + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, clusters_decoder(response.json()) + else: + logger.warning(f"Failed to list clusters in namespace {ns}, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list clusters in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def get_cluster(self, ns: str, name: str) -> tuple[int, str, Cluster]: + """ + get cluster + :param ns: namespace + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + clusters definition + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, cluster_decoder(response.json()) + else: + logger.warning(f"Failed to get cluster {name} in namespace {ns}, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def create_cluster(self, cluster: Cluster) -> tuple[int, str]: + """ + create cluster + :param cluster: cluster definition + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{cluster.namespace}/clusters" + for i in 
range(self.http_retries): + try: + response = requests.post(url, json=cluster.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning(f"Failed to create cluster , status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to create cluster , exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def get_cluster_status(self, ns: str, name: str) -> tuple[int, str, str]: + """ + get cluster status + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + cluster status + """ + # Execute HTTP request + status, error, cluster = self.get_cluster(ns=ns, name=name) + # Check execution status + if status // 100 != 2: + return status, error, None + cluster_status = "creating" + if cluster.cluster_status is not None: + cluster_status = cluster.cluster_status + return status, None, cluster_status + + def wait_cluster_ready(self, ns: str, name: str, wait: int = -1) -> tuple[int, str]: + """ + wait for cluster to be ready + :param ns: namespace of the cluster + :param name: name of the cluster + :param wait: wait time (-1 waits forever) + :returns: A tuple containing + http return code + message - only returned if http return code is not equal to 200 + cluster status + """ + current_wait = 0 + while True: + status, error, c_status = self.get_cluster_status(ns=ns, name=name) + # Check execution status + if status // 100 != 2: + return status, error + if c_status == "ready": + return status, None + if current_wait > wait > 0: + return 408, f"Timed out waiting for cluster ready in {current_wait} sec" + time.sleep(self.wait_interval) + current_wait += self.wait_interval + + def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int, str, str]: + """ + get cluster endpoint + :param ns: namespace of the cluster + :param name: name of the cluster + :param wait: wait time (-1 waits forever) for cluster to be ready + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + endpoint (service for dashboard endpoint) + """ + # Ensure that the cluster is ready + status, error = self.wait_cluster_ready(ns=ns, name=name, wait=wait) + if status // 100 != 2: + return status, error, None + # Get cluster + status, error, cluster = self.get_cluster(ns=ns, name=name) + if status // 100 != 2: + return status, error, None + return status, None, f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" + + def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: + """ + delete cluster + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers) + if response.status_code // 100 == 2: + return response.status_code, None + elif response.status_code == 404: + # not found - no need to retry + return response.status_code, response.json()["message"] + else: + logger.warning(f"Failed to delete cluster , status : 
{response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to delete cluster , exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def submit_job(self, ns: str, name: str, job_request: RayJobRequest) -> tuple[int, str, str]: + """ + submit Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param job_request: job submission + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" + for i in range(self.http_retries): + try: + response = requests.post(url, json=job_request.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, response.json()["submissionId"] + else: + logger.warning( + f"Failed to submit job to the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to submit job to the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(5) + return status, message, None + + def get_job_info(self, ns: str, name: str, sid: str) -> tuple[int, str, RayJobInfo]: + """ + get Ray job details + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + RayJobInfo object + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, RayJobInfo(response.json()) + else: + logger.warning( + f"Failed to get job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]]: + """ + list Ray job details + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of RayJobInfo object + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + job_info_array = response.json().get("submissions", None) + if job_info_array is None: + return response.status_code, None, [] + else: + return response.status_code, None, [RayJobInfo(i) for i in job_info_array] + else: + logger.warning( + f"Failed to list jobs from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status 
= response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list jobs from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(5) + return status, message, [] + + def get_job_log(self, ns: str, name: str, sid: str) -> tuple[int, str, str]: + """ + get Ray job log + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + log + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/log/{sid}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, response.json().get("log", "") + else: + logger.warning( + f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning( + f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, exception : {e}" + ) + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def stop_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: + """ + stop Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.post(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning( + f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def delete_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: + """ + delete Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning( + f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return 
status, message diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py new file mode 100644 index 000000000..e5a7d70fa --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py @@ -0,0 +1,53 @@ +from kfp_support.api_server_client.params.templates import ( + TolerationOperation, + TolerationEffect, + Toleration, + Template, + toleration_decoder, + template_decoder, + templates_decoder, +) +from kfp_support.api_server_client.params.volumes import ( + HostPath, + MountPropagationMode, + AccessMode, + BaseVolume, + HostPathVolume, + PVCVolume, + EphemeralVolume, + EmptyDirVolume, + ConfigMapVolume, + SecretVolume, + volume_decoder, +) +from kfp_support.api_server_client.params.environmentvariables import ( + EnvVarSource, + EnvVarFrom, + EnvironmentVariables, + env_var_from_decoder, + environment_variables_decoder, +) +from kfp_support.api_server_client.params.headnode import ( + ServiceType, + HeadNodeSpec, + DEFAULT_HEAD_START_PARAMS, + head_node_spec_decoder, +) +from kfp_support.api_server_client.params.workernode import ( + WorkerNodeSpec, + DEFAULT_WORKER_START_PARAMS, + worker_node_spec_decoder, +) +from kfp_support.api_server_client.params.cluster import ( + Environment, + AutoscalerOptions, + ClusterSpec, + ClusterEvent, + Cluster, + UpscalingMode, + autoscaling_decoder, + cluster_spec_decoder, + cluster_decoder, + clusters_decoder, +) +from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py new file mode 100644 index 000000000..922a14bef --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py @@ -0,0 +1,475 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
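# A minimal usage sketch of the job-submission calls above, assuming the surrounding
# client class is exported as KubeRayAPIs and accepts the API server URL in its
# constructor -- neither name appears in this hunk, so both are assumptions.
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import RayJobRequest

apis = KubeRayAPIs(server_url="http://localhost:31888")  # assumed constructor parameter
job = RayJobRequest(entrypoint="python -c 'import ray; ray.init()'")
status, error, sid = apis.submit_job(ns="default", name="my-cluster", job_request=job)
if status // 100 == 2:
    status, error, info = apis.get_job_info(ns="default", name="my-cluster", sid=sid)
    status, error, log = apis.get_job_log(ns="default", name="my-cluster", sid=sid)
    status, error = apis.stop_ray_job(ns="default", name="my-cluster", sid=sid)
    status, error = apis.delete_ray_job(ns="default", name="my-cluster", sid=sid)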
+################################################################################ + +import enum +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + HeadNodeSpec, + WorkerNodeSpec, + environment_variables_decoder, + head_node_spec_decoder, + volume_decoder, + worker_node_spec_decoder, +) + + +class Environment(enum.Enum): + """ + Environment definitions + """ + + DEV = 0 # development + TESTING = 1 # testing + STAGING = 2 # staging + PRODUCTION = 3 # production + + +class UpscalingMode(enum.Enum): + """ + Enumeration of autoscaling mode + """ + + Conservative = ( + "Conservative" # Rate-limited; the number of pending worker pods is at most the size of the Ray cluster + ) + Default = "Default" # no rate limitations + Aggressive = "Aggressive" # same as default + + +class AutoscalerOptions: + """ + AutoscalerOptions is used to define Ray cluster autoscaling. + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create autoscaling options specification: gets the following parameters: + idle_timeout - optional, number of seconds to wait before scaling down a worker pod which is not using Ray + resources. Default 60sec (one minute). + upscaling_mode - required autoscaler upscaling mode + image - optional, allows to override the autoscaler's container image + image_pull_policy - optional, allows to override the autoscaler's container image pull policy + cpus - optional, CPUs requirements for autoscaler - default "500m" + memory - optional, memory requirements for autoscaler - default "512Mi" + environment - optional, environment variables for autoscaler container + volumes - optional, a list of volumes to attach to autoscaler container. + This is needed for enabling TLS for the autoscaler container. 
+ """ + + def __init__( + self, + upscaling_mode: UpscalingMode = UpscalingMode.Default, + idle_tmout: int = None, + image: str = None, + image_pull_policy: str = None, + cpus: str = None, + memory: str = None, + environment: EnvironmentVariables = None, + volumes: list[BaseVolume] = None, + ): + """ + Initialization + :param upscaling_mode: upscale mode + :param idle_tmout: idle timeout + :param image: image + :param image_pull_policy: image pull policy + :param cpus: cpu requirement for autoscaling + :param memory: memory requirement for autoscaling + :param environment: autoscaler environment + :param volumes: volumes for autoscaler + """ + self.upscaling_mode = upscaling_mode + self.idle_tmout = idle_tmout + self.image = image + self.image_pull_policy = image_pull_policy + self.cpus = cpus + self.memory = memory + self.environment = environment + self.volumes = volumes + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of the head node + """ + val = f"upscaling_mode = {self.upscaling_mode}" + if self.idle_tmout is not None: + val += f", idle_timeout = {self.idle_tmout}" + if self.image is not None: + val += f", image = {self.image}" + if self.image_pull_policy is not None: + val += f", image_pull_policy = {self.image_pull_policy}" + if self.cpus is not None: + val += f", cpus = {self.cpus}" + if self.memory is not None: + val += f", memory = {self.memory}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of the head node + """ + dct = {"upscalingMode": self.upscaling_mode.value} + if self.idle_tmout is not None: + dct["idleTimeoutSeconds"] = self.idle_tmout + if self.image is not None: + dct["image"] = self.image + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.cpus is not None: + dct["cpu"] = self.cpus + if self.memory is not None: + dct["memory"] = self.memory + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["envs"] = self.environment.to_dict() + return dct + + +class ClusterSpec: + """ + ClusterSpec is used to define Ray cluster. + It provides APIs to create, stringify, convert to dict and json. 
+ + Methods: + - Create cluster spec from: gets the following parameters: + head_group_spec - required, specification of the head node + worker_group_spec - optional, list of worker group specs + autoscaler_options - optional, autoscaling options + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__( + self, + head_node: HeadNodeSpec, + worker_groups: list[WorkerNodeSpec] = None, + autoscaling_options: AutoscalerOptions = None, + ): + """ + Initialization + :param head_node - head node definition + :param worker_groups - worker group definition + :param autoscaling_options - autoscaler options + """ + self.head_node = head_node + self.worker_groups = worker_groups + self.autoscaling_options = autoscaling_options + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of cluster spec + """ + val = f"head_group_spec: {self.head_node.to_string()}" + if self.worker_groups is not None: + val += "\nworker groups: " + for w in self.worker_groups: + val += f"\nworker_group_spec = {w.to_string()}]" + if self.autoscaling_options is not None: + val += f"\nautoscaling options = {self.autoscaling_options.to_string()}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: Dictionary representation of cluster spec + """ + dst = {"headGroupSpec": self.head_node.to_dict()} + if self.worker_groups is not None: + dst["workerGroupSpec"] = [w.to_dict() for w in self.worker_groups] + if self.autoscaling_options is not None: + dst["enableInTreeAutoscaling"] = True + dst["autoscalerOptions"] = self.autoscaling_options.to_dict() + return dst + + +class ClusterEvent: + """ + Cluster event is used to define events emitted during cluster creation. + It provides APIs to create and stringify. Its output only data, so we do not need to implement to_dict + + Methods: + - Create event: gets the dictionary with the following parameters: + id - unique Event Id + name - human readable event name + created_at - event creation time + first_timestamp - first time the event occur + last_timestamp - last time the event occur + reason - reason for the transition into the object's current status + message - human-readable description of the status of this operation + type - type of this event (Normal, Warning), new types could be added in the future + count - number of times this event has occurred + """ + + def __init__(self, dst: dict[str, Any]): + """ + Initialization from dictionary + :param dst: dictionary representation of cluster event + """ + self.id = dst.get("id", "") + self.name = dst.get("name", "") + self.created_at = dst.get("created_at", "") + self.first_timestamp = dst.get("first_timestamp", "") + self.last_timestamp = dst.get("last_timestamp", "") + self.reason = dst.get("reason", "") + self.message = dst.get("message", "") + self.type = dst.get("type", "") + self.count = dst.get("count", "0") + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of cluster event + """ + return ( + f"id = {self.id}, name = {self.name}, created_at = {self.created_at}, " + f"first_timestamp = {self.first_timestamp}, last_timestamp = {self.last_timestamp}," + f"reason = {self.reason}, message = {self.message}, type = {self.type}, count = {self.count}" + ) + + +class Cluster: + """ + Cluster is used to define Ray cluster. + It provides APIs to create, stringify, convert to dict and json. 
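# An illustrative sketch assembling the ClusterSpec above from the head and worker node
# specs defined later in this patch; template and image names are placeholders.
from kfp_support.api_server_client.params import ClusterSpec, HeadNodeSpec, WorkerNodeSpec

spec = ClusterSpec(
    head_node=HeadNodeSpec(compute_template="default-template", image="rayproject/ray:2.9.3-py310"),
    worker_groups=[
        WorkerNodeSpec(
            group_name="workers",
            compute_template="default-template",
            image="rayproject/ray:2.9.3-py310",
            max_replicas=3,
            replicas=2,
            min_replicas=1,
        )
    ],
)
# to_dict() builds the {"headGroupSpec": ..., "workerGroupSpec": [...]} payload sent to the API server
payload = spec.to_dict()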
+ + Methods: + - Create env variable from: gets the following parameters: + name - required, unique (per namespace) cluster name + namespace - required, cluster's namespace (should exist) + user - required, user who owns the cluster + version - required, Ray cluster version - typically Ray version + deployment_environment - optional (see Environment) + cluster_spec - required, ray cluster configuration + annotations - optional, annotations, for example, "kubernetes.io/ingress.class" to define Ingress class + cluster_environment - optional, cluster environment variables + created_at - output, cluster creation ts + deleted_at - output, cluster deletion ts + cluster_status - output, cluster status + events - output, cluster events + service_endpoint - output, cluster service endpoints + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__( + self, + name: str, + namespace: str, + user: str, + version: str, + cluster_spec: ClusterSpec, + deployment_environment: Environment = None, + annotations: dict[str, str] = None, + cluster_environment: EnvironmentVariables = None, + created_at: str = None, + deleted_at: str = None, + cluster_status: str = None, + events: list[ClusterEvent] = None, + service_endpoint: dict[str, str] = None, + ): + """ + Initialization + :param name: cluster name + :param namespace: cluster namespace + :param user: user name + :param version: version + :param cluster_spec: cluster spec + :param deployment_environment: cluster deployment environment + :param annotations: cluster annotations + :param cluster_environment: cluster environment + :param created_at: created at + :param deleted_at: deleted at + :param cluster_status: status + :param events: cluster events + :param service_endpoint: service endpoint + """ + self.name = name + self.namespace = namespace + self.user = user + self.version = version + self.cluster_spec = cluster_spec + self.environment = deployment_environment + self.annotations = annotations + self.envs = cluster_environment + self.created_at = created_at + self.deleted_at = deleted_at + self.cluster_status = cluster_status + self.events = events + self.service_endpoint = service_endpoint + + def to_string(self) -> str: + """ + convert to string representation + :return: string representation of cluster + """ + val = ( + f"name: {self.name}, namespace = {self.namespace}, user = {self.user}, version = {self.version} " + f"cluster_spec = {self.cluster_spec.to_string()}" + ) + if self.environment is not None: + val += f"deployment environment = {self.environment.name}" + if self.annotations is not None: + val += f" ,annotations = {str(self.annotations)}" + if self.envs is not None: + val = val + f",cluster environment = {self.envs.to_string()}" + val += "\ncluster output\n" + if self.created_at is not None: + val += f" ,created_at = {self.created_at}" + if self.deleted_at is not None: + val += f" ,deleted_at = {self.deleted_at}" + if self.cluster_status is not None: + val += f" ,cluster status = {self.cluster_status}" + if self.events is not None: + val = val + ",\n cluster events = [" + first = True + for e in self.events: + if first: + first = False + else: + val += ", " + val = val + "{" + e.to_string() + "}" + val = val + "]" + if self.service_endpoint is not None: + val += f" ,service endpoints = {str(self.service_endpoint)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + convert to dictionary + :return: dictionary representation of cluster + """ + # only 
convert input variables + dst = { + "name": self.name, + "namespace": self.namespace, + "user": self.user, + "version": self.version, + "clusterSpec": self.cluster_spec.to_dict(), + } + if self.environment is not None: + dst["environment"] = self.environment.value + if self.annotations is not None: + dst["annotations"] = self.annotations + if self.envs is not None: + dst["envs"] = self.envs.to_dict() + return dst + + +""" + Creates new cluster from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def autoscaling_decoder(dct: dict[str, Any]) -> AutoscalerOptions: + """ + Create autoscaling options from its dictionary representation + :param dct: dictionary representation of cluster spec + :return: autoscaling options + """ + upscaling_mode = UpscalingMode.Default + if "upscalingMode" in dct: + upscaling_mode = UpscalingMode(dct.get("upscalingMode")) + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("envs")) > 0: + environments = environment_variables_decoder(dct.get("envs")) + return AutoscalerOptions( + upscaling_mode=upscaling_mode, + idle_tmout=dct.get("idleTimeoutSeconds", None), + image=dct.get("image", None), + image_pull_policy=dct.get("imagePullPolicy", None), + cpus=dct.get("cpu", None), + memory=dct.get("memory", None), + environment=environments, + volumes=volumes, + ) + + +def cluster_spec_decoder(dct: dict[str, Any]) -> ClusterSpec: + """ + Create cluster spec from its dictionary representation + :param dct: dictionary representation of cluster spec + :return: cluster spec + """ + workers = None + autoscaling_options = None + if "workerGroupSpec" in dct: + workers = [worker_node_spec_decoder(w) for w in dct["workerGroupSpec"]] + if "enableInTreeAutoscaling" in dct and dct.get("enableInTreeAutoscaling"): + autoscaling_options = autoscaling_decoder(dct.get("autoscalerOptions", {})) + return ClusterSpec( + head_node=head_node_spec_decoder(dct.get("headGroupSpec")), + worker_groups=workers, + autoscaling_options=autoscaling_options, + ) + + +def cluster_decoder(dct: dict[str, Any]) -> Cluster: + """ + Create cluster from its dictionary representation + :param dct: dictionary representation of cluster + :return: cluster + """ + environment = None + if "environment" in dct: + environment = Environment(int(dct.get("environment", "0"))) + events = None + if "events" in dct: + events = [ClusterEvent(c) for c in dct["events"]] + envs = None + if "envs" in dct: + envs = environment_variables_decoder(dct.get("envs")) + return Cluster( + name=dct.get("name", ""), + namespace=dct.get("namespace", ""), + user=dct.get("user", ""), + version=dct.get("version", ""), + cluster_spec=cluster_spec_decoder(dct.get("clusterSpec")), + deployment_environment=environment, + annotations=dct.get("annotations"), + cluster_environment=envs, + created_at=dct.get("createdAt"), + deleted_at=dct.get("deletedAt"), + cluster_status=dct.get("clusterState"), + events=events, + service_endpoint=dct.get("serviceEndpoint"), + ) + + +def clusters_decoder(dct: dict[str, any]) -> list[Cluster]: + """ + Create list of clusters from its dictionary representation + :param dct: dictionary representation of a list of clusters + :return: list of clusters + """ + return [cluster_decoder(cluster) for cluster in dct["clusters"]] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py new file mode 100644 index 000000000..d1056f6f6 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py @@ -0,0 +1,158 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class EnvVarSource(enum.Enum): + """ + Enumeration of environment sources + """ + + CONFIGMAP = 0 # config map + SECRET = 1 # secret + RESOURCE_FIELD = 2 # resource field + FIELD = 3 # field + + +class EnvVarFrom: + """ + EnvVarFrom is used to define an environment variable from one of the sources (EnvarSource). + It provides APIs to create, stringify, convert to dict and json. + + Methods: + - Create env variable from: gets the following parameters: + Source required - source of environment variable + name required name for config map or secret, container name for resource, path for field + key required Key for config map or secret, resource name for resource + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, source: EnvVarSource, name: str, key: str): + """ + Initialize + :param source - source + :param name source name + :param key source key + """ + self.source = source + self.name = name + self.key = key + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of environment from + """ + return f"source = {self.source.name}, name = {self.name}, key = {self.key}" + + def to_dict(self) -> dict[str, Any]: + """ + convert to dictionary + :return: dictionary representation of environment from + """ + return {"source": self.source.value, "name": self.name, "key": self.key} + + +class EnvironmentVariables: + """ + EnvironmentVariables is used to define environment variables. + It provides APIs to create, stringify, convert to dict and json. 
+ + Methods: + - Create env variable from: gets the following parameters: + key_value - optional, dictionary of key/value environment variables + from_ref - optional, dictionary of reference environment variables + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, key_value: dict[str, str] = None, from_ref: dict[str, EnvVarFrom] = None): + """ + Initialization + :param key_value: dictionary of key/value pairs for environment variables + :param from_ref: dictionary of key/value pairs for environment from variables + """ + self.key_val = key_value + self.from_ref = from_ref + + def to_string(self) -> str: + """ + convert to string + :return: string representation of environment variables + """ + val = "" + if self.key_val is not None: + val = f"values = {str(self.key_val)}" + if self.from_ref is not None: + if val != "": + val += " , " + val += "valuesFrom = {" + first = True + for k, v in self.from_ref.items(): + if not first: + val += ", " + else: + first = False + val += f"{k} = [{v.to_string()}]" + val += "}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of environment variables + """ + dst = {} + if self.key_val is not None: + dst["values"] = self.key_val + if self.from_ref is not None: + fr = {} + for k, v in self.from_ref.items(): + fr[k] = v.to_dict() + dst["valuesFrom"] = fr + return dst + + +""" + Creates new environment variable from from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: + """ + Create environment from from dictionary + :param dct: dictionary representations of environment from + :return: environment from + """ + return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) + + +def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: + """ + Create environment variables from from dictionary + :param dct: dictionary representations of environment variables + :return: environment variables + """ + keyvalues = None + fr = None + if "values" in dct: + keyvalues = dct.get("values") + if "valuesFrom" in dct: + from_ref = dct.get("valuesFrom") + fr = {} + for k, v in from_ref.items(): + fr[k] = env_var_from_decoder(v) + return EnvironmentVariables(key_value=keyvalues, from_ref=fr) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py new file mode 100644 index 000000000..7a9d4120f --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py @@ -0,0 +1,202 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
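# A small round-trip sketch of the environment-variable helpers defined above; the secret
# name and keys are placeholders.
from kfp_support.api_server_client.params import (
    EnvironmentVariables,
    EnvVarFrom,
    EnvVarSource,
    environment_variables_decoder,
)

envs = EnvironmentVariables(
    key_value={"RAY_LOG_LEVEL": "INFO"},
    from_ref={"AWS_SECRET_ACCESS_KEY": EnvVarFrom(source=EnvVarSource.SECRET, name="s3-secret", key="secret_key")},
)
# to_dict() emits {"values": ..., "valuesFrom": ...}, which the decoder turns back into an equal object
assert environment_variables_decoder(envs.to_dict()).to_dict() == envs.to_dict()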
+################################################################################ + +import enum +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + environment_variables_decoder, + volume_decoder, +) + + +DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} + + +class ServiceType(enum.Enum): + """ + Enumeration of head node service types + """ + + ClusterIP = "ClusterIP" # cluster IP + NodePort = "NodePort" # node port + LoadBalancer = "LoadBalancer" # load balancer + + +class HeadNodeSpec: + """ + HeadNodeSpec is used to define Ray cluster head node configuration. + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create head node specification: gets the following parameters: + compute_template - required, the computeTemplate of head node group + ray_start_params - required, Ray start parameters + image - optional, image used for head node + service_type - optional (ServiceType), service type foe headnode + enable_ingress - optional, allow to enable ingress for dashboard + volumes - optional, a list of volumes to attach to head node + service_account - optional, a service account (has to exist) to run head node + image_pull_secret - optional, secret to pull head node image from registry + environment - optional, environment variables for head pod + annotations - optional, annotations for head node + labels - optional, labels for head node + image_pull_policy - optional, head node pull image policy. Default IfNotPresent + """ + + def __init__( + self, + compute_template: str, + image: str, + ray_start_params: dict[str, str] = DEFAULT_HEAD_START_PARAMS, + service_type: ServiceType = ServiceType.ClusterIP, + enable_ingress: bool = False, + volumes: list[BaseVolume] = None, + service_account: str = None, + image_pull_secret: str = None, + environment: EnvironmentVariables = None, + annotations: dict[str, str] = None, + labels: dict[str, str] = None, + image_pull_policy: str = None, + ): + """ + Initialization + :param compute_template: compute template + :param ray_start_params: ray start parameters + :param image: node image + :param service_type: service type + :param enable_ingress: enable ingress flag + :param volumes: volumes for head node + :param service_account: service account + :param image_pull_secret: image pull secret + :param environment: head node environment + :param annotations: head node annotation + :param labels: labels + :param image_pull_policy: image pull policy + """ + + self.compute_template = compute_template + self.ray_start_params = ray_start_params + self.ray_start_params.update(DEFAULT_HEAD_START_PARAMS) + self.image = image + self.service_type = service_type + self.enable_ingress = enable_ingress + self.volumes = volumes + self.service_account = service_account + self.image_pull_secret = image_pull_secret + self.environment = environment + self.annotations = annotations + self.labels = labels + self.image_pull_policy = image_pull_policy + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of the head node + """ + val = f"compute template = {self.compute_template}, ray start params = {str(self.ray_start_params)}" + if self.image is not None: + val += f", image = {self.image}" + if self.service_type is not None: + val += f", service_type = {self.service_type.name}" + if self.enable_ingress: + val += ", enable_ingress = True" + if self.service_account is not None: + val += f", 
service_account = {self.service_account}" + if self.image_pull_secret is not None: + val += f", image_pull_secret = {self.image_pull_secret}" + if self.image_pull_policy is not None: + val += f", image_pull_policy = {self.image_pull_policy}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + if self.annotations is not None: + val = val + f",\n annotations = {str(self.annotations)}" + if self.labels is not None: + val = val + f",\n labels = {str(self.labels)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of the head node + """ + dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} + if self.image is not None: + dct["image"] = self.image + if self.service_type is not None: + dct["serviceType"] = self.service_type.value + if self.enable_ingress: + dct["enableIngress"] = True + if self.service_account is not None: + dct["service_account"] = self.service_account + if self.image_pull_secret is not None: + dct["image_pull_secret"] = self.image_pull_secret + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["environment"] = self.environment.to_dict() + if self.annotations is not None: + dct["annotations"] = self.annotations + if self.labels is not None: + dct["labels"] = self.labels + return dct + + +""" + Creates new head node from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def head_node_spec_decoder(dct: dict[str, Any]) -> HeadNodeSpec: + """ + Create head node spec from dictionary + :param dct: dictionary representation of head node spec + :return: Head node spec + """ + service_type = None + if "serviceType" in dct: + service_type = ServiceType(dct.get("serviceType", "ClusterIP")) + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("environment")) > 0: + environments = environment_variables_decoder(dct.get("environment")) + return HeadNodeSpec( + compute_template=dct.get("computeTemplate"), + ray_start_params=dct.get("rayStartParams"), + image=dct.get("image"), + service_type=service_type, + enable_ingress=dct.get("enableIngress", False), + volumes=volumes, + service_account=dct.get("service_account", None), + image_pull_secret=dct.get("imagePullSecret", None), + image_pull_policy=dct.get("imagePullPolicy", None), + environment=environments, + annotations=dct.get("annotations", None), + labels=dct.get("labels", None), + ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py new file mode 100644 index 000000000..a0b2bfcb0 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py @@ -0,0 +1,163 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
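# A sketch of the HeadNodeSpec defined above; the compute template and image are placeholders
# that would normally reference templates created through the API server.
from kfp_support.api_server_client.params import HeadNodeSpec, ServiceType

head = HeadNodeSpec(
    compute_template="default-template",
    image="rayproject/ray:2.9.3-py310",
    service_type=ServiceType.ClusterIP,
    image_pull_policy="IfNotPresent",
)
# DEFAULT_HEAD_START_PARAMS (dashboard-host, metrics-export-port, num-cpus) is merged into
# the start parameters by the constructor, so it always appears in the serialized spec
print(head.to_dict()["rayStartParams"])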
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +from typing import Any + + +class RayJobRequest: + """ + RayJobRequest used to define job to be submitted to a Ray cluster + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create RayJobRequest: gets the following parameters: + entrypoint - required, the command to start a job on the cluster + submission_id - optional, submission id for the job submission + runtime_env - optional, yaml string specifying job runtime environment + metadata - optional, dictionary of the submission metadata + num_cpus - optional, number of cpus for job execution + num_gpus - optional, number of gpus for job execution + resources - optional, dictionary of the resources for job execution + """ + + def __init__( + self, + entrypoint: str, + submission_id: str = None, + runtime_env: str = None, + metadata: dict[str, str] = None, + num_cpu: float = -1.0, + num_gpu: float = -1.0, + resources: dict[str, str] = None, + ): + """ + Initialization see https://docs.ray.io/en/latest/cluster/running-applications/job-submission/api.html + :param entrypoint: entrypoint + :param submission_id: submission id + :param runtime_env: runtime environment + :param metadata: submission metadata + :param num_cpu: job number cpus + :param num_gpu: job number gpus + :param resources: job custom resources + """ + self.entrypoint = entrypoint + self.submission_id = submission_id + self.runtime_env = runtime_env + self.metadata = metadata + self.num_cpu = num_cpu + self.num_gpu = num_gpu + self.resources = resources + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of job submission + """ + val = f"entrypoint = {self.entrypoint}" + if self.submission_id is not None: + val += f", submission_id = {self.submission_id}" + if self.num_cpu > 0: + val += f", num_cpu = {self.num_cpu}" + if self.num_gpu > 0: + val += f", num_gpu = {self.num_gpu}" + if self.runtime_env is not None: + val += f", runtime_env = {self.runtime_env}" + if self.metadata is not None: + val += f", metadata = {self.metadata}" + if self.resources is not None: + val += f", resources = {self.resources}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of job submission + """ + dct = {"entrypoint": self.entrypoint} + if self.submission_id is not None: + dct["submissionId"] = self.submission_id + if self.runtime_env is not None: + dct["runtimeEnv"] = self.runtime_env + if self.metadata is not None: + dct["metadata"] = self.metadata + if self.num_cpu > 0: + dct["numCpus"] = self.num_cpu + if self.num_gpu > 0: + dct["numGpus"] = self.num_gpu + if self.resources is not None: + dct["resources"] = self.resources + return dct + + +class RayJobInfo: + """ + RayJobInfo used to define information about the job in a Ray cluster + It provides APIs to create and stringify. 
Its output only data, so we do not need to implement to_dict + + Methods: + - Create RayJobRequest: gets the following parameters: + entrypoint - the command to start a job on the cluster + job_id - job execution id + submission_id - submission id for the job submission + runtime_env - job runtime environment + status - job execution status + message - status message + start_time - job start time + end-time - job ind time + error_type - type of error + metadata - optional, dictionary of the submission metadata + """ + + def __init__(self, dct: dict[str, Any]): + """ + Initialize from dictionary + :param dct: dictionary representation of Ray job info + """ + self.entrypoint = dct.get("entrypoint", "") + self.job_id = dct.get("jobId", "") + self.submission_id = dct.get("submissionId", "") + self.status = dct.get("status", "") + self.message = dct.get("message", None) + self.start_time = int(dct.get("startTime", "0")) + self.end_time = int(dct.get("endTime", "0")) + self.error_type = dct.get("ErrorType", None) + self.metadata = dct.get("Metadata", None) + self.runtime_env = dct.get("runtimeEnv", None) + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of Ray job info + """ + val = ( + f"entrypoint = {self.entrypoint}, job id {self.job_id}, submission id = {self.submission_id}," + f" status = {self.status}" + ) + if self.message is not None: + val += f" message = {self.message}" + if self.start_time > 0: + val += ( + f" start time = " + f"{datetime.datetime.fromtimestamp(self.start_time /1.e3).strftime('%Y-%m-%d %H:%M:%S')}" + ) + if self.end_time > 0: + val += ( + f" end time = " f"{datetime.datetime.fromtimestamp(self.end_time / 1e3).strftime('%Y-%m-%d %H:%M:%S')}" + ) + if self.error_type is not None: + val += f" error type = {self.error_type}" + if self.runtime_env is not None: + val += f" runtime env = {str(self.runtime_env)}" + if self.metadata is not None: + val += f" metadata = {str(self.metadata)}" + return val diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py new file mode 100644 index 000000000..0ef4c1583 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py @@ -0,0 +1,224 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class TolerationOperation(enum.Enum): + """ + Toleration operation types + """ + + Exists = "Exists" # exists + Equal = "Equal" # equal + + +class TolerationEffect(enum.Enum): + """ + Toleration effect + """ + + NoSchedule = "NoSchedule" # not schedule + PreferNoSchedule = "PreferNoSchedule" # prefer not schedule + NoExecute = "NoExecute" # not execute + + +class Toleration: + """ + Toleration is used by compute template to pick specific nodes for placing pods. 
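# A sketch of a fuller RayJobRequest than the earlier one: the runtime environment is passed
# as a YAML string, as described in the docstring above; all values are illustrative.
from kfp_support.api_server_client.params import RayJobRequest

runtime_env = """
pip:
  - requests==2.31.0
env_vars:
  counter_name: test_counter
"""
job = RayJobRequest(
    entrypoint="python ray_script.py",
    runtime_env=runtime_env,
    num_cpu=2.0,
    metadata={"owner": "data-prep-kit"},
)
# numCpus/numGpus are only serialized when positive; runtimeEnv stays a YAML string
print(job.to_dict())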
+ It provides APIs to create, stringify and convert to dict. + + Methods: + - Create toleration: gets the following parameters: + key - required, key created by the node's taint + operator - required, operator to apply, supported operators are "Exists" and "Equal" + effect - required, toleration effect supported effects are "NoSchedule", "PreferNoSchedule", "NoExecute" + value - optional, value + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None): + """ + Initialization + :param key: key + :param operator: operator + :param effect: effect + :param value: value + """ + self.key = key + self.operator = operator + self.value = value + self.effect = effect + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of toleration + """ + val = f"key = {self.key}, operator = {self.operator.name}, effect = {self.effect.name}" + if self.value is None: + return val + else: + return val + f", value = {self.value}" + + def to_dict(self) -> dict[str, Any]: + """ + Convert to string + :return: string representation of toleration + """ + dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value} + if self.value is not None: + dct["value"] = self.value + return dct + + +# Here the default gpu-accelerator is "nvidia.com/gpu", that is used for generating limits. +# If it is specified, it has to be in the format that is understood by kubernetes as a valid +# The following devices are currently supported by kubernetes: +# AMD - gpu accelerator amd.com/gpu +# Intel - gpu accelerator gpu.intel.com/i915 +# NVIDIA - gpu accelerator nvidia.com/gpu + + +class Template: + """ + Template is used to define specific nodes configuration. + It provides APIs to create, stringify and convert to dict. 
+ + Methods: + - Create templates: gets the following parameters: + name - required, template name + namespace - required, template namespace + cpus - required, template number of cpus + memory - required, template memory (GB) + gpus - optional, number of GPUs, default 0 + gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed + tolerations - optional, tolerations for pod placing, default none + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + - to_json() -> str convert to json string + """ + + def __init__( + self, + name: str, + namespace: str, + cpu: int, + memory: int, + gpu: int = 0, + gpu_accelerator: str = None, + tolerations: list[Toleration] = None, + ): + """ + Initialization + :param name: name + :param namespace: namespace + :param cpu: cpu + :param memory: memory + :param gpu: gpu + :param gpu_accelerator: accelerator type + :param tolerations: tolerations + """ + self.name = name + self.namespace = namespace + self.cpu = cpu + self.memory = memory + self.gpu = gpu + self.gpu_accelerator = gpu_accelerator + self.tolerations = tolerations + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of template + """ + val = f"name = {self.name}, namespace = {self.namespace}, cpu = {self.cpu}, memory = {self.memory}" + if self.gpu > 0: + val = val + f", gpu {self.gpu}" + if self.gpu_accelerator is not None: + val = val + f", gpu accelerator {self.gpu_accelerator}" + if self.tolerations is None: + return val + val = val + ", tolerations [" + first = True + for tol in self.tolerations: + if first: + first = False + val = val + "{" + tol.to_string() + "}" + else: + val = val + ", {" + tol.to_string() + "}" + return val + "]" + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of template + """ + dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} + if self.gpu > 0: + dct["gpu"] = self.gpu + if self.gpu_accelerator is not None: + dct["gpu accelerator"] = self.gpu_accelerator + if self.tolerations is not None: + dct["tolerations"] = [tl.to_dict() for tl in self.tolerations] + return dct + + +""" + Creates new toleration from dictionary, used for unmarshalling json. 
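# A sketch of a compute template with a toleration, using the classes defined above; the
# names and taint key are placeholders.
from kfp_support.api_server_client.params import (
    Template,
    Toleration,
    TolerationEffect,
    TolerationOperation,
)

template = Template(
    name="default-template",
    namespace="kuberay",
    cpu=4,
    memory=16,  # GB, per the docstring above
    tolerations=[
        Toleration(key="ray-node", operator=TolerationOperation.Exists, effect=TolerationEffect.NoSchedule)
    ],
)
payload = template.to_dict()  # {"name": ..., "namespace": ..., "cpu": 4, "memory": 16, "tolerations": [...]}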
Python does not + support multiple constructors, so do it this way +""" + + +def toleration_decoder(dct: dict[str, Any]) -> Toleration: + """ + Create toleration from dictionary + :param dct: dictionary representation of toleration + :return: toleration + """ + return Toleration( + key=dct.get("key"), + operator=TolerationOperation(dct.get("operator", "Exists")), + effect=TolerationEffect(dct.get("effect", "NoSchedule")), + value=dct.get("value"), + ) + + +def template_decoder(dct: dict[str, Any]) -> Template: + """ + Create template from dictionary + :param dct: dictionary representation of template + :return: template + """ + tolerations = None + if "tolerations" in dct: + tolerations = [toleration_decoder(d) for d in dct["tolerations"]] + return Template( + name=dct.get("name"), + namespace=dct.get("namespace"), + cpu=int(dct.get("cpu", "0")), + memory=int(dct.get("memory", "0")), + gpu=int(dct.get("gpu", "0")), + gpu_accelerator=dct.get("gpu_accelerator"), + tolerations=tolerations, + ) + + +def templates_decoder(dct: dict[str, Any]) -> list[Template]: + """ + Create list of template from dictionary + :param dct: dictionary representation of list of template + :return: list of template + """ + return [template_decoder(tmp) for tmp in dct["computeTemplates"]] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py new file mode 100644 index 000000000..fee0e1ea4 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py @@ -0,0 +1,449 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class HostPath(enum.Enum): + """ + Host path enumeration + """ + + DIRECTORY = 0 # directory + FILE = 1 # files + + +class MountPropagationMode(enum.Enum): + """ + Mount propagation enumeration + """ + + NONE = 0 # None + HOSTTOCONTAINER = 1 # host to container + BIDIRECTIONAL = 2 # bi directional + + +class AccessMode(enum.Enum): + """ + Access mode enumeration + """ + + RWO = 0 # read write once + ROX = 1 # read only many + RWX = 2 # read write many + + +class BaseVolume: + """ + KubeRay currently support several types of volumes, including hostPat, PVC, + ephemeral volumes, config maps, secrets and empty dir. All of them use slightly + different parameters. Base Volume is a base class for all different volume types. + """ + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of base volume + """ + raise Exception(f"Base volume cannot be used directly. Pls use one of the derived classes") + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of base volume + """ + raise Exception(f"Base volume cannot be used directly. 
Pls use one of the derived classes") + + +class HostPathVolume(BaseVolume): + """ + This class implements HostPath volume. In addition to name and mount path it requires host + path volume specific parameters: + source - data location on host + hostPathType - host path type: directory (0) or file (1) + mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) + + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + host_path_type: HostPath = None, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param host_path_type: host path type + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.host_path_type = host_path_type + self.volume_type = 1 + self.mount_propagation = mount_propagation + + def to_string(self) -> str: + """ + Convert to string + :return: HostPathVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = hostPath" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + if self.host_path_type is not None: + val += f", host path type = {self.host_path_type.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: HostPathVolume dictionary representation + """ + dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.mount_propagation is not None: + dst["mountPropagationMode"] = self.mount_propagation.value + if self.host_path_type is not None: + dst["hostPathType"] = self.host_path_type.value + return dst + + +class PVCVolume(BaseVolume): + """ + This class implements PVC volume. In addition to name and mount path it requires + PVC volume specific parameters: + source - PVC claim name + read_only - read only flag + mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + read_only: bool = False, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param read_only: read only + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.volume_type = 0 + self.mount_propagation = mount_propagation + self.readonly = read_only + + def to_string(self) -> str: + """ + Convert to string + :return: PVCVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = PVC" + if self.readonly: + val += ", read only = True" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: PVCVolume dictionary representation + """ + dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.readonly: + dst["readOnly"] = True + if self.mount_propagation is not None: + dst["mountPropagationMode"] = self.mount_propagation.value + return dst + + +class EphemeralVolume(BaseVolume): + """ + This class implements Ephemeral volume. 
In addition to name and mount path it requires + Ephemeral volume specific parameters: + storage - disk size (valid k8 value, for example 5Gi) + storageClass - storage class - optional, if not specified, use default + accessMode - access mode RWO - optional ReadWriteOnce (0), ReadOnlyMAny (1), ReadWriteMany (2) + mountPropagationMode - optional mount propagation: None (0), host to container (1) or bidirectional (2) + """ + + def __init__( + self, + name: str, + mount_path: str, + storage: str, + storage_class: str = None, + access_mode: AccessMode = None, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param storage: storage + :param storage_class: storage class + :param access_mode: access mode + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.storage = storage + self.volume_type = 2 + self.mount_propagation = mount_propagation + self.storage_class = storage_class + self.access_mode = access_mode + + def to_string(self) -> str: + """ + Convert to string + :return: EphemeralVolume string representation + """ + val = ( + f"name = {self.name}, mount_path = {self.mount_path}, storage = {self.storage} " f"volume type = ephemeral" + ) + if self.storage_class is not None: + val += f", storage class = {self.storage_class}" + if self.access_mode is not None: + val += f", access mode = {self.access_mode.name}" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: EphemeralVolume dictionary representation + """ + dct = { + "name": self.name, + "mountPath": self.mount_path, + "storage": self.storage, + "volumeType": self.volume_type, + } + if self.storage_class is not None: + dct["storageClassName"] = self.storage_class + if self.access_mode is not None: + dct["accessMode"] = self.access_mode.value + if self.mount_propagation is not None: + dct["mountPropagationMode"] = self.mount_propagation.value + return dct + + +class EmptyDirVolume(BaseVolume): + """ + This class implements EmptyDir volume. In addition to name and mount path it requires + Empty Dir specific parameters: + storage - optional max storage size (valid k8 value, for example 5Gi) + """ + + def __init__(self, name: str, mount_path: str, storage: str = None): + """ + Initialization + :param name: name + :param mount_path: mount_path + :param storage: storage + """ + self.name = name + self.mount_path = mount_path + self.storage = storage + self.volume_type = 5 + + def to_string(self) -> str: + """ + Convert to string + :return: EmptyDirVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, volume type = emptyDir" + if self.storage is not None: + val += f", storage = {self.storage}" + return val + + def to_dict(self) -> dict[str, Any]: + dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} + if self.storage is not None: + dct["storage"] = self.storage + return dct + + +class ConfigMapVolume(BaseVolume): + """ + This class implements ConfigMap volume. 
In addition to name and mount path it requires + configMap volume specific parameters: + source - required, config map name + items - optional, key/path items (optional) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + items: dict[str, str] = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param items: items + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.items = items + self.volume_type = 3 + + def to_string(self) -> str: + """ + Convert to string + :return: ConfigMapVolume string representation + """ + val = ( + f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = configmap" + ) + if self.items is not None: + val = val + f", items = {str(self.items)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: ConfigMapVolume dictionary representation + """ + dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.items is not None: + dct["items"] = self.items + return dct + + +class SecretVolume(BaseVolume): + """ + This class implements Secret volume. In addition to name and mount path it requires + Secret volume specific parameters: + source - required, secret name + items - optional, key/path items (optional) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + items: dict[str, str] = None, + ): + self.name = name + self.mount_path = mount_path + self.source = source + self.items = items + self.volume_type = 4 + + def to_string(self) -> str: + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = secret" + if self.items is not None: + val = val + f", items = {str(self.items)}" + return val + + def to_dict(self) -> dict[str, Any]: + dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.items is not None: + dct["items"] = self.items + return dct + + +""" + Creates new Volume from dictionary, used for unmarshalling json. 
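# A sketch of two of the volume flavours above; claim and config-map names are placeholders.
from kfp_support.api_server_client.params import ConfigMapVolume, PVCVolume

code_volume = PVCVolume(
    name="code",
    mount_path="/home/ray/code",
    source="code-pvc",  # PVC claim name
    read_only=True,
)
config_volume = ConfigMapVolume(
    name="ray-config",
    mount_path="/home/ray/config",
    source="ray-configmap",
    items={"settings.yaml": "settings.yaml"},
)
# volumeType is encoded numerically (0 = PVC, 3 = config map), which is what the
# volume_decoder defined just below dispatches on
volumes = [code_volume.to_dict(), config_volume.to_dict()]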
Python does not + support multiple constructors, so do it this way +""" + + +def volume_decoder(dst: dict[str, Any]) -> BaseVolume: + def _get_mount_propagation() -> MountPropagationMode: + if "mountPropagationMode" in dst: + return MountPropagationMode(int(dst.get("mountPropagationMode", "0"))) + return None + + def _get_host_path() -> HostPath: + if "hostPathType" in dst: + return HostPath(int(dst.get("hostPathType", "0"))) + return None + + def _get_access_mode() -> AccessMode: + if "accessMode" in dst: + return AccessMode(int(dst.get("accessMode", "0"))) + return None + + match dst["volumeType"]: + case 0: + # PVC + return PVCVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + read_only=dst.get("readOnly", False), + mount_propagation=_get_mount_propagation(), + ) + case 1: + # host path + return HostPathVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + host_path_type=_get_host_path(), + mount_propagation=_get_mount_propagation(), + ) + case 2: + # Ephemeral volume + return EphemeralVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + storage=dst.get("storage", ""), + storage_class=dst.get("storageClassName"), + access_mode=_get_access_mode(), + mount_propagation=_get_mount_propagation(), + ) + case 3: + # ConfigMap Volume + return ConfigMapVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + items=dst.get("items"), + ) + case 4: + # Secret Volume + return SecretVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + items=dst.get("items"), + ) + case 5: + # Empty dir volume + return EmptyDirVolume( + name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), storage=dst.get("storage") + ) + case _: + raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py new file mode 100644 index 000000000..ddcf193cc --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py @@ -0,0 +1,206 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + environment_variables_decoder, + volume_decoder, +) + + +DEFAULT_WORKER_START_PARAMS = {"node-ip-address": "$MY_POD_IP"} + + +class WorkerNodeSpec: + """ + WorkerNodeSpec is used to define Ray cluster worker node pool configuration. + It provides APIs to create, stringify and convert to dict. 
+ + Methods: + - Create worker node pool specification: gets the following parameters: + group_name - required, group name of the worker group + compute_template - required, the computeTemplate of worker node group + replicas - required, desired replicas of the worker group + min_replicas - required Min replicas of the worker group, can't be greater than max_replicas + max_replicas - required, max replicas of the worker group + ray_start_params - required, Ray start parameters + image - optional, image used for worker node + volumes - optional, a list of volumes to attach to worker node + service_account - optional, a service account (has to exist) to run worker node + image_pull_secret - optional, secret to pull worker node image from registry + environment - optional, environment variables for worker pod + annotations - optional, annotations for worker node + labels - optional, labels for worker node + image_pull_policy - optional, worker node pull image policy. Default IfNotPresent + """ + + def __init__( + self, + group_name: str, + compute_template: str, + image: str, + max_replicas: int, + replicas: int = 1, + min_replicas: int = 0, + ray_start_params: dict[str, str] = DEFAULT_WORKER_START_PARAMS, + volumes: list[BaseVolume] = None, + service_account: str = None, + image_pull_secret: str = None, + environment: EnvironmentVariables = None, + annotations: dict[str, str] = None, + labels: dict[str, str] = None, + image_pull_policy: str = None, + ): + """ + Initialization + :param group_name: name + :param compute_template: compute template + :param replicas: number of replicas + :param min_replicas: min number of replicas + :param max_replicas: max number of replicas + :param ray_start_params: ray start parameters + :param image: image name + :param volumes: volumes + :param service_account: service account + :param image_pull_secret: image pull secret + :param environment: environment + :param annotations: annotations + :param labels: labels + :param image_pull_policy: image pull policy + """ + # Validate replicas + if min_replicas > replicas: + raise RuntimeError(f"min_replicas {min_replicas} is can't be greater then replicas {replicas} ") + if replicas > max_replicas: + raise RuntimeError(f"replicas {replicas} is can't be greater then max_replicas {max_replicas} ") + + self.group_name = group_name + self.compute_template = compute_template + self.replicas = replicas + self.min_replicas = min_replicas + self.max_replicas = max_replicas + self.ray_start_params = ray_start_params + self.ray_start_params.update(DEFAULT_WORKER_START_PARAMS) + self.image = image + self.volumes = volumes + self.service_account = service_account + self.image_pull_secret = image_pull_secret + self.environment = environment + self.annotations = annotations + self.labels = labels + self.image_pull_policy = image_pull_policy + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of worker node spec + """ + val = ( + f"group_name = {self.group_name}, compute template = {self.compute_template}, " + f"replicas = {self.replicas}, min_replicas = {self.min_replicas}, " + f"max_replicas = {self.max_replicas}, ray start params = {str(self.ray_start_params)}" + ) + if self.image is not None: + val += f", image = {self.image}" + if self.service_account is not None: + val += f", service_account = {self.service_account}" + if self.image_pull_secret is not None: + val += f", image_pull_secret = {self.image_pull_secret}" + if self.image_pull_policy is not None: + val += f", 
image_pull_policy = {self.image_pull_policy}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + if self.annotations is not None: + val = val + f",\n annotations = {str(self.annotations)}" + if self.labels is not None: + val = val + f",\n labels = {str(self.labels)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of worker node spec + """ + dct = { + "groupName": self.group_name, + "computeTemplate": self.compute_template, + "replicas": self.replicas, + "minReplicas": self.min_replicas, + "maxReplicas": self.max_replicas, + "rayStartParams": self.ray_start_params, + } + if self.image is not None: + dct["image"] = self.image + if self.service_account is not None: + dct["service_account"] = self.service_account + if self.image_pull_secret is not None: + dct["imagePullSecret"] = self.image_pull_secret + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["environment"] = self.environment.to_dict() + if self.annotations is not None: + dct["annotations"] = self.annotations + if self.labels is not None: + dct["labels"] = self.labels + return dct + + +""" + Creates new worker node from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def worker_node_spec_decoder(dct: dict[str, Any]) -> WorkerNodeSpec: + """ + Create worker node spec from dictionary + :param dct: dictionary definition of worker node spec + :return: worker node spec + """ + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("environment")) > 0: + environments = environment_variables_decoder(dct.get("environment")) + return WorkerNodeSpec( + group_name=dct.get("groupName"), + compute_template=dct.get("computeTemplate"), + replicas=dct.get("replicas", 0), + min_replicas=dct.get("minReplicas", 0), + max_replicas=dct.get("maxReplicas", 0), + ray_start_params=dct.get("rayStartParams"), + image=dct.get("image"), + volumes=volumes, + service_account=dct.get("service_account", None), + image_pull_secret=dct.get("imagePullSecret", None), + image_pull_policy=dct.get("imagePullPolicy", None), + environment=environments, + annotations=dct.get("annotations", None), + labels=dct.get("labels", None), + ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md new file mode 100644 index 000000000..4943a0b06 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md @@ -0,0 +1,45 @@ +# Workflow Utils for KFP v1 + +This library provides 3 main classes: +* KFPUtils - helper utilities for KFP implementations +* PipelinesUtils - helper class for pipeline management based on KFP client +* RayRemoteJobs - class supporting Ray remote jobs + +## KFPUtils + +This class contains a collection of functions useful for KFP pipelines implementation, which include: +* credentials - get S3 credentials from the environment +* 
get_namespace - get the name of the kubernetes namespace we are running in +* runtime_name - generates unique runtime name +* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string +* load_from_json - convert json string to dictionary and exit with error if conversion fails + +## PipelinesUtils + +This class provides some higher level functionality based on the capabilities of the python KFP client, including" +* get_experiment_by_name obtains KFP experiment object based on its name +* get_pipeline_by_name obtains KFP pipeline object based on its name +* start_pipeline start a pipeline represented by pipeline object in experiment represented by experiment object and a +dictionary of parameters. It returns kfp run ID +* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID + +## RayRemoteJobs + +At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, +including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) +[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and +[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. +We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class +implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: +* create_ray_cluster - creates Ray cluster. +* delete_ray_cluster - deletes Ray cluster. +* submit_job - submits Ray job to the cluster +* follow_execution - watching job execution to completion, periodically printing out the job log +These basic methods can be used as a foundation of any KFP pipeline implementation + +## ComponentUtils + +This class provides some methods to simplify building pipelines: +* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy +* set_cos_env_vars_to_component - sets environment variables to support S3 +* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py new file mode 100644 index 000000000..bbe1476fb --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py @@ -0,0 +1,3 @@ +from kfp_support.workflow_support.compile_utils.component import ( + ComponentUtils +) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py new file mode 100644 index 000000000..1f66bf59f --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py @@ -0,0 +1,101 @@ +import kfp.dsl as dsl +from kfp import kubernetes +from typing import Dict + +RUN_NAME = "KFP_RUN_NAME" + +class ComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + @staticmethod + def add_settings_to_component( + task: dsl.PipelineTask, + timeout: int, + image_pull_policy: 
str = "IfNotPresent", + cache_strategy: bool = False, + ) -> None: + """ + Add settings to kfp task + :param task: kfp task + :param timeout: timeout to set to the component in seconds + :param image_pull_policy: pull policy to set to the component + :param cache_strategy: cache strategy + """ + + kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # Set cashing + task.set_caching_options(enable_caching=cache_strategy) + # image pull policy + kubernetes.set_image_pull_policy(task, image_pull_policy) + # Set the timeout for the task to one day (in seconds) + kubernetes.set_timeout(task, seconds=timeout) + + @staticmethod + def set_s3_env_vars_to_component( + task: dsl.PipelineTask, + secret: str = '', + env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, + prefix: str = None, + ) -> None: + """ + Set S3 env variables to KFP component + :param task: kfp task + :param secret: secret name with the S3 credentials + :param env2key: dict with mapping each env variable to a key in the secret + :param prefix: prefix to add to env name + """ + + if prefix is not None: + for env_name, _ in env2key.items(): + env2key[prefix + "_" + env_name] = env2key.pop(env_name) + kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import GB, get_logger + from kfp_support.workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py new file mode 100644 index 000000000..d2301bd0a --- /dev/null +++ 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py
@@ -0,0 +1,2 @@
+from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils
+from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py
new file mode 100644
index 000000000..ef00b0e92
--- /dev/null
+++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py
@@ -0,0 +1,113 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import json
+import os
+import re
+import sys
+from typing import Any
+
+from data_processing.utils import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class KFPUtils:
+    """
+    Helper utilities for KFP implementations
+    """
+
+    @staticmethod
+    def credentials(
+        access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT"
+    ) -> tuple[str, str, str]:
+        """
+        Get credentials from the environment
+        :param access_key: environment variable for access key
+        :param secret_key: environment variable for secret key
+        :param endpoint: environment variable for S3 endpoint
+        :return: tuple of S3 access key, secret key and endpoint
+        """
+        s3_key = os.getenv(access_key, None)
+        s3_secret = os.getenv(secret_key, None)
+        s3_endpoint = os.getenv(endpoint, None)
+        if s3_key is None or s3_secret is None or s3_endpoint is None:
+            logger.warning("Failed to load s3 credentials")
+        return s3_key, s3_secret, s3_endpoint
+
+    @staticmethod
+    def get_namespace() -> str:
+        """
+        Get the k8s namespace that we are running in
+        :return: namespace name, or an empty string if it cannot be determined
+        """
+        ns = ""
+        try:
+            file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r")
+        except Exception as e:
+            logger.warning(
+                f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}"
+            )
+        else:
+            with file:
+                ns = file.read()
+        return ns
+
+    @staticmethod
+    def runtime_name(ray_name: str = "", run_id: str = "") -> str:
+        """
+        Get unique runtime name
+        :param ray_name: base name to sanitize
+        :param run_id: run ID used to make the name unique
+        :return: runtime name
+        """
+        # K8s object names cannot contain special characters ('_' is replaced by '-') and have to be lower case.
+        if ray_name != "":
+            ray_name = ray_name.replace("_", "-").lower()
+            pattern = r"[^a-zA-Z0-9-]"  # the ray_name cannot contain upper case characters here, but leave them in the pattern just in case.
+            ray_name = re.sub(pattern, "", ray_name)
+        else:
+            ray_name = "a"
+        # The return value plus the namespace name will be the name of the Ray Route,
+        # whose length is restricted to 64 characters,
+        # therefore we restrict the returned name to 15 characters.
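+        # Illustrative example (hypothetical values, added as a descriptive comment):
+        # runtime_name(ray_name="My_Transform!", run_id="1234567890") sanitizes the name to
+        # "my-transform" and returns "my-transf-12345" - the first nine characters of the
+        # sanitized name, a dash and the first five characters of the run id.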
+ if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if str(value) != "": + if isinstance(value, str): + if '"' in value: + logger.warning(f"can't parse inputs with double quotation marks, please use single quotation marks instead") + res += f'--{key}="{value}" ' + elif isinstance(value, bool): + if value: + res += f"--{key} " + else: + res += f"--{key}={value} " + + logger.info(f"request to execute: {res}") + return res + + # Load a string that represents a json to python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py new file mode 100644 index 000000000..39d4d9e64 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py @@ -0,0 +1,527 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import re +import sys +import time +from typing import Any + +from data_processing.data_access import DataAccess, DataAccessFactory +from data_processing.utils import ParamsUtils, get_logger +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from kfp_support.workflow_support.runtime_utils import KFPUtils +from ray.job_submission import JobStatus + + +logger = get_logger(__name__) + + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. 
Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + image_pull_policy: image pull policy, default IfNotPresent + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be created) + environment - dictionary of node of this group environment + annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + image_pull_policy: image pull policy, default IfNotPresent + + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + 
namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = worker_node.get("service_account", None) + environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster 
namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Although the cluster is ready, the service web server might not be ready yet at this point. + # To ensure that it is ready, trying to get jobs info from the cluster. Even if it fails + # couple of times, its harmless + _, _, _ = self.api_server_client.list_job_info(ns=namespace, name=name) + time.sleep(5) + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta between current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access - data access class + :param job_ready_timeout: timeout to wait for fob to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, 
submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for job become ready, exiting") + sys.exit(1) + # While job is running print log + previous_log_len = 0 + # At this point job could succeeded, failed, stop or running. So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {job_status}") + if job_status != JobStatus.SUCCEEDED: + sys.exit(1) + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.get_output_folder() + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +def _execute_remote_job( + name: str, + ns: str, + script: str, + params: dict[str, Any], + data_access_params: dict[str, Any], + additional_params: dict[str, Any], + remote_jobs: RayRemoteJobs, +) -> None: + """ + Execute remote job on Ray cluster + :param name: cluster name + :param ns: execution/cluster namespace + :param additional_params: additional parameters for the job + :param data_access_params: data access parameters + :param params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param script: script to run (has to be present in the image) + :param remote_jobs: remote jobs execution support class + :return: + """ + + status, error, submission = remote_jobs.submit_job(name=name, namespace=ns, request=params, executor=script) + if status != 200: + logger.error(f"Failed to submit job - status: {status}, error: {error}") + exit(1) + + logger.info(f"submitted job successfully, submission id {submission}") + # create data access + data_factory = DataAccessFactory() + data_factory.apply_input_params(args=data_access_params) + data_access = data_factory.create_data_access() + # print execution log + remote_jobs.follow_execution( + name=name, + namespace=ns, + 
submission_id=submission, + data_access=data_access, + print_timeout=additional_params.get("wait_print_tmout", 120), + job_ready_timeout=additional_params.get("wait_job_ready_tmout", 600), + ) + + +def execute_ray_jobs( + name: str, # name of Ray cluster + additional_params: dict[str, Any], + e_params: dict[str, Any], + exec_script_name: str, + server_url: str, +) -> None: + """ + Execute Ray jobs on a cluster periodically printing execution log. Completes when all Ray job complete. + All of the jobs will be executed, although some of the jobs may fail. + :param name: cluster name + :param additional_params: additional parameters for the job + :param e_params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param exec_script_name: script to run (has to be present in the image) + :param server_url: API server url + :return: None + """ + # prepare for execution + ns = KFPUtils.get_namespace() + if ns == "": + logger.warning(f"Failed to get namespace") + sys.exit(1) + # create remote jobs class + remote_jobs = RayRemoteJobs( + server_url=server_url, + http_retries=additional_params.get("http_retries", 5), + wait_interval=additional_params.get("wait_interval", 2), + ) + # find config parameter + config = ParamsUtils.get_config_parameter(e_params) + if config is None: + exit(1) + # get config value + config_value = KFPUtils.load_from_json(e_params[config].replace("'", '"')) + s3_creds = KFPUtils.load_from_json(e_params["data_s3_cred"].replace("'", '"')) + if type(config_value) is not list: + # single request + return _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: config_value, "data_s3_cred": s3_creds}, + params=e_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + # remove config key from the dictionary + launch_params = dict(e_params) + del launch_params[config] + # Loop through all configuration + n_launches = 0 + for conf in config_value: + # populate individual config and launch + launch_params[config] = ParamsUtils.convert_to_ast(d=conf) + try: + _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: conf, "data_s3_cred": s3_creds}, + params=launch_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + n_launches += 1 + except SystemExit: + logger.warning(f"Failed to execute job for configuration {conf}") + continue + + if n_launches == 0: + logger.warning("All executions failed") + sys.exit(1) + else: + logger.info(f"{n_launches} ot of {len(config_value)} succeeded") diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md new file mode 100644 index 000000000..472c39136 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md @@ -0,0 +1,36 @@ +# Workflow Utils for KFPv2 + +This library provides 3 main classes: +* KFPUtils - helper utilities for KFP implementations +* PipelinesUtils - helper class for pipeline management based on KFP client +* RayRemoteJobs - class supporting Ray remote jobs + +## KFPUtils + +This class contains a collection of functions useful for KFP pipelines implementation, which include: +* credentials - get S3 credentials from the environment +* get_namespace - get the name of the kubernetes namespace we are running in +* runtime_name - generates unique runtime name +* dict_to_req - 
converts a dictionary of request parameters into a properly formatted command-line string
+* load_from_json - converts a JSON string to a dictionary and exits with an error if the conversion fails
+
+## RayRemoteJobs
+
+At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available,
+including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk),
+[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and
+[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few.
+We use the KubeRay API server APIs here, but in order to simplify a possible transition to other APIs, this class
+implements 4 high-level methods that hide the specifics of the particular API. These methods are:
+* create_ray_cluster - creates a Ray cluster
+* delete_ray_cluster - deletes a Ray cluster
+* submit_job - submits a Ray job to the cluster
+* follow_execution - follows job execution to completion, periodically printing out the job log
+These basic methods can be used as the foundation of any KFP pipeline implementation.
+
+## ComponentUtils
+
+This class provides some methods to simplify building pipelines:
+* add_settings_to_component - adds settings to a component, including timeout, image_pull_policy and cache strategy
+* set_cos_env_vars_to_component - sets environment variables to support S3
+* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements)
\ No newline at end of file
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py
new file mode 100644
index 000000000..9297ede66
--- /dev/null
+++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py
@@ -0,0 +1,3 @@
+from kfp_support.workflow_support.components_utils.component import (
+    CompileComponentUtils
+)
diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/component.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py
similarity index 100%
rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/component.py
rename to kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py
new file mode 100644
index 000000000..3a6ab1263
--- /dev/null
+++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py
@@ -0,0 +1,8 @@
+from kfp_support.workflow_support.runtime_utils.workflow_utils import (
+    KFPUtils,
+    RayRemoteJobs,
+    ComponentUtils,
+    ONE_HOUR_SEC,
+    ONE_DAY_SEC,
+    ONE_WEEK_SEC,
+)
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py new file mode 100644 index 000000000..7328c740d --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py @@ -0,0 +1,557 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +import json +import os +import re +import sys +import time +from typing import Any, Optional + +from data_processing.data_access import DataAccess +from data_processing.utils import get_logger +import kfp_server_api +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from ray.job_submission import JobStatus + +logger = get_logger(__name__) + +ONE_HOUR_SEC = 60 * 60 +ONE_DAY_SEC = ONE_HOUR_SEC * 24 +ONE_WEEK_SEC = ONE_DAY_SEC * 7 + +class KFPUtils: + """ + Helper utilities for KFP implementations + """ + + @staticmethod + def credentials( + access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + ) -> tuple[str, str, str]: + """ + Get credentials from the environment + :param access_key: environment variable for access key + :param secret_key: environment variable for secret key + :param endpoint: environment variable for S3 endpoint + :return: + """ + s3_key = os.getenv(access_key, None) + s3_secret = os.getenv(secret_key, None) + s3_endpoint = os.getenv(endpoint, None) + if s3_key is None or s3_secret is None or s3_endpoint is None: + logger.warning("Failed to load s3 credentials") + return s3_key, s3_secret, s3_endpoint + + @staticmethod + def get_namespace() -> str: + """ + Get k8 namespace that we are running it + :return: + """ + ns = "" + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + except Exception as e: + logger.warning( + f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" + ) + else: + with file: + ns = file.read() + return ns + + @staticmethod + def runtime_name(ray_name: str = "", run_id: str = "") -> str: + """ + Get unique runtime name + :param ray_name: + :param run_id: + :return: runtime name + """ + # K8s objects cannot contain special characters, except '_', All characters should be in lower case. + if ray_name != "": + ray_name = ray_name.replace("_", "-").lower() + pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. + ray_name = re.sub(pattern, "", ray_name) + else: + ray_name = "a" + # the return value plus namespace name will be the name of the Ray Route, + # which length is restricted to 64 characters, + # therefore we restrict the return name by 15 character. 
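+        # Illustrative example (hypothetical values, added as a descriptive comment):
+        # runtime_name(ray_name="noop_transform", run_id="2024051301") returns "noop-tran-20240" -
+        # nine characters of the sanitized name, a dash and the first five characters of the run id.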
+ if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if isinstance(value, str): + res += f'--{key}="{value}" ' + else: + res += f"--{key}={value} " + return res + + # Load a string that represents a json to python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be created) + environment - dictionary of node of this group environment + annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + 
_, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = worker_node.get("service_account", None) + environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + 
labels=labels, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta between current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access - 
data access class + :param job_ready_timeout: timeout to wait for fob to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for job become ready, exiting") + sys.exit(1) + # While job is running print log + previous_log_len = 0 + # At this point job could succeeded, failed, stop or running. So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {status}") + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.output_folder + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +class ComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + # @staticmethod + # def add_settings_to_component( + # task: dsl.PipelineTask, + # timeout: int, + # image_pull_policy: str = "IfNotPresent", + # cache_strategy: bool = False, + # ) -> None: + # """ + # Add settings to kfp task + # :param task: kfp task + # :param timeout: timeout to set to the component in seconds + # :param image_pull_policy: pull policy to set to the component + # :param cache_strategy: cache strategy + # """ + # + # kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # # Set cashing + # task.set_caching_options(enable_caching=cache_strategy) + # # image pull policy + # kubernetes.set_image_pull_policy(task, image_pull_policy) + # # Set the timeout for the task to one day (in seconds) + # kubernetes.set_timeout(task, seconds=timeout) + + 
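+    # Illustrative arithmetic for the method below (hypothetical inputs, not taken from the original patch):
+    # with worker_options = "{'replicas': 5, 'cpu': 16, 'memory': 64}" and
+    # actor_options = "{'num_cpus': 1, 'memory': 2}", the cluster totals are
+    # cluster_cpu = 80, cluster_mem = 320 and cluster_gpu = 0, so the method returns
+    # str(min(int(80 * 0.85 / 1), int(320 * 0.85 / 2))) == "68" actors.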
+    @staticmethod
+    def default_compute_execution_params(
+        worker_options: str,  # ray worker configuration
+        actor_options: str,  # cpus per actor
+    ) -> str:
+        """
+        This is the simplest transform execution parameters computation
+        :param worker_options: configuration of ray workers
+        :param actor_options: actor request requirements
+        :return: number of actors
+        """
+        import sys
+
+        from data_processing.utils import get_logger
+        from kfp_support.workflow_support.runtime_utils import KFPUtils
+
+        logger = get_logger(__name__)
+
+        # convert input
+        w_options = KFPUtils.load_from_json(worker_options.replace("'", '"'))
+        a_options = KFPUtils.load_from_json(actor_options.replace("'", '"'))
+        # Compute available cluster resources
+        cluster_cpu = w_options["replicas"] * w_options["cpu"]
+        cluster_mem = w_options["replicas"] * w_options["memory"]
+        cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0)
+        logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}")
+        # compute number of actors
+        n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5))
+        n_actors_memory = int(cluster_mem * 0.85 / a_options.get("memory", 1))
+        n_actors = min(n_actors_cpu, n_actors_memory)
+        # Check if we need gpu calculations as well
+        actor_gpu = a_options.get("num_gpus", 0)
+        if actor_gpu > 0:
+            n_actors_gpu = int(cluster_gpu / actor_gpu)
+            n_actors = min(n_actors, n_actors_gpu)
+        logger.info(f"Number of actors - {n_actors}")
+        if n_actors < 1:
+            logger.warning(
+                f"Not enough cpu/gpu/memory to run transform, "
+                f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, "
+                f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, "
+                f"required gpu {actor_gpu}, available {cluster_gpu}"
+            )
+            sys.exit(1)
+
+        return str(n_actors)
diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py
index 190acf80b..dec823e4b 100644
--- a/kfp/kfp_ray_components/src/create_ray_cluster.py
+++ b/kfp/kfp_ray_components/src/create_ray_cluster.py
@@ -12,7 +12,7 @@
 import sys
 
-from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs
+from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs
 
 
 def start_ray_cluster(
diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py
index fc5016b87..85fbf8dde 100644
--- a/kfp/kfp_ray_components/src/delete_ray_cluster.py
+++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py
@@ -12,7 +12,7 @@
 import sys
 
-from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs
+from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs
 
 
 # Cleans and shutdowns the Ray cluster
diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py
index 74d42df1a..8fe53667f 100644
--- a/kfp/kfp_ray_components/src/execute_ray_job.py
+++ b/kfp/kfp_ray_components/src/execute_ray_job.py
@@ -10,7 +10,7 @@
 # limitations under the License.
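As a rough illustration of the sizing logic in `default_compute_execution_params` above, the following standalone sketch repeats the same arithmetic with plain `json`; the worker and actor option values are invented examples, not library defaults.

```python
import json

# Hypothetical inputs, in the shape the function parses after quote replacement.
worker_options = json.loads('{"replicas": 5, "cpu": 16, "memory": 64}')
actor_options = json.loads('{"num_cpus": 0.8, "memory": 2}')

cluster_cpu = worker_options["replicas"] * worker_options["cpu"]     # 80
cluster_mem = worker_options["replicas"] * worker_options["memory"]  # 320

# Keep 15% headroom, exactly as in the code above.
n_actors_cpu = int(cluster_cpu * 0.85 / actor_options.get("num_cpus", 0.5))  # 85
n_actors_memory = int(cluster_mem * 0.85 / actor_options.get("memory", 1))   # 136
n_actors = min(n_actors_cpu, n_actors_memory)

print(n_actors)  # -> 85
```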
################################################################################ -from kfp_support.workflow_support.utils import KFPUtils, execute_ray_jobs +from kfp_support.workflow_support.runtime_utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 1e58a5e66..7cb3cacb8 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -10,7 +10,7 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.utils import KFPUtils, execute_ray_jobs +from kfp_support.workflow_support.runtime_utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index 52f8c0da4..78d703a26 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -1,7 +1,7 @@ import sys from data_processing.utils.params_utils import ParamsUtils -from kfp_support.workflow_support.utils import KFPUtils, PipelinesUtils +from kfp_support.workflow_support.runtime_utils import KFPUtils, PipelinesUtils def invoke_sub_workflow( diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py deleted file mode 100644 index 9bc541af8..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/comp_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.comp_utils.component import ( - CompileComponentUtils -) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py deleted file mode 100644 index 654608dc4..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.pipeline_utils.pipeline_utils import ( - PipelinesUtils, -) \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py deleted file mode 100644 index 47d886209..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipeline_utils.py +++ /dev/null @@ -1,121 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import datetime -import time - -from typing import Any, Optional -import kfp_server_api -from kfp import Client -from data_processing.utils import get_logger - -logger = get_logger(__name__) - -class PipelinesUtils: - """ - Helper class for pipeline management - """ - - def __init__(self, host: str = "http://localhost:8080"): - """ - Initialization - :param host: host to connect to - """ - self.kfp_client = Client(host=host) - - def start_pipeline( - self, - pipeline: kfp_server_api.V2beta1Pipeline, - experiment: kfp_server_api.V2beta1Experiment, - params: Optional[dict[str, Any]], - ) -> str: - """ - Start a specified pipeline. - :param pipeline: pipeline definition - :param experiment: experiment to use - :param params: pipeline parameters - :return: the id of the run object - """ - job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") - try: - run_id = self.kfp_client.run_pipeline( - experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params - ) - logger.info("Pipeline submitted") - return run_id.id - except Exception as e: - logger.warning(f"Exception starting pipeline {e}") - return None - - def get_experiment_by_name(self, name: str = "Default") -> kfp_server_api.V2beta1Experiment: - """ - Get experiment by name - :param name: name - :return: experiment - """ - try: - return self.kfp_client.get_experiment(experiment_name=name) - except Exception as e: - logger.warning(f"Exception getting experiment {e}") - return None - - def get_pipeline_by_name(self, name: str, np: int = 100) -> kfp_server_api.V2beta1Pipeline: - """ - Given pipeline name, return the pipeline - :param name: pipeline name - :param np: page size for pipeline query. For large clusters with many pipelines, you might need to - increase this number - :return: pipeline - """ - try: - # Get all pipelines - pipelines = self.kfp_client.list_pipelines(page_size=np).pipelines - required = list(filter(lambda p: name in p.name, pipelines)) - if len(required) != 1: - logger.warning(f"Failure to get pipeline. 
Number of pipelines with name {name} is {len(required)}") - return None - return required[0] - - except Exception as e: - logger.warning(f"Exception getting pipeline {e}") - return None - - def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 600) -> tuple[str, str]: - """ - Waits for a pipeline run to complete - :param run_id: run id - :param timeout: timeout (sec) (-1 wait forever) - :param wait: internal wait (sec) - :return: Completion status and an error message if such exists - """ - try: - if timeout > 0: - end = time.time() + timeout - else: - end = 2**63 - 1 - run_details = self.kfp_client.get_run(run_id=run_id) - status = run_details.run.status - while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: - time.sleep(wait) - if (end - time.time()) < 0: - return "failed", f"Execution is taking too long" - run_details = self.kfp_client.get_run(run_id=run_id) - status = run_details.run.status - logger.info(f"Got pipeline execution status {status}") - - if status.lower() in ["succeeded", "completed"]: - return status, "" - return status, run_details.run.error - - except Exception as e: - logger.warning(f"Failed waiting pipeline completion {e}") - return "failed", str(e) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py similarity index 78% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipelines_tests_utils.py rename to kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py index a30003018..1e7ff9cf7 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py @@ -1,23 +1,9 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import datetime - import os import sys from data_processing.utils import get_logger, str2bool -from kfp_support.workflow_support.pipeline_utils import PipelinesUtils +from . 
import PipelinesUtils logger = get_logger(__name__) diff --git a/kfp/kfp_support_lib/test/pipeline_utils_test.py b/kfp/kfp_support_lib/test/pipeline_utils_test.py index 2630552ee..f0bfd9189 100644 --- a/kfp/kfp_support_lib/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/test/pipeline_utils_test.py @@ -17,7 +17,7 @@ def test_pipelines(): """ Test pipelines utils """ - utils = PipelinesUtils(host="http://localhost:8080/kfp") + utils = PipelinesUtils(host="http://localhost:8080") # get pipeline by name pipeline = utils.get_pipeline_by_name("[Tutorial] Data passing in python components") assert pipeline is not None diff --git a/kfp/kfp_support_lib_v2/Makefile b/kfp/kfp_support_lib_v2/Makefile new file mode 100644 index 000000000..60fd51f15 --- /dev/null +++ b/kfp/kfp_support_lib_v2/Makefile @@ -0,0 +1,63 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../.. +include ${REPOROOT}/.make.versions +include ${REPOROOT}/kfp/requirements.env + +# Include the common rules. +# Use "make help" to see them. +include ../../.make.defaults + +# Command to run pytest +PYTHON_VERSION=$(shell $(PYTHON) --version) +VENV_ACTIVATE=venv/bin/activate + +DEPLOY_KUBEFLOW ?= 0 + +clean:: + @# Help: Clean up the distribution build and the venv + rm -r dist venv || true + rm -rf src/*egg-info || true + rm -rf *.back || true + + +.check-env:: .check_python_version + @echo "Checks passed" + +update-toml:: .check-env + @# Help: Copy the Makefile distribution version into the pyproject.toml + sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml + sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/kfp==[0-9].*/kfp==${KFP}",/' pyproject.toml + +build:: update-toml venv + @# Help: Build the distribution for publishing to a pypi + rm -r dist || true + rm -rf src/*egg-info || true + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m build + +publish:: .check-env +publish:: + @# Help: Publish the wheel to testpypi + if [ -d "dist"]; then rm -r dist; fi + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m twine check dist/* + ${PYTHON} -m twine upload --verbose --non-interactive dist/* + +venv:: pyproject.toml .check-env + @# Help: Create the virtual environment using pyproject.toml + rm -rf venv + $(PYTHON) -m venv venv + . ${VENV_ACTIVATE}; \ + pip install -e .; \ + pip install ray==${RAY} \ + pip install pytest pytest-cov + +test:: venv + @# Help: Use the already-built virtual environment to run pytest on the test directory. + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; +ifeq ($(DEPLOY_KUBEFLOW),1) + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; +endif diff --git a/kfp/kfp_support_lib_v2/README.md b/kfp/kfp_support_lib_v2/README.md new file mode 100644 index 000000000..86f3f4360 --- /dev/null +++ b/kfp/kfp_support_lib_v2/README.md @@ -0,0 +1,68 @@ +# KFP support library + +This provides support for implementing KFP pipelines automating transform's execution. +It comprises 2 main modules + +* [api server client](src/kfp_support/api_server_client/README.md) +* [workflow support](src/kfp_support/workflow_support/README.md) + +## Development + +### Requirements +1. python 3.10 or later +2. git command line tools +3. [pre-commit](https://pre-commit.com/) +4. 
twine (pip install twine) + * but on Mac you may have to include a dir in your PATH, such as `export PATH=$PATH:/Library/Frameworks/Python.framework/Versions/3.10/bin` + +### Git +Simple clone the repo and set up the pre-commit hooks. +```shell +git clone git@github.com:IBM/data-prep-kit.git +cd kfp/kfp_support_lib +pre-commit install +``` +If you don't have pre-commit, you can install from [here](https://pre-commit.com/) + +## Library Artifact Build and Publish + +The process of creating a release for `fm_data_processing_kfp` package involves the following steps: + +cd to the package directory. + +update the version in [requirements.env](../requirements.env) file. + +run `make build` and `make publish`. + +## Testing + +To run the package tests perform the following: + +To begin with, establish a Kind cluster and deploy all required components by executing the makfefile command in the main directory of this repository. As an alternative, you can manually execute the instructions provided in the [README.md](../../kind/README.md) file. + +```bash +make setup +``` + +The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. + +```bash +make build +``` + +lastly, execute the tests: + +```bash +make test +``` + +### Cleanup + +It is advisable to execute the following command prior to running `make test` once more. This will ensure that any +previous test runs resources are removed before starting new tests. + +```bash +kubectl delete workflows -n kubeflow --all +``` + + diff --git a/kfp/kfp_support_lib_v2/doc/kfp_support_library.md b/kfp/kfp_support_lib_v2/doc/kfp_support_library.md new file mode 100644 index 000000000..60494b9f9 --- /dev/null +++ b/kfp/kfp_support_lib_v2/doc/kfp_support_library.md @@ -0,0 +1,10 @@ +# KFP Support Library + +This library is aimed to simplify transform pipelines implementations and consists of 3 main parts: + +* [API Server Client](../src/kfp_support/api_server_client/README.md) +* [workflow support](../src/kfp_support/workflow_support/README.md) +* workflow support_v2 + +See also how this library is used for [kfp components](../../kfp_ray_components/README.md) implementation +and implementation of the actual [workflow](../../doc/simple_transform_pipeline.md) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/pyproject.toml b/kfp/kfp_support_lib_v2/pyproject.toml new file mode 100644 index 000000000..f995d60d7 --- /dev/null +++ b/kfp/kfp_support_lib_v2/pyproject.toml @@ -0,0 +1,47 @@ +[project] +name = "data_prep_toolkit_kfp_v2" +version = "0.1.1" +requires-python = ">=3.10" +description = "Data Preparation Kit Library. 
KFP v2 support" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Alexey Roytman", email = "roytman@il.ibm.com" }, + { name = "Mohammad Nassar", email = "Mohammad.Nassar@ibm.com" }, + { name = "Revital Eres", email = "eres@il.ibm.com" }, +] +dependencies = [ + "kfp==2.7.0", + "kfp-kubernetes==1.2.0", + "requests", + "data-prep-toolkit==0.1.1", +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", +] + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/kfp_support"] + +[tool.pytest.ini_options] +addopts = "--cov --cov-report term-missing --cov-fail-under 10" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md new file mode 100644 index 000000000..423f743a1 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md @@ -0,0 +1,4 @@ +# KubeRay API server APIs + +This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) +Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py new file mode 100644 index 000000000..60cbbc2f2 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py @@ -0,0 +1 @@ +from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py new file mode 100644 index 000000000..270815e77 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py @@ -0,0 +1,636 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
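For context, a short sketch of how the API server client packaged here might be used; `KubeRayAPIs` and its return conventions are defined in the module below, while the server URL, namespace and cluster name are hypothetical.

```python
# Illustrative sketch: server URL, namespace and cluster name are made up;
# KubeRayAPIs is exported by kfp_support.api_server_client (see __init__.py above).
from kfp_support.api_server_client import KubeRayAPIs

apis = KubeRayAPIs(server_url="http://localhost:8888", wait_interval=2)

# Every call returns (http status, error message, payload)
status, error, templates = apis.list_compute_templates()
if status // 100 != 2:
    raise RuntimeError(f"listing compute templates failed: {error}")
print(f"found {len(templates)} compute templates")

# Wait for an existing cluster to become ready and fetch its dashboard endpoint
status, error, endpoint = apis.get_cluster_endpoints(ns="kuberay", name="test-cluster", wait=300)
if status // 100 == 2:
    print(f"dashboard endpoint: {endpoint}")
```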
+################################################################################ + +import time + +import requests +from data_processing.utils import get_logger +from kfp_support.api_server_client.params import ( + Cluster, + RayJobInfo, + RayJobRequest, + Template, + cluster_decoder, + clusters_decoder, + template_decoder, + templates_decoder, +) + + +logger = get_logger(__name__) + + +_headers = {"Content-Type": "application/json", "accept": "application/json"} + +CONNECT_TIMEOUT = 50 +READ_TIMEOUT = 50 +TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) + + +class KubeRayAPIs: + """ + This class implements KubeRay APIs based on the API server. + To create a class, the following parameters are required: + base - the URL of the API server (default is set to the standalone API server) + wait interval - the amount of sec to wait between checking for cluster ready + """ + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + token: str = None, + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initializer + :param server_url: API server url - default assuming running it inside the cluster + :param token: token, only used for API server with security enabled + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.server_url = server_url + if token is not None: + _headers["Authorization"] = token + self.wait_interval = wait_interval + self.api_base = "/apis/v1/" + self.http_retries = http_retries + + def list_compute_templates(self) -> tuple[int, str, list[Template]]: + """ + List compute templates across all namespaces of the k8 cluster + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + "compute_templates" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, templates_decoder(response.json()) + else: + logger.warning(f"Failed to list compute templates, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list compute templates, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Template]]: + """ + List compute templates across for a given namespaces of the k8 cluster + :param ns: namespace to query + :return: return tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, templates_decoder(response.json()) + else: + logger.warning( + f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list compute templates for namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) 
+ return status, message, None + + def get_compute_template(self, ns: str, name: str) -> tuple[int, str, Template]: + """ + get a compute template + :param ns: namespace + :param name: template name + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, template_decoder(response.json()) + else: + logger.warning( + f"Failed to get compute template {name} for namespace {ns}, status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get compute template {name} for namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def create_compute_template(self, template: Template) -> tuple[int, str]: + """ + Create a compute template + :param template - definition of a template + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{template.namespace}/compute_templates" + for i in range(self.http_retries): + try: + response = requests.post(url, json=template.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning(f"Failed to create compute template, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to create compute template, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def delete_compute_template(self, ns: str, name: str) -> tuple[int, str]: + """ + delete a compute template + :param ns: namespace + :param name: template name + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + elif response.status_code == 404: + # not found - no need to retry + return response.status_code, response.json()["message"] + else: + logger.warning(f"Failed to delete compute template, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to delete compute template, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def list_clusters(self) -> tuple[int, str, list[Cluster]]: + """ + List clusters across all namespaces of the k8 cluster + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of clusters + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + "clusters" + for i in 
range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, clusters_decoder(response.json()) + else: + logger.warning(f"Failed to list cluster, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list cluster, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_clusters_namespace(self, ns: str) -> tuple[int, str, list[Cluster]]: + """ + List clusters across for a given namespaces of the k8 cluster + :param ns: namespace to query + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of clusters + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, clusters_decoder(response.json()) + else: + logger.warning(f"Failed to list clusters in namespace {ns}, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list clusters in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def get_cluster(self, ns: str, name: str) -> tuple[int, str, Cluster]: + """ + get cluster + :param ns: namespace + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + clusters definition + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, cluster_decoder(response.json()) + else: + logger.warning(f"Failed to get cluster {name} in namespace {ns}, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def create_cluster(self, cluster: Cluster) -> tuple[int, str]: + """ + create cluster + :param cluster: cluster definition + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{cluster.namespace}/clusters" + for i in range(self.http_retries): + try: + response = requests.post(url, json=cluster.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning(f"Failed to create cluster , status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to create cluster , exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def get_cluster_status(self, ns: str, name: 
str) -> tuple[int, str, str]: + """ + get cluster status + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + cluster status + """ + # Execute HTTP request + status, error, cluster = self.get_cluster(ns=ns, name=name) + # Check execution status + if status // 100 != 2: + return status, error, None + cluster_status = "creating" + if cluster.cluster_status is not None: + cluster_status = cluster.cluster_status + return status, None, cluster_status + + def wait_cluster_ready(self, ns: str, name: str, wait: int = -1) -> tuple[int, str]: + """ + wait for cluster to be ready + :param ns: namespace of the cluster + :param name: name of the cluster + :param wait: wait time (-1 waits forever) + :returns: A tuple containing + http return code + message - only returned if http return code is not equal to 200 + cluster status + """ + current_wait = 0 + while True: + status, error, c_status = self.get_cluster_status(ns=ns, name=name) + # Check execution status + if status // 100 != 2: + return status, error + if c_status == "ready": + return status, None + if current_wait > wait > 0: + return 408, f"Timed out waiting for cluster ready in {current_wait} sec" + time.sleep(self.wait_interval) + current_wait += self.wait_interval + + def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int, str, str]: + """ + get cluster endpoint + :param ns: namespace of the cluster + :param name: name of the cluster + :param wait: wait time (-1 waits forever) for cluster to be ready + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + endpoint (service for dashboard endpoint) + """ + # Ensure that the cluster is ready + status, error = self.wait_cluster_ready(ns=ns, name=name, wait=wait) + if status // 100 != 2: + return status, error, None + # Get cluster + status, error, cluster = self.get_cluster(ns=ns, name=name) + if status // 100 != 2: + return status, error, None + return status, None, f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" + + def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: + """ + delete cluster + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers) + if response.status_code // 100 == 2: + return response.status_code, None + elif response.status_code == 404: + # not found - no need to retry + return response.status_code, response.json()["message"] + else: + logger.warning(f"Failed to delete cluster , status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to delete cluster , exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def submit_job(self, ns: str, name: str, job_request: RayJobRequest) -> tuple[int, str, str]: + """ + submit Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param job_request: job submission + :return: a tuple containing + http return code + message - only returned if 
http return code is not equal to 200 + submission id + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" + for i in range(self.http_retries): + try: + response = requests.post(url, json=job_request.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, response.json()["submissionId"] + else: + logger.warning( + f"Failed to submit job to the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to submit job to the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(5) + return status, message, None + + def get_job_info(self, ns: str, name: str, sid: str) -> tuple[int, str, RayJobInfo]: + """ + get Ray job details + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + RayJobInfo object + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, RayJobInfo(response.json()) + else: + logger.warning( + f"Failed to get job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]]: + """ + list Ray job details + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of RayJobInfo object + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + job_info_array = response.json().get("submissions", None) + if job_info_array is None: + return response.status_code, None, [] + else: + return response.status_code, None, [RayJobInfo(i) for i in job_info_array] + else: + logger.warning( + f"Failed to list jobs from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list jobs from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(5) + return status, message, [] + + def get_job_log(self, ns: str, name: str, sid: str) -> tuple[int, str, str]: + """ + get Ray job log + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code 
is not equal to 200 + log + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/log/{sid}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, response.json().get("log", "") + else: + logger.warning( + f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning( + f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, exception : {e}" + ) + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def stop_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: + """ + stop Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.post(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning( + f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def delete_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: + """ + delete Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning( + f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py new file mode 100644 index 000000000..e5a7d70fa --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py @@ -0,0 +1,53 @@ +from kfp_support.api_server_client.params.templates import ( + TolerationOperation, + TolerationEffect, + Toleration, + Template, + toleration_decoder, + template_decoder, + templates_decoder, +) +from 
kfp_support.api_server_client.params.volumes import ( + HostPath, + MountPropagationMode, + AccessMode, + BaseVolume, + HostPathVolume, + PVCVolume, + EphemeralVolume, + EmptyDirVolume, + ConfigMapVolume, + SecretVolume, + volume_decoder, +) +from kfp_support.api_server_client.params.environmentvariables import ( + EnvVarSource, + EnvVarFrom, + EnvironmentVariables, + env_var_from_decoder, + environment_variables_decoder, +) +from kfp_support.api_server_client.params.headnode import ( + ServiceType, + HeadNodeSpec, + DEFAULT_HEAD_START_PARAMS, + head_node_spec_decoder, +) +from kfp_support.api_server_client.params.workernode import ( + WorkerNodeSpec, + DEFAULT_WORKER_START_PARAMS, + worker_node_spec_decoder, +) +from kfp_support.api_server_client.params.cluster import ( + Environment, + AutoscalerOptions, + ClusterSpec, + ClusterEvent, + Cluster, + UpscalingMode, + autoscaling_decoder, + cluster_spec_decoder, + cluster_decoder, + clusters_decoder, +) +from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py new file mode 100644 index 000000000..922a14bef --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py @@ -0,0 +1,475 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + HeadNodeSpec, + WorkerNodeSpec, + environment_variables_decoder, + head_node_spec_decoder, + volume_decoder, + worker_node_spec_decoder, +) + + +class Environment(enum.Enum): + """ + Environment definitions + """ + + DEV = 0 # development + TESTING = 1 # testing + STAGING = 2 # staging + PRODUCTION = 3 # production + + +class UpscalingMode(enum.Enum): + """ + Enumeration of autoscaling mode + """ + + Conservative = ( + "Conservative" # Rate-limited; the number of pending worker pods is at most the size of the Ray cluster + ) + Default = "Default" # no rate limitations + Aggressive = "Aggressive" # same as default + + +class AutoscalerOptions: + """ + AutoscalerOptions is used to define Ray cluster autoscaling. + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create autoscaling options specification: gets the following parameters: + idle_timeout - optional, number of seconds to wait before scaling down a worker pod which is not using Ray + resources. Default 60sec (one minute). 
+ upscaling_mode - required autoscaler upscaling mode + image - optional, allows to override the autoscaler's container image + image_pull_policy - optional, allows to override the autoscaler's container image pull policy + cpus - optional, CPUs requirements for autoscaler - default "500m" + memory - optional, memory requirements for autoscaler - default "512Mi" + environment - optional, environment variables for autoscaler container + volumes - optional, a list of volumes to attach to autoscaler container. + This is needed for enabling TLS for the autoscaler container. + """ + + def __init__( + self, + upscaling_mode: UpscalingMode = UpscalingMode.Default, + idle_tmout: int = None, + image: str = None, + image_pull_policy: str = None, + cpus: str = None, + memory: str = None, + environment: EnvironmentVariables = None, + volumes: list[BaseVolume] = None, + ): + """ + Initialization + :param upscaling_mode: upscale mode + :param idle_tmout: idle timeout + :param image: image + :param image_pull_policy: image pull policy + :param cpus: cpu requirement for autoscaling + :param memory: memory requirement for autoscaling + :param environment: autoscaler environment + :param volumes: volumes for autoscaler + """ + self.upscaling_mode = upscaling_mode + self.idle_tmout = idle_tmout + self.image = image + self.image_pull_policy = image_pull_policy + self.cpus = cpus + self.memory = memory + self.environment = environment + self.volumes = volumes + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of the head node + """ + val = f"upscaling_mode = {self.upscaling_mode}" + if self.idle_tmout is not None: + val += f", idle_timeout = {self.idle_tmout}" + if self.image is not None: + val += f", image = {self.image}" + if self.image_pull_policy is not None: + val += f", image_pull_policy = {self.image_pull_policy}" + if self.cpus is not None: + val += f", cpus = {self.cpus}" + if self.memory is not None: + val += f", memory = {self.memory}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of the head node + """ + dct = {"upscalingMode": self.upscaling_mode.value} + if self.idle_tmout is not None: + dct["idleTimeoutSeconds"] = self.idle_tmout + if self.image is not None: + dct["image"] = self.image + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.cpus is not None: + dct["cpu"] = self.cpus + if self.memory is not None: + dct["memory"] = self.memory + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["envs"] = self.environment.to_dict() + return dct + + +class ClusterSpec: + """ + ClusterSpec is used to define Ray cluster. + It provides APIs to create, stringify, convert to dict and json. 
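A small usage sketch for the `AutoscalerOptions` class defined above; the resource and timeout values are arbitrary examples.

```python
# Arbitrary example values; UpscalingMode and AutoscalerOptions are defined in this module.
from kfp_support.api_server_client.params import AutoscalerOptions, UpscalingMode

options = AutoscalerOptions(
    upscaling_mode=UpscalingMode.Default,
    idle_tmout=120,   # scale idle workers down after two minutes
    cpus="500m",
    memory="512Mi",
)
print(options.to_string())
print(options.to_dict())
# {'upscalingMode': 'Default', 'idleTimeoutSeconds': 120, 'cpu': '500m', 'memory': '512Mi'}
```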
+ + Methods: + - Create cluster spec from: gets the following parameters: + head_group_spec - required, specification of the head node + worker_group_spec - optional, list of worker group specs + autoscaler_options - optional, autoscaling options + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__( + self, + head_node: HeadNodeSpec, + worker_groups: list[WorkerNodeSpec] = None, + autoscaling_options: AutoscalerOptions = None, + ): + """ + Initialization + :param head_node - head node definition + :param worker_groups - worker group definition + :param autoscaling_options - autoscaler options + """ + self.head_node = head_node + self.worker_groups = worker_groups + self.autoscaling_options = autoscaling_options + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of cluster spec + """ + val = f"head_group_spec: {self.head_node.to_string()}" + if self.worker_groups is not None: + val += "\nworker groups: " + for w in self.worker_groups: + val += f"\nworker_group_spec = {w.to_string()}]" + if self.autoscaling_options is not None: + val += f"\nautoscaling options = {self.autoscaling_options.to_string()}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: Dictionary representation of cluster spec + """ + dst = {"headGroupSpec": self.head_node.to_dict()} + if self.worker_groups is not None: + dst["workerGroupSpec"] = [w.to_dict() for w in self.worker_groups] + if self.autoscaling_options is not None: + dst["enableInTreeAutoscaling"] = True + dst["autoscalerOptions"] = self.autoscaling_options.to_dict() + return dst + + +class ClusterEvent: + """ + Cluster event is used to define events emitted during cluster creation. + It provides APIs to create and stringify. Its output only data, so we do not need to implement to_dict + + Methods: + - Create event: gets the dictionary with the following parameters: + id - unique Event Id + name - human readable event name + created_at - event creation time + first_timestamp - first time the event occur + last_timestamp - last time the event occur + reason - reason for the transition into the object's current status + message - human-readable description of the status of this operation + type - type of this event (Normal, Warning), new types could be added in the future + count - number of times this event has occurred + """ + + def __init__(self, dst: dict[str, Any]): + """ + Initialization from dictionary + :param dst: dictionary representation of cluster event + """ + self.id = dst.get("id", "") + self.name = dst.get("name", "") + self.created_at = dst.get("created_at", "") + self.first_timestamp = dst.get("first_timestamp", "") + self.last_timestamp = dst.get("last_timestamp", "") + self.reason = dst.get("reason", "") + self.message = dst.get("message", "") + self.type = dst.get("type", "") + self.count = dst.get("count", "0") + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of cluster event + """ + return ( + f"id = {self.id}, name = {self.name}, created_at = {self.created_at}, " + f"first_timestamp = {self.first_timestamp}, last_timestamp = {self.last_timestamp}," + f"reason = {self.reason}, message = {self.message}, type = {self.type}, count = {self.count}" + ) + + +class Cluster: + """ + Cluster is used to define Ray cluster. + It provides APIs to create, stringify, convert to dict and json. 
+ + Methods: + - Create env variable from: gets the following parameters: + name - required, unique (per namespace) cluster name + namespace - required, cluster's namespace (should exist) + user - required, user who owns the cluster + version - required, Ray cluster version - typically Ray version + deployment_environment - optional (see Environment) + cluster_spec - required, ray cluster configuration + annotations - optional, annotations, for example, "kubernetes.io/ingress.class" to define Ingress class + cluster_environment - optional, cluster environment variables + created_at - output, cluster creation ts + deleted_at - output, cluster deletion ts + cluster_status - output, cluster status + events - output, cluster events + service_endpoint - output, cluster service endpoints + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__( + self, + name: str, + namespace: str, + user: str, + version: str, + cluster_spec: ClusterSpec, + deployment_environment: Environment = None, + annotations: dict[str, str] = None, + cluster_environment: EnvironmentVariables = None, + created_at: str = None, + deleted_at: str = None, + cluster_status: str = None, + events: list[ClusterEvent] = None, + service_endpoint: dict[str, str] = None, + ): + """ + Initialization + :param name: cluster name + :param namespace: cluster namespace + :param user: user name + :param version: version + :param cluster_spec: cluster spec + :param deployment_environment: cluster deployment environment + :param annotations: cluster annotations + :param cluster_environment: cluster environment + :param created_at: created at + :param deleted_at: deleted at + :param cluster_status: status + :param events: cluster events + :param service_endpoint: service endpoint + """ + self.name = name + self.namespace = namespace + self.user = user + self.version = version + self.cluster_spec = cluster_spec + self.environment = deployment_environment + self.annotations = annotations + self.envs = cluster_environment + self.created_at = created_at + self.deleted_at = deleted_at + self.cluster_status = cluster_status + self.events = events + self.service_endpoint = service_endpoint + + def to_string(self) -> str: + """ + convert to string representation + :return: string representation of cluster + """ + val = ( + f"name: {self.name}, namespace = {self.namespace}, user = {self.user}, version = {self.version} " + f"cluster_spec = {self.cluster_spec.to_string()}" + ) + if self.environment is not None: + val += f"deployment environment = {self.environment.name}" + if self.annotations is not None: + val += f" ,annotations = {str(self.annotations)}" + if self.envs is not None: + val = val + f",cluster environment = {self.envs.to_string()}" + val += "\ncluster output\n" + if self.created_at is not None: + val += f" ,created_at = {self.created_at}" + if self.deleted_at is not None: + val += f" ,deleted_at = {self.deleted_at}" + if self.cluster_status is not None: + val += f" ,cluster status = {self.cluster_status}" + if self.events is not None: + val = val + ",\n cluster events = [" + first = True + for e in self.events: + if first: + first = False + else: + val += ", " + val = val + "{" + e.to_string() + "}" + val = val + "]" + if self.service_endpoint is not None: + val += f" ,service endpoints = {str(self.service_endpoint)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + convert to dictionary + :return: dictionary representation of cluster + """ + # only 
convert input variables + dst = { + "name": self.name, + "namespace": self.namespace, + "user": self.user, + "version": self.version, + "clusterSpec": self.cluster_spec.to_dict(), + } + if self.environment is not None: + dst["environment"] = self.environment.value + if self.annotations is not None: + dst["annotations"] = self.annotations + if self.envs is not None: + dst["envs"] = self.envs.to_dict() + return dst + + +""" + Creates new cluster from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def autoscaling_decoder(dct: dict[str, Any]) -> AutoscalerOptions: + """ + Create autoscaling options from its dictionary representation + :param dct: dictionary representation of cluster spec + :return: autoscaling options + """ + upscaling_mode = UpscalingMode.Default + if "upscalingMode" in dct: + upscaling_mode = UpscalingMode(dct.get("upscalingMode")) + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("envs")) > 0: + environments = environment_variables_decoder(dct.get("envs")) + return AutoscalerOptions( + upscaling_mode=upscaling_mode, + idle_tmout=dct.get("idleTimeoutSeconds", None), + image=dct.get("image", None), + image_pull_policy=dct.get("imagePullPolicy", None), + cpus=dct.get("cpu", None), + memory=dct.get("memory", None), + environment=environments, + volumes=volumes, + ) + + +def cluster_spec_decoder(dct: dict[str, Any]) -> ClusterSpec: + """ + Create cluster spec from its dictionary representation + :param dct: dictionary representation of cluster spec + :return: cluster spec + """ + workers = None + autoscaling_options = None + if "workerGroupSpec" in dct: + workers = [worker_node_spec_decoder(w) for w in dct["workerGroupSpec"]] + if "enableInTreeAutoscaling" in dct and dct.get("enableInTreeAutoscaling"): + autoscaling_options = autoscaling_decoder(dct.get("autoscalerOptions", {})) + return ClusterSpec( + head_node=head_node_spec_decoder(dct.get("headGroupSpec")), + worker_groups=workers, + autoscaling_options=autoscaling_options, + ) + + +def cluster_decoder(dct: dict[str, Any]) -> Cluster: + """ + Create cluster from its dictionary representation + :param dct: dictionary representation of cluster + :return: cluster + """ + environment = None + if "environment" in dct: + environment = Environment(int(dct.get("environment", "0"))) + events = None + if "events" in dct: + events = [ClusterEvent(c) for c in dct["events"]] + envs = None + if "envs" in dct: + envs = environment_variables_decoder(dct.get("envs")) + return Cluster( + name=dct.get("name", ""), + namespace=dct.get("namespace", ""), + user=dct.get("user", ""), + version=dct.get("version", ""), + cluster_spec=cluster_spec_decoder(dct.get("clusterSpec")), + deployment_environment=environment, + annotations=dct.get("annotations"), + cluster_environment=envs, + created_at=dct.get("createdAt"), + deleted_at=dct.get("deletedAt"), + cluster_status=dct.get("clusterState"), + events=events, + service_endpoint=dct.get("serviceEndpoint"), + ) + + +def clusters_decoder(dct: dict[str, any]) -> list[Cluster]: + """ + Create list of clusters from its dictionary representation + :param dct: dictionary representation of a list of clusters + :return: list of clusters + """ + return [cluster_decoder(cluster) for cluster in dct["clusters"]] diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py 
b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py new file mode 100644 index 000000000..d1056f6f6 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py @@ -0,0 +1,158 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class EnvVarSource(enum.Enum): + """ + Enumeration of environment sources + """ + + CONFIGMAP = 0 # config map + SECRET = 1 # secret + RESOURCE_FIELD = 2 # resource field + FIELD = 3 # field + + +class EnvVarFrom: + """ + EnvVarFrom is used to define an environment variable from one of the sources (EnvarSource). + It provides APIs to create, stringify, convert to dict and json. + + Methods: + - Create env variable from: gets the following parameters: + Source required - source of environment variable + name required name for config map or secret, container name for resource, path for field + key required Key for config map or secret, resource name for resource + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, source: EnvVarSource, name: str, key: str): + """ + Initialize + :param source - source + :param name source name + :param key source key + """ + self.source = source + self.name = name + self.key = key + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of environment from + """ + return f"source = {self.source.name}, name = {self.name}, key = {self.key}" + + def to_dict(self) -> dict[str, Any]: + """ + convert to dictionary + :return: dictionary representation of environment from + """ + return {"source": self.source.value, "name": self.name, "key": self.key} + + +class EnvironmentVariables: + """ + EnvironmentVariables is used to define environment variables. + It provides APIs to create, stringify, convert to dict and json. 
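+
+    A minimal usage sketch (illustrative only; the variable names and secret shown are assumptions,
+    not part of the API contract):
+
+        from_ref = {"S3_SECRET": EnvVarFrom(source=EnvVarSource.SECRET, name="s3-secret", key="secret_key")}
+        envs = EnvironmentVariables(key_value={"RAY_LOG_LEVEL": "INFO"}, from_ref=from_ref)
+        envs.to_dict()  # -> {"values": {"RAY_LOG_LEVEL": "INFO"}, "valuesFrom": {"S3_SECRET": {"source": 1, ...}}}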
+ + Methods: + - Create env variable from: gets the following parameters: + key_value - optional, dictionary of key/value environment variables + from_ref - optional, dictionary of reference environment variables + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, key_value: dict[str, str] = None, from_ref: dict[str, EnvVarFrom] = None): + """ + Initialization + :param key_value: dictionary of key/value pairs for environment variables + :param from_ref: dictionary of key/value pairs for environment from variables + """ + self.key_val = key_value + self.from_ref = from_ref + + def to_string(self) -> str: + """ + convert to string + :return: string representation of environment variables + """ + val = "" + if self.key_val is not None: + val = f"values = {str(self.key_val)}" + if self.from_ref is not None: + if val != "": + val += " , " + val += "valuesFrom = {" + first = True + for k, v in self.from_ref.items(): + if not first: + val += ", " + else: + first = False + val += f"{k} = [{v.to_string()}]" + val += "}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of environment variables + """ + dst = {} + if self.key_val is not None: + dst["values"] = self.key_val + if self.from_ref is not None: + fr = {} + for k, v in self.from_ref.items(): + fr[k] = v.to_dict() + dst["valuesFrom"] = fr + return dst + + +""" + Creates new environment variable from from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: + """ + Create environment from from dictionary + :param dct: dictionary representations of environment from + :return: environment from + """ + return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) + + +def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: + """ + Create environment variables from from dictionary + :param dct: dictionary representations of environment variables + :return: environment variables + """ + keyvalues = None + fr = None + if "values" in dct: + keyvalues = dct.get("values") + if "valuesFrom" in dct: + from_ref = dct.get("valuesFrom") + fr = {} + for k, v in from_ref.items(): + fr[k] = env_var_from_decoder(v) + return EnvironmentVariables(key_value=keyvalues, from_ref=fr) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py new file mode 100644 index 000000000..7a9d4120f --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py @@ -0,0 +1,202 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import enum +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + environment_variables_decoder, + volume_decoder, +) + + +DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} + + +class ServiceType(enum.Enum): + """ + Enumeration of head node service types + """ + + ClusterIP = "ClusterIP" # cluster IP + NodePort = "NodePort" # node port + LoadBalancer = "LoadBalancer" # load balancer + + +class HeadNodeSpec: + """ + HeadNodeSpec is used to define Ray cluster head node configuration. + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create head node specification: gets the following parameters: + compute_template - required, the computeTemplate of head node group + ray_start_params - required, Ray start parameters + image - optional, image used for head node + service_type - optional (ServiceType), service type foe headnode + enable_ingress - optional, allow to enable ingress for dashboard + volumes - optional, a list of volumes to attach to head node + service_account - optional, a service account (has to exist) to run head node + image_pull_secret - optional, secret to pull head node image from registry + environment - optional, environment variables for head pod + annotations - optional, annotations for head node + labels - optional, labels for head node + image_pull_policy - optional, head node pull image policy. Default IfNotPresent + """ + + def __init__( + self, + compute_template: str, + image: str, + ray_start_params: dict[str, str] = DEFAULT_HEAD_START_PARAMS, + service_type: ServiceType = ServiceType.ClusterIP, + enable_ingress: bool = False, + volumes: list[BaseVolume] = None, + service_account: str = None, + image_pull_secret: str = None, + environment: EnvironmentVariables = None, + annotations: dict[str, str] = None, + labels: dict[str, str] = None, + image_pull_policy: str = None, + ): + """ + Initialization + :param compute_template: compute template + :param ray_start_params: ray start parameters + :param image: node image + :param service_type: service type + :param enable_ingress: enable ingress flag + :param volumes: volumes for head node + :param service_account: service account + :param image_pull_secret: image pull secret + :param environment: head node environment + :param annotations: head node annotation + :param labels: labels + :param image_pull_policy: image pull policy + """ + + self.compute_template = compute_template + self.ray_start_params = ray_start_params + self.ray_start_params.update(DEFAULT_HEAD_START_PARAMS) + self.image = image + self.service_type = service_type + self.enable_ingress = enable_ingress + self.volumes = volumes + self.service_account = service_account + self.image_pull_secret = image_pull_secret + self.environment = environment + self.annotations = annotations + self.labels = labels + self.image_pull_policy = image_pull_policy + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of the head node + """ + val = f"compute template = {self.compute_template}, ray start params = {str(self.ray_start_params)}" + if self.image is not None: + val += f", image = {self.image}" + if self.service_type is not None: + val += f", service_type = {self.service_type.name}" + if self.enable_ingress: + val += ", enable_ingress = True" + if self.service_account is not None: + val += f", 
service_account = {self.service_account}" + if self.image_pull_secret is not None: + val += f", image_pull_secret = {self.image_pull_secret}" + if self.image_pull_policy is not None: + val += f", image_pull_policy = {self.image_pull_policy}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + if self.annotations is not None: + val = val + f",\n annotations = {str(self.annotations)}" + if self.labels is not None: + val = val + f",\n labels = {str(self.labels)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of the head node + """ + dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} + if self.image is not None: + dct["image"] = self.image + if self.service_type is not None: + dct["serviceType"] = self.service_type.value + if self.enable_ingress: + dct["enableIngress"] = True + if self.service_account is not None: + dct["service_account"] = self.service_account + if self.image_pull_secret is not None: + dct["image_pull_secret"] = self.image_pull_secret + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["environment"] = self.environment.to_dict() + if self.annotations is not None: + dct["annotations"] = self.annotations + if self.labels is not None: + dct["labels"] = self.labels + return dct + + +""" + Creates new head node from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def head_node_spec_decoder(dct: dict[str, Any]) -> HeadNodeSpec: + """ + Create head node spec from dictionary + :param dct: dictionary representation of head node spec + :return: Head node spec + """ + service_type = None + if "serviceType" in dct: + service_type = ServiceType(dct.get("serviceType", "ClusterIP")) + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("environment")) > 0: + environments = environment_variables_decoder(dct.get("environment")) + return HeadNodeSpec( + compute_template=dct.get("computeTemplate"), + ray_start_params=dct.get("rayStartParams"), + image=dct.get("image"), + service_type=service_type, + enable_ingress=dct.get("enableIngress", False), + volumes=volumes, + service_account=dct.get("service_account", None), + image_pull_secret=dct.get("imagePullSecret", None), + image_pull_policy=dct.get("imagePullPolicy", None), + environment=environments, + annotations=dct.get("annotations", None), + labels=dct.get("labels", None), + ) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py new file mode 100644 index 000000000..a0b2bfcb0 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py @@ -0,0 +1,163 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +from typing import Any + + +class RayJobRequest: + """ + RayJobRequest used to define job to be submitted to a Ray cluster + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create RayJobRequest: gets the following parameters: + entrypoint - required, the command to start a job on the cluster + submission_id - optional, submission id for the job submission + runtime_env - optional, yaml string specifying job runtime environment + metadata - optional, dictionary of the submission metadata + num_cpus - optional, number of cpus for job execution + num_gpus - optional, number of gpus for job execution + resources - optional, dictionary of the resources for job execution + """ + + def __init__( + self, + entrypoint: str, + submission_id: str = None, + runtime_env: str = None, + metadata: dict[str, str] = None, + num_cpu: float = -1.0, + num_gpu: float = -1.0, + resources: dict[str, str] = None, + ): + """ + Initialization see https://docs.ray.io/en/latest/cluster/running-applications/job-submission/api.html + :param entrypoint: entrypoint + :param submission_id: submission id + :param runtime_env: runtime environment + :param metadata: submission metadata + :param num_cpu: job number cpus + :param num_gpu: job number gpus + :param resources: job custom resources + """ + self.entrypoint = entrypoint + self.submission_id = submission_id + self.runtime_env = runtime_env + self.metadata = metadata + self.num_cpu = num_cpu + self.num_gpu = num_gpu + self.resources = resources + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of job submission + """ + val = f"entrypoint = {self.entrypoint}" + if self.submission_id is not None: + val += f", submission_id = {self.submission_id}" + if self.num_cpu > 0: + val += f", num_cpu = {self.num_cpu}" + if self.num_gpu > 0: + val += f", num_gpu = {self.num_gpu}" + if self.runtime_env is not None: + val += f", runtime_env = {self.runtime_env}" + if self.metadata is not None: + val += f", metadata = {self.metadata}" + if self.resources is not None: + val += f", resources = {self.resources}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of job submission + """ + dct = {"entrypoint": self.entrypoint} + if self.submission_id is not None: + dct["submissionId"] = self.submission_id + if self.runtime_env is not None: + dct["runtimeEnv"] = self.runtime_env + if self.metadata is not None: + dct["metadata"] = self.metadata + if self.num_cpu > 0: + dct["numCpus"] = self.num_cpu + if self.num_gpu > 0: + dct["numGpus"] = self.num_gpu + if self.resources is not None: + dct["resources"] = self.resources + return dct + + +class RayJobInfo: + """ + RayJobInfo used to define information about the job in a Ray cluster + It provides APIs to create and stringify. 
Its output only data, so we do not need to implement to_dict + + Methods: + - Create RayJobRequest: gets the following parameters: + entrypoint - the command to start a job on the cluster + job_id - job execution id + submission_id - submission id for the job submission + runtime_env - job runtime environment + status - job execution status + message - status message + start_time - job start time + end-time - job ind time + error_type - type of error + metadata - optional, dictionary of the submission metadata + """ + + def __init__(self, dct: dict[str, Any]): + """ + Initialize from dictionary + :param dct: dictionary representation of Ray job info + """ + self.entrypoint = dct.get("entrypoint", "") + self.job_id = dct.get("jobId", "") + self.submission_id = dct.get("submissionId", "") + self.status = dct.get("status", "") + self.message = dct.get("message", None) + self.start_time = int(dct.get("startTime", "0")) + self.end_time = int(dct.get("endTime", "0")) + self.error_type = dct.get("ErrorType", None) + self.metadata = dct.get("Metadata", None) + self.runtime_env = dct.get("runtimeEnv", None) + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of Ray job info + """ + val = ( + f"entrypoint = {self.entrypoint}, job id {self.job_id}, submission id = {self.submission_id}," + f" status = {self.status}" + ) + if self.message is not None: + val += f" message = {self.message}" + if self.start_time > 0: + val += ( + f" start time = " + f"{datetime.datetime.fromtimestamp(self.start_time /1.e3).strftime('%Y-%m-%d %H:%M:%S')}" + ) + if self.end_time > 0: + val += ( + f" end time = " f"{datetime.datetime.fromtimestamp(self.end_time / 1e3).strftime('%Y-%m-%d %H:%M:%S')}" + ) + if self.error_type is not None: + val += f" error type = {self.error_type}" + if self.runtime_env is not None: + val += f" runtime env = {str(self.runtime_env)}" + if self.metadata is not None: + val += f" metadata = {str(self.metadata)}" + return val diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py new file mode 100644 index 000000000..0ef4c1583 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py @@ -0,0 +1,224 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class TolerationOperation(enum.Enum): + """ + Toleration operation types + """ + + Exists = "Exists" # exists + Equal = "Equal" # equal + + +class TolerationEffect(enum.Enum): + """ + Toleration effect + """ + + NoSchedule = "NoSchedule" # not schedule + PreferNoSchedule = "PreferNoSchedule" # prefer not schedule + NoExecute = "NoExecute" # not execute + + +class Toleration: + """ + Toleration is used by compute template to pick specific nodes for placing pods. + It provides APIs to create, stringify and convert to dict. 
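+
+    A minimal usage sketch (the key and value are assumptions for illustration):
+
+        toleration = Toleration(key="ray.io/node-type", operator=TolerationOperation.Equal,
+                                effect=TolerationEffect.NoExecute, value="worker")
+        toleration.to_dict()  # -> {"key": "ray.io/node-type", "operator": "Equal", "effect": "NoExecute", "value": "worker"}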
+ + Methods: + - Create toleration: gets the following parameters: + key - required, key created by the node's taint + operator - required, operator to apply, supported operators are "Exists" and "Equal" + effect - required, toleration effect supported effects are "NoSchedule", "PreferNoSchedule", "NoExecute" + value - optional, value + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None): + """ + Initialization + :param key: key + :param operator: operator + :param effect: effect + :param value: value + """ + self.key = key + self.operator = operator + self.value = value + self.effect = effect + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of toleration + """ + val = f"key = {self.key}, operator = {self.operator.name}, effect = {self.effect.name}" + if self.value is None: + return val + else: + return val + f", value = {self.value}" + + def to_dict(self) -> dict[str, Any]: + """ + Convert to string + :return: string representation of toleration + """ + dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value} + if self.value is not None: + dct["value"] = self.value + return dct + + +# Here the default gpu-accelerator is "nvidia.com/gpu", that is used for generating limits. +# If it is specified, it has to be in the format that is understood by kubernetes as a valid +# The following devices are currently supported by kubernetes: +# AMD - gpu accelerator amd.com/gpu +# Intel - gpu accelerator gpu.intel.com/i915 +# NVIDIA - gpu accelerator nvidia.com/gpu + + +class Template: + """ + Template is used to define specific nodes configuration. + It provides APIs to create, stringify and convert to dict. 
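+
+    A minimal usage sketch (name, namespace and sizes are illustrative assumptions):
+
+        template = Template(name="worker-template", namespace="default", cpu=4, memory=16)
+        template.to_dict()  # -> {"name": "worker-template", "namespace": "default", "cpu": 4, "memory": 16}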
+ + Methods: + - Create templates: gets the following parameters: + name - required, template name + namespace - required, template namespace + cpus - required, template number of cpus + memory - required, template memory (GB) + gpus - optional, number of GPUs, default 0 + gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed + tolerations - optional, tolerations for pod placing, default none + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + - to_json() -> str convert to json string + """ + + def __init__( + self, + name: str, + namespace: str, + cpu: int, + memory: int, + gpu: int = 0, + gpu_accelerator: str = None, + tolerations: list[Toleration] = None, + ): + """ + Initialization + :param name: name + :param namespace: namespace + :param cpu: cpu + :param memory: memory + :param gpu: gpu + :param gpu_accelerator: accelerator type + :param tolerations: tolerations + """ + self.name = name + self.namespace = namespace + self.cpu = cpu + self.memory = memory + self.gpu = gpu + self.gpu_accelerator = gpu_accelerator + self.tolerations = tolerations + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of template + """ + val = f"name = {self.name}, namespace = {self.namespace}, cpu = {self.cpu}, memory = {self.memory}" + if self.gpu > 0: + val = val + f", gpu {self.gpu}" + if self.gpu_accelerator is not None: + val = val + f", gpu accelerator {self.gpu_accelerator}" + if self.tolerations is None: + return val + val = val + ", tolerations [" + first = True + for tol in self.tolerations: + if first: + first = False + val = val + "{" + tol.to_string() + "}" + else: + val = val + ", {" + tol.to_string() + "}" + return val + "]" + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of template + """ + dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} + if self.gpu > 0: + dct["gpu"] = self.gpu + if self.gpu_accelerator is not None: + dct["gpu accelerator"] = self.gpu_accelerator + if self.tolerations is not None: + dct["tolerations"] = [tl.to_dict() for tl in self.tolerations] + return dct + + +""" + Creates new toleration from dictionary, used for unmarshalling json. 
Python does not + support multiple constructors, so do it this way +""" + + +def toleration_decoder(dct: dict[str, Any]) -> Toleration: + """ + Create toleration from dictionary + :param dct: dictionary representation of toleration + :return: toleration + """ + return Toleration( + key=dct.get("key"), + operator=TolerationOperation(dct.get("operator", "Exists")), + effect=TolerationEffect(dct.get("effect", "NoSchedule")), + value=dct.get("value"), + ) + + +def template_decoder(dct: dict[str, Any]) -> Template: + """ + Create template from dictionary + :param dct: dictionary representation of template + :return: template + """ + tolerations = None + if "tolerations" in dct: + tolerations = [toleration_decoder(d) for d in dct["tolerations"]] + return Template( + name=dct.get("name"), + namespace=dct.get("namespace"), + cpu=int(dct.get("cpu", "0")), + memory=int(dct.get("memory", "0")), + gpu=int(dct.get("gpu", "0")), + gpu_accelerator=dct.get("gpu_accelerator"), + tolerations=tolerations, + ) + + +def templates_decoder(dct: dict[str, Any]) -> list[Template]: + """ + Create list of template from dictionary + :param dct: dictionary representation of list of template + :return: list of template + """ + return [template_decoder(tmp) for tmp in dct["computeTemplates"]] diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py new file mode 100644 index 000000000..fee0e1ea4 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py @@ -0,0 +1,449 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class HostPath(enum.Enum): + """ + Host path enumeration + """ + + DIRECTORY = 0 # directory + FILE = 1 # files + + +class MountPropagationMode(enum.Enum): + """ + Mount propagation enumeration + """ + + NONE = 0 # None + HOSTTOCONTAINER = 1 # host to container + BIDIRECTIONAL = 2 # bi directional + + +class AccessMode(enum.Enum): + """ + Access mode enumeration + """ + + RWO = 0 # read write once + ROX = 1 # read only many + RWX = 2 # read write many + + +class BaseVolume: + """ + KubeRay currently support several types of volumes, including hostPat, PVC, + ephemeral volumes, config maps, secrets and empty dir. All of them use slightly + different parameters. Base Volume is a base class for all different volume types. + """ + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of base volume + """ + raise Exception(f"Base volume cannot be used directly. Pls use one of the derived classes") + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of base volume + """ + raise Exception(f"Base volume cannot be used directly. 
Pls use one of the derived classes") + + +class HostPathVolume(BaseVolume): + """ + This class implements HostPath volume. In addition to name and mount path it requires host + path volume specific parameters: + source - data location on host + hostPathType - host path type: directory (0) or file (1) + mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) + + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + host_path_type: HostPath = None, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param host_path_type: host path type + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.host_path_type = host_path_type + self.volume_type = 1 + self.mount_propagation = mount_propagation + + def to_string(self) -> str: + """ + Convert to string + :return: HostPathVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = hostPath" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + if self.host_path_type is not None: + val += f", host path type = {self.host_path_type.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: HostPathVolume dictionary representation + """ + dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.mount_propagation is not None: + dst["mountPropagationMode"] = self.mount_propagation.value + if self.host_path_type is not None: + dst["hostPathType"] = self.host_path_type.value + return dst + + +class PVCVolume(BaseVolume): + """ + This class implements PVC volume. In addition to name and mount path it requires + PVC volume specific parameters: + source - PVC claim name + read_only - read only flag + mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + read_only: bool = False, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param read_only: read only + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.volume_type = 0 + self.mount_propagation = mount_propagation + self.readonly = read_only + + def to_string(self) -> str: + """ + Convert to string + :return: PVCVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = PVC" + if self.readonly: + val += ", read only = True" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: PVCVolume dictionary representation + """ + dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.readonly: + dst["readOnly"] = True + if self.mount_propagation is not None: + dst["mountPropagationMode"] = self.mount_propagation.value + return dst + + +class EphemeralVolume(BaseVolume): + """ + This class implements Ephemeral volume. 
In addition to name and mount path it requires + Ephemeral volume specific parameters: + storage - disk size (valid k8 value, for example 5Gi) + storageClass - storage class - optional, if not specified, use default + accessMode - access mode RWO - optional ReadWriteOnce (0), ReadOnlyMAny (1), ReadWriteMany (2) + mountPropagationMode - optional mount propagation: None (0), host to container (1) or bidirectional (2) + """ + + def __init__( + self, + name: str, + mount_path: str, + storage: str, + storage_class: str = None, + access_mode: AccessMode = None, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param storage: storage + :param storage_class: storage class + :param access_mode: access mode + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.storage = storage + self.volume_type = 2 + self.mount_propagation = mount_propagation + self.storage_class = storage_class + self.access_mode = access_mode + + def to_string(self) -> str: + """ + Convert to string + :return: EphemeralVolume string representation + """ + val = ( + f"name = {self.name}, mount_path = {self.mount_path}, storage = {self.storage} " f"volume type = ephemeral" + ) + if self.storage_class is not None: + val += f", storage class = {self.storage_class}" + if self.access_mode is not None: + val += f", access mode = {self.access_mode.name}" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: EphemeralVolume dictionary representation + """ + dct = { + "name": self.name, + "mountPath": self.mount_path, + "storage": self.storage, + "volumeType": self.volume_type, + } + if self.storage_class is not None: + dct["storageClassName"] = self.storage_class + if self.access_mode is not None: + dct["accessMode"] = self.access_mode.value + if self.mount_propagation is not None: + dct["mountPropagationMode"] = self.mount_propagation.value + return dct + + +class EmptyDirVolume(BaseVolume): + """ + This class implements EmptyDir volume. In addition to name and mount path it requires + Empty Dir specific parameters: + storage - optional max storage size (valid k8 value, for example 5Gi) + """ + + def __init__(self, name: str, mount_path: str, storage: str = None): + """ + Initialization + :param name: name + :param mount_path: mount_path + :param storage: storage + """ + self.name = name + self.mount_path = mount_path + self.storage = storage + self.volume_type = 5 + + def to_string(self) -> str: + """ + Convert to string + :return: EmptyDirVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, volume type = emptyDir" + if self.storage is not None: + val += f", storage = {self.storage}" + return val + + def to_dict(self) -> dict[str, Any]: + dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} + if self.storage is not None: + dct["storage"] = self.storage + return dct + + +class ConfigMapVolume(BaseVolume): + """ + This class implements ConfigMap volume. 
In addition to name and mount path it requires + configMap volume specific parameters: + source - required, config map name + items - optional, key/path items (optional) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + items: dict[str, str] = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param items: items + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.items = items + self.volume_type = 3 + + def to_string(self) -> str: + """ + Convert to string + :return: ConfigMapVolume string representation + """ + val = ( + f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = configmap" + ) + if self.items is not None: + val = val + f", items = {str(self.items)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: ConfigMapVolume dictionary representation + """ + dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.items is not None: + dct["items"] = self.items + return dct + + +class SecretVolume(BaseVolume): + """ + This class implements Secret volume. In addition to name and mount path it requires + Secret volume specific parameters: + source - required, secret name + items - optional, key/path items (optional) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + items: dict[str, str] = None, + ): + self.name = name + self.mount_path = mount_path + self.source = source + self.items = items + self.volume_type = 4 + + def to_string(self) -> str: + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = secret" + if self.items is not None: + val = val + f", items = {str(self.items)}" + return val + + def to_dict(self) -> dict[str, Any]: + dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.items is not None: + dct["items"] = self.items + return dct + + +""" + Creates new Volume from dictionary, used for unmarshalling json. 
Python does not + support multiple constructors, so do it this way +""" + + +def volume_decoder(dst: dict[str, Any]) -> BaseVolume: + def _get_mount_propagation() -> MountPropagationMode: + if "mountPropagationMode" in dst: + return MountPropagationMode(int(dst.get("mountPropagationMode", "0"))) + return None + + def _get_host_path() -> HostPath: + if "hostPathType" in dst: + return HostPath(int(dst.get("hostPathType", "0"))) + return None + + def _get_access_mode() -> AccessMode: + if "accessMode" in dst: + return AccessMode(int(dst.get("accessMode", "0"))) + return None + + match dst["volumeType"]: + case 0: + # PVC + return PVCVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + read_only=dst.get("readOnly", False), + mount_propagation=_get_mount_propagation(), + ) + case 1: + # host path + return HostPathVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + host_path_type=_get_host_path(), + mount_propagation=_get_mount_propagation(), + ) + case 2: + # Ephemeral volume + return EphemeralVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + storage=dst.get("storage", ""), + storage_class=dst.get("storageClassName"), + access_mode=_get_access_mode(), + mount_propagation=_get_mount_propagation(), + ) + case 3: + # ConfigMap Volume + return ConfigMapVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + items=dst.get("items"), + ) + case 4: + # Secret Volume + return SecretVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + items=dst.get("items"), + ) + case 5: + # Empty dir volume + return EmptyDirVolume( + name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), storage=dst.get("storage") + ) + case _: + raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py new file mode 100644 index 000000000..ddcf193cc --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py @@ -0,0 +1,206 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + environment_variables_decoder, + volume_decoder, +) + + +DEFAULT_WORKER_START_PARAMS = {"node-ip-address": "$MY_POD_IP"} + + +class WorkerNodeSpec: + """ + WorkerNodeSpec is used to define Ray cluster worker node pool configuration. + It provides APIs to create, stringify and convert to dict. 
+ + Methods: + - Create worker node pool specification: gets the following parameters: + group_name - required, group name of the worker group + compute_template - required, the computeTemplate of worker node group + replicas - required, desired replicas of the worker group + min_replicas - required Min replicas of the worker group, can't be greater than max_replicas + max_replicas - required, max replicas of the worker group + ray_start_params - required, Ray start parameters + image - optional, image used for worker node + volumes - optional, a list of volumes to attach to worker node + service_account - optional, a service account (has to exist) to run worker node + image_pull_secret - optional, secret to pull worker node image from registry + environment - optional, environment variables for worker pod + annotations - optional, annotations for worker node + labels - optional, labels for worker node + image_pull_policy - optional, worker node pull image policy. Default IfNotPresent + """ + + def __init__( + self, + group_name: str, + compute_template: str, + image: str, + max_replicas: int, + replicas: int = 1, + min_replicas: int = 0, + ray_start_params: dict[str, str] = DEFAULT_WORKER_START_PARAMS, + volumes: list[BaseVolume] = None, + service_account: str = None, + image_pull_secret: str = None, + environment: EnvironmentVariables = None, + annotations: dict[str, str] = None, + labels: dict[str, str] = None, + image_pull_policy: str = None, + ): + """ + Initialization + :param group_name: name + :param compute_template: compute template + :param replicas: number of replicas + :param min_replicas: min number of replicas + :param max_replicas: max number of replicas + :param ray_start_params: ray start parameters + :param image: image name + :param volumes: volumes + :param service_account: service account + :param image_pull_secret: image pull secret + :param environment: environment + :param annotations: annotations + :param labels: labels + :param image_pull_policy: image pull policy + """ + # Validate replicas + if min_replicas > replicas: + raise RuntimeError(f"min_replicas {min_replicas} is can't be greater then replicas {replicas} ") + if replicas > max_replicas: + raise RuntimeError(f"replicas {replicas} is can't be greater then max_replicas {max_replicas} ") + + self.group_name = group_name + self.compute_template = compute_template + self.replicas = replicas + self.min_replicas = min_replicas + self.max_replicas = max_replicas + self.ray_start_params = ray_start_params + self.ray_start_params.update(DEFAULT_WORKER_START_PARAMS) + self.image = image + self.volumes = volumes + self.service_account = service_account + self.image_pull_secret = image_pull_secret + self.environment = environment + self.annotations = annotations + self.labels = labels + self.image_pull_policy = image_pull_policy + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of worker node spec + """ + val = ( + f"group_name = {self.group_name}, compute template = {self.compute_template}, " + f"replicas = {self.replicas}, min_replicas = {self.min_replicas}, " + f"max_replicas = {self.max_replicas}, ray start params = {str(self.ray_start_params)}" + ) + if self.image is not None: + val += f", image = {self.image}" + if self.service_account is not None: + val += f", service_account = {self.service_account}" + if self.image_pull_secret is not None: + val += f", image_pull_secret = {self.image_pull_secret}" + if self.image_pull_policy is not None: + val += f", 
image_pull_policy = {self.image_pull_policy}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + if self.annotations is not None: + val = val + f",\n annotations = {str(self.annotations)}" + if self.labels is not None: + val = val + f",\n labels = {str(self.labels)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of worker node spec + """ + dct = { + "groupName": self.group_name, + "computeTemplate": self.compute_template, + "replicas": self.replicas, + "minReplicas": self.min_replicas, + "maxReplicas": self.max_replicas, + "rayStartParams": self.ray_start_params, + } + if self.image is not None: + dct["image"] = self.image + if self.service_account is not None: + dct["service_account"] = self.service_account + if self.image_pull_secret is not None: + dct["imagePullSecret"] = self.image_pull_secret + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["environment"] = self.environment.to_dict() + if self.annotations is not None: + dct["annotations"] = self.annotations + if self.labels is not None: + dct["labels"] = self.labels + return dct + + +""" + Creates new worker node from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def worker_node_spec_decoder(dct: dict[str, Any]) -> WorkerNodeSpec: + """ + Create worker node spec from dictionary + :param dct: dictionary definition of worker node spec + :return: worker node spec + """ + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("environment")) > 0: + environments = environment_variables_decoder(dct.get("environment")) + return WorkerNodeSpec( + group_name=dct.get("groupName"), + compute_template=dct.get("computeTemplate"), + replicas=dct.get("replicas", 0), + min_replicas=dct.get("minReplicas", 0), + max_replicas=dct.get("maxReplicas", 0), + ray_start_params=dct.get("rayStartParams"), + image=dct.get("image"), + volumes=volumes, + service_account=dct.get("service_account", None), + image_pull_secret=dct.get("imagePullSecret", None), + image_pull_policy=dct.get("imagePullPolicy", None), + environment=environments, + annotations=dct.get("annotations", None), + labels=dct.get("labels", None), + ) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md new file mode 100644 index 000000000..4943a0b06 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md @@ -0,0 +1,45 @@ +# Workflow Utils for KFP v1 + +This library provides 3 main classes: +* KFPUtils - helper utilities for KFP implementations +* PipelinesUtils - helper class for pipeline management based on KFP client +* RayRemoteJobs - class supporting Ray remote jobs + +## KFPUtils + +This class contains a collection of functions useful for KFP pipelines implementation, which include: +* credentials - get S3 credentials from the environment +* get_namespace - get the name of the kubernetes namespace we 
are running in
+* runtime_name - generates unique runtime name
+* dict_to_req - convert dictionary of request parameters to a properly formatted JSON string
+* load_from_json - convert json string to dictionary and exit with error if conversion fails
+
+## PipelinesUtils
+
+This class provides some higher level functionality based on the capabilities of the Python KFP client, including:
+* get_experiment_by_name - obtains a KFP experiment object based on its name
+* get_pipeline_by_name - obtains a KFP pipeline object based on its name
+* start_pipeline - starts a pipeline represented by a pipeline object in an experiment represented by an experiment
+object and a dictionary of parameters. It returns the KFP run ID
+* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID
+
+## RayRemoteJobs
+
+At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available,
+including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk),
+[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and
+[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few.
+We use the KubeRay API server APIs here, but in order to simplify a possible transition to other APIs, this class
+implements 4 high-level methods that hide the specifics of the particular API. These methods are:
+* create_ray_cluster - creates a Ray cluster
+* delete_ray_cluster - deletes a Ray cluster
+* submit_job - submits a Ray job to the cluster
+* follow_execution - watches job execution to completion, periodically printing out the job log
+These basic methods can be used as the foundation of any KFP pipeline implementation.
+
+## ComponentUtils
+
+This class provides some methods to simplify building pipelines:
+* add_settings_to_component - adds settings to a component, including timeout, image_pull_policy and cache strategy
+* set_s3_env_vars_to_component - sets environment variables to support S3
+* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements)
\ No newline at end of file
diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py
new file mode 100644
index 000000000..bbe1476fb
--- /dev/null
+++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py
@@ -0,0 +1,3 @@
+from kfp_support.workflow_support.compile_utils.component import (
+    ComponentUtils
+)
diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py
new file mode 100644
index 000000000..1f66bf59f
--- /dev/null
+++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py
@@ -0,0 +1,101 @@
+import kfp.dsl as dsl
+from kfp import kubernetes
+from typing import Dict
+
+RUN_NAME = "KFP_RUN_NAME"
+
+class ComponentUtils:
+    """
+    Class containing methods supporting building pipelines
+    """
+
+    @staticmethod
+    def add_settings_to_component(
+        task: dsl.PipelineTask,
+        timeout: int,
+        image_pull_policy: str = "IfNotPresent",
+        cache_strategy: bool = False,
+    ) -> None:
+        """
+        Add settings to kfp task
+        :param task: kfp task
+        :param timeout: timeout to set to the component 
in seconds
+        :param image_pull_policy: pull policy to set to the component
+        :param cache_strategy: cache strategy
+        """
+
+        kubernetes.use_field_path_as_env(task, env_name=RUN_NAME,
+                                         field_path="metadata.annotations['pipelines.kubeflow.org/run_name']")
+        # Set caching
+        task.set_caching_options(enable_caching=cache_strategy)
+        # image pull policy
+        kubernetes.set_image_pull_policy(task, image_pull_policy)
+        # Set the timeout for the task (in seconds)
+        kubernetes.set_timeout(task, seconds=timeout)
+
+    @staticmethod
+    def set_s3_env_vars_to_component(
+        task: dsl.PipelineTask,
+        secret: str = '',
+        env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'},
+        prefix: str = None,
+    ) -> None:
+        """
+        Set S3 env variables to KFP component
+        :param task: kfp task
+        :param secret: secret name with the S3 credentials
+        :param env2key: dict mapping each env variable to a key in the secret
+        :param prefix: prefix to add to env name
+        """
+
+        if prefix is not None:
+            # copy the items so the dict is not mutated while it is being iterated over
+            for env_name, _ in list(env2key.items()):
+                env2key[prefix + "_" + env_name] = env2key.pop(env_name)
+        kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key)
+
+    @staticmethod
+    def default_compute_execution_params(
+        worker_options: str,  # ray worker configuration
+        actor_options: str,  # cpus per actor
+    ) -> str:
+        """
+        This is the most simplistic transform execution parameters computation
+        :param worker_options: configuration of ray workers
+        :param actor_options: actor request requirements
+        :return: number of actors
+        """
+        import sys
+
+        from data_processing.utils import GB, get_logger
+        from kfp_support.workflow_support.runtime_utils import KFPUtils
+
+        logger = get_logger(__name__)
+
+        # convert input
+        w_options = KFPUtils.load_from_json(worker_options.replace("'", '"'))
+        a_options = KFPUtils.load_from_json(actor_options.replace("'", '"'))
+        # Compute available cluster resources
+        cluster_cpu = w_options["replicas"] * w_options["cpu"]
+        cluster_mem = w_options["replicas"] * w_options["memory"]
+        cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0)
+        logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}")
+        # compute number of actors
+        n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5))
+        n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB))
+        n_actors = min(n_actors_cpu, n_actors_memory)
+        # Check if we need gpu calculations as well
+        actor_gpu = a_options.get("num_gpus", 0)
+        if actor_gpu > 0:
+            n_actors_gpu = int(cluster_gpu / actor_gpu)
+            n_actors = min(n_actors, n_actors_gpu)
+        logger.info(f"Number of actors - {n_actors}")
+        if n_actors < 1:
+            logger.warning(
+                f"Not enough cpu/gpu/memory to run transform, "
+                f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, "
+                f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, "
+                f"required gpu {actor_gpu}, available {cluster_gpu}"
+            )
+            sys.exit(1)
+
+        return str(n_actors)
\ No newline at end of file
diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py
new file mode 100644
index 000000000..d2301bd0a
--- /dev/null
+++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py
@@ -0,0 +1,2 @@
+from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils
+from 
kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py new file mode 100644 index 000000000..ef00b0e92 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py @@ -0,0 +1,113 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import json +import os +import re +import sys +from typing import Any + +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class KFPUtils: + """ + Helper utilities for KFP implementations + """ + + @staticmethod + def credentials( + access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + ) -> tuple[str, str, str]: + """ + Get credentials from the environment + :param access_key: environment variable for access key + :param secret_key: environment variable for secret key + :param endpoint: environment variable for S3 endpoint + :return: + """ + s3_key = os.getenv(access_key, None) + s3_secret = os.getenv(secret_key, None) + s3_endpoint = os.getenv(endpoint, None) + if s3_key is None or s3_secret is None or s3_endpoint is None: + logger.warning("Failed to load s3 credentials") + return s3_key, s3_secret, s3_endpoint + + @staticmethod + def get_namespace() -> str: + """ + Get k8 namespace that we are running it + :return: + """ + ns = "" + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + except Exception as e: + logger.warning( + f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" + ) + else: + with file: + ns = file.read() + return ns + + @staticmethod + def runtime_name(ray_name: str = "", run_id: str = "") -> str: + """ + Get unique runtime name + :param ray_name: + :param run_id: + :return: runtime name + """ + # K8s objects cannot contain special characters, except '_', All characters should be in lower case. + if ray_name != "": + ray_name = ray_name.replace("_", "-").lower() + pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. + ray_name = re.sub(pattern, "", ray_name) + else: + ray_name = "a" + # the return value plus namespace name will be the name of the Ray Route, + # which length is restricted to 64 characters, + # therefore we restrict the return name by 15 character. 
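+        # Illustrative example (assumed inputs): ray_name="doc_quality", run_id="1234567890"
+        # yields "doc-quali-12345": the name is sanitized to "doc-quality", truncated to 9 characters,
+        # and combined with the first 5 characters of the run id.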
+ if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if str(value) != "": + if isinstance(value, str): + if '"' in value: + logger.warning(f"can't parse inputs with double quotation marks, please use single quotation marks instead") + res += f'--{key}="{value}" ' + elif isinstance(value, bool): + if value: + res += f"--{key} " + else: + res += f"--{key}={value} " + + logger.info(f"request to execute: {res}") + return res + + # Load a string that represents a json to python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py new file mode 100644 index 000000000..39d4d9e64 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py @@ -0,0 +1,527 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import re +import sys +import time +from typing import Any + +from data_processing.data_access import DataAccess, DataAccessFactory +from data_processing.utils import ParamsUtils, get_logger +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from kfp_support.workflow_support.runtime_utils import KFPUtils +from ray.job_submission import JobStatus + + +logger = get_logger(__name__) + + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. 
Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + image_pull_policy: image pull policy, default IfNotPresent + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be created) + environment - dictionary of node of this group environment + annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + image_pull_policy: image pull policy, default IfNotPresent + + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + 
namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = worker_node.get("service_account", None) + environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster 
namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Although the cluster is ready, the service web server might not be ready yet at this point. + # To ensure that it is ready, trying to get jobs info from the cluster. Even if it fails + # couple of times, its harmless + _, _, _ = self.api_server_client.list_job_info(ns=namespace, name=name) + time.sleep(5) + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta between current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access - data access class + :param job_ready_timeout: timeout to wait for fob to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, 
submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for job become ready, exiting") + sys.exit(1) + # While job is running print log + previous_log_len = 0 + # At this point job could succeeded, failed, stop or running. So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {job_status}") + if job_status != JobStatus.SUCCEEDED: + sys.exit(1) + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.get_output_folder() + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +def _execute_remote_job( + name: str, + ns: str, + script: str, + params: dict[str, Any], + data_access_params: dict[str, Any], + additional_params: dict[str, Any], + remote_jobs: RayRemoteJobs, +) -> None: + """ + Execute remote job on Ray cluster + :param name: cluster name + :param ns: execution/cluster namespace + :param additional_params: additional parameters for the job + :param data_access_params: data access parameters + :param params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param script: script to run (has to be present in the image) + :param remote_jobs: remote jobs execution support class + :return: + """ + + status, error, submission = remote_jobs.submit_job(name=name, namespace=ns, request=params, executor=script) + if status != 200: + logger.error(f"Failed to submit job - status: {status}, error: {error}") + exit(1) + + logger.info(f"submitted job successfully, submission id {submission}") + # create data access + data_factory = DataAccessFactory() + data_factory.apply_input_params(args=data_access_params) + data_access = data_factory.create_data_access() + # print execution log + remote_jobs.follow_execution( + name=name, + namespace=ns, + 
submission_id=submission, + data_access=data_access, + print_timeout=additional_params.get("wait_print_tmout", 120), + job_ready_timeout=additional_params.get("wait_job_ready_tmout", 600), + ) + + +def execute_ray_jobs( + name: str, # name of Ray cluster + additional_params: dict[str, Any], + e_params: dict[str, Any], + exec_script_name: str, + server_url: str, +) -> None: + """ + Execute Ray jobs on a cluster periodically printing execution log. Completes when all Ray job complete. + All of the jobs will be executed, although some of the jobs may fail. + :param name: cluster name + :param additional_params: additional parameters for the job + :param e_params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param exec_script_name: script to run (has to be present in the image) + :param server_url: API server url + :return: None + """ + # prepare for execution + ns = KFPUtils.get_namespace() + if ns == "": + logger.warning(f"Failed to get namespace") + sys.exit(1) + # create remote jobs class + remote_jobs = RayRemoteJobs( + server_url=server_url, + http_retries=additional_params.get("http_retries", 5), + wait_interval=additional_params.get("wait_interval", 2), + ) + # find config parameter + config = ParamsUtils.get_config_parameter(e_params) + if config is None: + exit(1) + # get config value + config_value = KFPUtils.load_from_json(e_params[config].replace("'", '"')) + s3_creds = KFPUtils.load_from_json(e_params["data_s3_cred"].replace("'", '"')) + if type(config_value) is not list: + # single request + return _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: config_value, "data_s3_cred": s3_creds}, + params=e_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + # remove config key from the dictionary + launch_params = dict(e_params) + del launch_params[config] + # Loop through all configuration + n_launches = 0 + for conf in config_value: + # populate individual config and launch + launch_params[config] = ParamsUtils.convert_to_ast(d=conf) + try: + _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: conf, "data_s3_cred": s3_creds}, + params=launch_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + n_launches += 1 + except SystemExit: + logger.warning(f"Failed to execute job for configuration {conf}") + continue + + if n_launches == 0: + logger.warning("All executions failed") + sys.exit(1) + else: + logger.info(f"{n_launches} ot of {len(config_value)} succeeded") diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md new file mode 100644 index 000000000..472c39136 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md @@ -0,0 +1,36 @@ +# Workflow Utils for KFPv2 + +This library provides 3 main classes: +* KFPUtils - helper utilities for KFP implementations +* PipelinesUtils - helper class for pipeline management based on KFP client +* RayRemoteJobs - class supporting Ray remote jobs + +## KFPUtils + +This class contains a collection of functions useful for KFP pipelines implementation, which include: +* credentials - get S3 credentials from the environment +* get_namespace - get the name of the kubernetes namespace we are running in +* runtime_name - generates unique runtime name +* dict_to_req - convert dictionary of request parameters to a proper 
formatted JSON string +* load_from_json - convert json string to dictionary and exit with error if conversion fails + +## RayRemoteJobs + +At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, +including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) +[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and +[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. +We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class +implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: +* create_ray_cluster - creates Ray cluster. +* delete_ray_cluster - deletes Ray cluster. +* submit_job - submits Ray job to the cluster +* follow_execution - watching job execution to completion, periodically printing out the job log +These basic methods can be used as a foundation of any KFP pipeline implementation + +## ComponentUtils + +This class provides some methods to simplify building pipelines: +* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy +* set_cos_env_vars_to_component - sets environment variables to support S3 +* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py new file mode 100644 index 000000000..9297ede66 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py @@ -0,0 +1,3 @@ +from kfp_support.workflow_support.components_utils.component import ( + CompileComponentUtils +) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py new file mode 100644 index 000000000..adaa971c1 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py @@ -0,0 +1,54 @@ +import kfp.dsl as dsl +from kfp import kubernetes +from typing import Dict + +RUN_NAME = "KFP_RUN_NAME" + +class CompileComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + @staticmethod + def add_settings_to_component( + task: dsl.PipelineTask, + timeout: int, + image_pull_policy: str = "IfNotPresent", + cache_strategy: bool = False, + ) -> None: + """ + Add settings to kfp task + :param task: kfp task + :param timeout: timeout to set to the component in seconds + :param image_pull_policy: pull policy to set to the component + :param cache_strategy: cache strategy + """ + + kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # Set cashing + task.set_caching_options(enable_caching=cache_strategy) + # image pull policy + kubernetes.set_image_pull_policy(task, image_pull_policy) + # Set the timeout for the task to one day 
(in seconds) + kubernetes.set_timeout(task, seconds=timeout) + + @staticmethod + def set_s3_env_vars_to_component( + task: dsl.PipelineTask, + secret: str = '', + env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, + prefix: str = None, + ) -> None: + """ + Set S3 env variables to KFP component + :param task: kfp task + :param secret: secret name with the S3 credentials + :param env2key: dict with mapping each env variable to a key in the secret + :param prefix: prefix to add to env name + """ + + if prefix is not None: + for env_name, _ in env2key.items(): + env2key[prefix + "_" + env_name] = env2key.pop(env_name) + kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py new file mode 100644 index 000000000..3a6ab1263 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py @@ -0,0 +1,8 @@ +from kfp_support.workflow_support.runtime_utils.workflow_utils import ( + KFPUtils, + RayRemoteJobs, + ComponentUtils, + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, +) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py new file mode 100644 index 000000000..7328c740d --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py @@ -0,0 +1,557 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import datetime +import json +import os +import re +import sys +import time +from typing import Any, Optional + +from data_processing.data_access import DataAccess +from data_processing.utils import get_logger +import kfp_server_api +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from ray.job_submission import JobStatus + +logger = get_logger(__name__) + +ONE_HOUR_SEC = 60 * 60 +ONE_DAY_SEC = ONE_HOUR_SEC * 24 +ONE_WEEK_SEC = ONE_DAY_SEC * 7 + +class KFPUtils: + """ + Helper utilities for KFP implementations + """ + + @staticmethod + def credentials( + access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + ) -> tuple[str, str, str]: + """ + Get credentials from the environment + :param access_key: environment variable for access key + :param secret_key: environment variable for secret key + :param endpoint: environment variable for S3 endpoint + :return: + """ + s3_key = os.getenv(access_key, None) + s3_secret = os.getenv(secret_key, None) + s3_endpoint = os.getenv(endpoint, None) + if s3_key is None or s3_secret is None or s3_endpoint is None: + logger.warning("Failed to load s3 credentials") + return s3_key, s3_secret, s3_endpoint + + @staticmethod + def get_namespace() -> str: + """ + Get k8 namespace that we are running it + :return: + """ + ns = "" + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + except Exception as e: + logger.warning( + f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" + ) + else: + with file: + ns = file.read() + return ns + + @staticmethod + def runtime_name(ray_name: str = "", run_id: str = "") -> str: + """ + Get unique runtime name + :param ray_name: + :param run_id: + :return: runtime name + """ + # K8s objects cannot contain special characters, except '_', All characters should be in lower case. + if ray_name != "": + ray_name = ray_name.replace("_", "-").lower() + pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. + ray_name = re.sub(pattern, "", ray_name) + else: + ray_name = "a" + # the return value plus namespace name will be the name of the Ray Route, + # which length is restricted to 64 characters, + # therefore we restrict the return name by 15 character. 
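+        # For illustration (hypothetical values): runtime_name(ray_name="doc_quality",
+        # run_id="abc123456") would yield "doc-quali-abc12", while runtime_name() with no
+        # arguments falls back to the single-character name "a".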
+ if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if isinstance(value, str): + res += f'--{key}="{value}" ' + else: + res += f"--{key}={value} " + return res + + # Load a string that represents a json to python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be created) + environment - dictionary of node of this group environment + annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + 
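# Recreate the head-node compute template: delete any stale template with the same name
+        # first (the returned status is deliberately ignored), then build a fresh Template from
+        # the cpu/memory/gpu values extracted above.
+        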
_, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = worker_node.get("service_account", None) + environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + 
labels=labels, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta between current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access - 
data access class + :param job_ready_timeout: timeout to wait for fob to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for job become ready, exiting") + sys.exit(1) + # While job is running print log + previous_log_len = 0 + # At this point job could succeeded, failed, stop or running. So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {status}") + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.output_folder + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +class ComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + # @staticmethod + # def add_settings_to_component( + # task: dsl.PipelineTask, + # timeout: int, + # image_pull_policy: str = "IfNotPresent", + # cache_strategy: bool = False, + # ) -> None: + # """ + # Add settings to kfp task + # :param task: kfp task + # :param timeout: timeout to set to the component in seconds + # :param image_pull_policy: pull policy to set to the component + # :param cache_strategy: cache strategy + # """ + # + # kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # # Set cashing + # task.set_caching_options(enable_caching=cache_strategy) + # # image pull policy + # kubernetes.set_image_pull_policy(task, image_pull_policy) + # # Set the timeout for the task to one day (in seconds) + # kubernetes.set_timeout(task, seconds=timeout) + + 
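+    # Illustrative (hypothetical) inputs for default_compute_execution_params below:
+    #   worker_options = "{'replicas': 5, 'cpu': 2, 'memory': 8}"
+    #   actor_options  = "{'num_cpus': 0.8}"
+    # This describes a cluster with 10 CPUs and 40 GB of memory; 85% of it is considered
+    # usable, so the method would return str(min(int(10 * 0.85 / 0.8), int(40 * 0.85 / 1))) == "10".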
+ @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import get_logger + from kfp_support.workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / a_options.get("memory", 1)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) diff --git a/kfp/kfp_support_lib_v2/test/api_params_test.py b/kfp/kfp_support_lib_v2/test/api_params_test.py new file mode 100644 index 000000000..804c84aad --- /dev/null +++ b/kfp/kfp_support_lib_v2/test/api_params_test.py @@ -0,0 +1,433 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import json + +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + AccessMode, + AutoscalerOptions, + Cluster, + ClusterEvent, + ClusterSpec, + ConfigMapVolume, + EmptyDirVolume, + Environment, + EnvironmentVariables, + EnvVarFrom, + EnvVarSource, + EphemeralVolume, + HeadNodeSpec, + HostPath, + HostPathVolume, + MountPropagationMode, + PVCVolume, + RayJobInfo, + RayJobRequest, + SecretVolume, + ServiceType, + Template, + Toleration, + TolerationEffect, + TolerationOperation, + WorkerNodeSpec, + autoscaling_decoder, + cluster_decoder, + cluster_spec_decoder, + env_var_from_decoder, + environment_variables_decoder, + head_node_spec_decoder, + template_decoder, + toleration_decoder, + volume_decoder, + worker_node_spec_decoder, +) + + +def test_toleration(): + + tol1 = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) + print(f"\ntoleration 1: {tol1.to_string()}") + t1_json = json.dumps(tol1.to_dict()) + print(f"toleration 1 JSON: {t1_json}") + + tol2 = Toleration( + key="blah2", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute, value="value" + ) + print(f"toleration 2: {tol2.to_string()}") + t2_json = json.dumps(tol2.to_dict()) + print(f"toleration 2 JSON: {t2_json}") + + assert tol1.to_string() == toleration_decoder(json.loads(t1_json)).to_string() + assert tol2.to_string() == toleration_decoder(json.loads(t2_json)).to_string() + + +def test_templates(): + + tol1 = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) + tol2 = Toleration( + key="blah2", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute, value="value" + ) + + temp1 = Template(name="template1", namespace="namespace", cpu=1, memory=4, tolerations=[tol1, tol2]) + print(f"\ntemplate 1: {temp1.to_string()}") + tm1_json = json.dumps(temp1.to_dict()) + print(f"template 1 JSON: {tm1_json}") + + temp2 = Template(name="template2", namespace="namespace", cpu=2, memory=8, gpu=1) + print(f"template 2: {temp2.to_string()}") + tm2_json = json.dumps(temp2.to_dict()) + print(f"template 2 JSON: {tm2_json}") + + assert temp1.to_string() == template_decoder(json.loads(tm1_json)).to_string() + assert temp2.to_string() == template_decoder(json.loads(tm2_json)).to_string() + + +def test_volumes(): + + # hostPath + vol = HostPathVolume( + name="hostPath", + mount_path="tmp/hostPath", + source="source", + host_path_type=HostPath.FILE, + mount_propagation=MountPropagationMode.NONE, + ) + print(f"\nhostPath volume: {vol.to_string()}") + vol_json = json.dumps(vol.to_dict()) + print(f"host path volume json: {vol_json}") + assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() + + vol = PVCVolume( + name="pvc", + mount_path="tmp/pvc", + source="claim", + read_only=True, + mount_propagation=MountPropagationMode.BIDIRECTIONAL, + ) + print(f"PVC volume: {vol.to_string()}") + vol_json = json.dumps(vol.to_dict()) + print(f"PVC volume json: {vol_json}") + assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() + + vol = EphemeralVolume( + name="ephemeral", mount_path="tmp/ephemeral", storage="5Gi", storage_class="blah", access_mode=AccessMode.RWX + ) + print(f"Ephemeral volume: {vol.to_string()}") + vol_json = json.dumps(vol.to_dict()) + print(f"Ephemeral volume json: {vol_json}") + assert volume_decoder(json.loads(vol_json)).to_string() == 
vol.to_string() + + vol = EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir") + print(f"Empty dir volume: {vol.to_string()}") + vol_json = json.dumps(vol.to_dict()) + print(f"Empty dir volume json: {vol_json}") + assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() + + vol = ConfigMapVolume( + name="confmap", mount_path="tmp/confmap", source="my-map", items={"sample_code.py": "sample_code.py"} + ) + print(f"config map volume: {vol.to_string()}") + vol_json = json.dumps(vol.to_dict()) + print(f"config map volume json: {vol_json}") + assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() + + vol = SecretVolume(name="secret", mount_path="tmp/secret", source="my-secret") + print(f"secret volume: {vol.to_string()}") + vol_json = json.dumps(vol.to_dict()) + print(f"secret volume json: {vol_json}") + assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() + + +def test_environment(): + + env_v = EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key") + print(f"\nEnv variable from: {env_v.to_string()}") + env_v_json = json.dumps(env_v.to_dict()) + print(f"Env variable from JSON: {env_v_json}") + assert env_var_from_decoder(json.loads(env_v_json)).to_string() == env_v.to_string() + + envs = EnvironmentVariables(key_value={"key": "val"}, from_ref={"key_ref": env_v}) + print(f"Env variables: {envs.to_string()}") + envs_json = json.dumps(envs.to_dict()) + print(f"Env variables JSON: {envs_json}") + assert environment_variables_decoder(json.loads(envs_json)).to_string() == envs.to_string() + + envs = EnvironmentVariables(from_ref={"key_ref": env_v}) + print(f"Env variables: {envs.to_string()}") + envs_json = json.dumps(envs.to_dict()) + print(f"Env variables JSON: {envs_json}") + assert environment_variables_decoder(json.loads(envs_json)).to_string() == envs.to_string() + + envs = EnvironmentVariables(key_value={"key": "val"}) + print(f"Env variables: {envs.to_string()}") + envs_json = json.dumps(envs.to_dict()) + print(f"Env variables JSON: {envs_json}") + assert environment_variables_decoder(json.loads(envs_json)).to_string() == envs.to_string() + + +def test_head_node_spec(): + + env_v = EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key") + env_s = EnvironmentVariables(key_value={"key": "val"}, from_ref={"key_ref": env_v}) + volumes = [ + PVCVolume( + name="pvc", + mount_path="tmp/pvc", + source="claim", + read_only=True, + mount_propagation=MountPropagationMode.BIDIRECTIONAL, + ), + EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), + ] + + head = HeadNodeSpec( + compute_template="template", + image="rayproject/ray:2.9.0-py310", + ray_start_params=DEFAULT_HEAD_START_PARAMS, + enable_ingress=True, + service_type=ServiceType.ClusterIP, + volumes=volumes, + environment=env_s, + image_pull_policy="Always", + ) + print(f"\nhead node: {head.to_string()}") + head_json = json.dumps(head.to_dict()) + print(f"head node JSON: {head_json}") + assert head_node_spec_decoder(json.loads(head_json)).to_string() == head.to_string() + + +def test_worker_node_spec(): + + env_v = EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key") + env_s = EnvironmentVariables(key_value={"key": "val"}, from_ref={"key_ref": env_v}) + volumes = [ + PVCVolume( + name="pvc", + mount_path="tmp/pvc", + source="claim", + read_only=True, + mount_propagation=MountPropagationMode.BIDIRECTIONAL, + ), + EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), + ] + + worker = WorkerNodeSpec( + group_name="group", + 
compute_template="template", + image="rayproject/ray:2.9.0-py310", + replicas=2, + min_replicas=2, + max_replicas=2, + volumes=volumes, + ray_start_params=DEFAULT_WORKER_START_PARAMS, + environment=env_s, + labels={"key": "value"}, + image_pull_policy="IfNotPresent", + ) + print(f"\nworker node: {worker.to_string()}") + worker_json = json.dumps(worker.to_dict()) + print(f"worker node JSON: {worker_json}") + assert worker_node_spec_decoder(json.loads(worker_json)).to_string() == worker.to_string() + + +def test_autoscaler_options(): + options = AutoscalerOptions() + print(f"\nautoscaler options: {options.to_string()}") + options_json = json.dumps(options.to_dict()) + print(f"autoscaler options JSON: {options_json}") + assert autoscaling_decoder(json.loads(options_json)).to_string() == options.to_string() + + options = AutoscalerOptions(cpus="1.0", memory="64GB") + print(f"\nautoscaler options: {options.to_string()}") + options_json = json.dumps(options.to_dict()) + print(f"autoscaler options JSON: {options_json}") + assert autoscaling_decoder(json.loads(options_json)).to_string() == options.to_string() + + +def test_cluster_spec(): + env_s = EnvironmentVariables( + key_value={"key": "val"}, + from_ref={"key_ref": EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key")}, + ) + volumes = [ + PVCVolume( + name="pvc", + mount_path="tmp/pvc", + source="claim", + read_only=True, + mount_propagation=MountPropagationMode.BIDIRECTIONAL, + ), + EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), + ] + spec = ClusterSpec( + head_node=HeadNodeSpec( + compute_template="template", + image="rayproject/ray:2.9.0-py310", + ray_start_params=DEFAULT_HEAD_START_PARAMS, + volumes=volumes, + enable_ingress=True, + service_type=ServiceType.ClusterIP, + environment=env_s, + ), + worker_groups=[ + WorkerNodeSpec( + group_name="group", + compute_template="template", + replicas=2, + min_replicas=2, + max_replicas=2, + image="rayproject/ray:2.9.0-py310", + ray_start_params=DEFAULT_WORKER_START_PARAMS, + volumes=volumes, + environment=env_s, + labels={"key": "value"}, + ), + WorkerNodeSpec( + group_name="group1", + compute_template="template1", + replicas=2, + min_replicas=2, + max_replicas=2, + image="rayproject/ray:2.9.0-py310", + ray_start_params=DEFAULT_WORKER_START_PARAMS, + volumes=volumes, + environment=env_s, + labels={"key": "value"}, + ), + ], + autoscaling_options=AutoscalerOptions(), + ) + print(f"\ncluster spec: {spec.to_string()}") + spec_json = json.dumps(spec.to_dict()) + print(f"cluster spec JSON: {spec_json}") + assert cluster_spec_decoder(json.loads(spec_json)).to_string() == spec.to_string() + + +def test_cluster(): + + event = { + "id": "id", + "name": "name", + "created_at": "ts", + "first_timestamp": "ts", + "last_timestamp": "ts", + "reason": "reason", + "message": "message", + "type": "warning", + "count": "1", + } + print(f"\ncluster event: {ClusterEvent(event).to_string()}") + env_s = EnvironmentVariables( + key_value={"key": "val"}, + from_ref={"key_ref": EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key")}, + ) + volumes = [ + PVCVolume( + name="pvc", + mount_path="tmp/pvc", + source="claim", + read_only=True, + mount_propagation=MountPropagationMode.BIDIRECTIONAL, + ), + EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), + ] + spec = ClusterSpec( + head_node=HeadNodeSpec( + compute_template="template", + ray_start_params=DEFAULT_HEAD_START_PARAMS, + enable_ingress=True, + service_type=ServiceType.ClusterIP, + volumes=volumes, + 
environment=env_s, + annotations={"a_key": "a_val"}, + image="rayproject/ray:2.9.0-py310", + ), + worker_groups=[ + WorkerNodeSpec( + group_name="group", + compute_template="template", + replicas=2, + min_replicas=2, + max_replicas=2, + image="rayproject/ray:2.9.0-py310", + ray_start_params=DEFAULT_WORKER_START_PARAMS, + volumes=volumes, + environment=env_s, + labels={"key": "value"}, + ), + WorkerNodeSpec( + group_name="group1", + compute_template="template1", + replicas=2, + min_replicas=2, + max_replicas=2, + image="rayproject/ray:2.9.0-py310", + ray_start_params=DEFAULT_WORKER_START_PARAMS, + volumes=volumes, + environment=env_s, + labels={"key": "value"}, + ), + ], + ) + cluster = Cluster( + name="test", + namespace="default", + user="boris", + version="2.9.0", + cluster_spec=spec, + deployment_environment=Environment.DEV, + cluster_environment=env_s, + ) + print(f"cluster: {cluster.to_string()}") + cluster_json = json.dumps(cluster.to_dict()) + print(f"cluster JSON: {cluster_json}") + assert cluster_decoder(json.loads(cluster_json)).to_string() == cluster.to_string() + + cluster_dict = cluster.to_dict() + cluster_dict["created_at"] = "created" + cluster_dict["created_status"] = "status" + cluster_dict["events"] = [event] + print(f"cluster with output: {cluster_decoder(cluster_dict).to_string()}") + + +def test_submission(): + yaml = """ + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: test_counter + """ + request = RayJobRequest(entrypoint="python /home/ray/samples/sample_code.py", runtime_env=yaml, num_cpu=0.5) + print(f"job request: {request.to_string()}") + request_json = json.dumps(request.to_dict()) + print(f"request JSON: {request_json}") + + info_json = """ + { + "entrypoint":"python /home/ray/samples/sample_code.py", + "jobId":"02000000", + "submissionId":"raysubmit_KWZLwme56esG3Wcr", + "status":"SUCCEEDED", + "message":"Job finished successfully.", + "startTime":"1699442662879", + "endTime":"1699442682405", + "runtimeEnv":{ + "env_vars":"map[counter_name:test_counter]", + "pip":"[requests==2.26.0 pendulum==2.1.2]" + } + } + """ + job_info = RayJobInfo(json.loads(info_json)) + print(job_info.to_string()) diff --git a/kfp/kfp_support_lib_v2/test/configmaps.py b/kfp/kfp_support_lib_v2/test/configmaps.py new file mode 100644 index 000000000..65e53e828 --- /dev/null +++ b/kfp/kfp_support_lib_v2/test/configmaps.py @@ -0,0 +1,72 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from kubernetes import client, config + + +CMAP_VALUE = """ +import ray +import os +import requests + +ray.init() + +@ray.remote +class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + +counter = Counter.remote() + +for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + +# Verify that the correct runtime env was used for the job. +assert requests.__version__ == "2.26.0" +""" +CMAP_NAME = "ray-job-code-sample" + + +class ConfigmapsManager: + """ + Simple support class to manage config maps. Assumes local access to Kubectl + """ + + def __init__(self): + config.load_kube_config() + self.api_instance = client.CoreV1Api() + + def list_configmaps(self) -> list[str]: + cm_list = self.api_instance.list_namespaced_config_map(namespace="default").items + return [cm.metadata.name for cm in cm_list] + + def create_code_map(self) -> None: + cmap = client.V1ConfigMap() + cmap.metadata = client.V1ObjectMeta(name=CMAP_NAME) + cmap.data = {"sample_code.py": CMAP_VALUE} + self.api_instance.create_namespaced_config_map(namespace="default", body=cmap) + + def delete_code_map(self) -> None: + try: + self.api_instance.delete_namespaced_config_map(name="ray-job-code-sample", namespace="default") + except Exception as e: + print("config map ray-job-code-sample does not exist") diff --git a/kfp/kfp_support_lib_v2/test/kuberay_api_test.py b/kfp/kfp_support_lib_v2/test/kuberay_api_test.py new file mode 100644 index 000000000..b2a444ce3 --- /dev/null +++ b/kfp/kfp_support_lib_v2/test/kuberay_api_test.py @@ -0,0 +1,297 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
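# Illustrative sketch, not part of the patch: ConfigmapsManager above assumes
# a locally configured kubeconfig and the `default` namespace. Typical usage,
# mirroring what the API tests below do before creating a cluster:
from configmaps import CMAP_NAME, ConfigmapsManager

manager = ConfigmapsManager()
manager.delete_code_map()      # tolerant of a missing config map
manager.create_code_map()      # creates the ray-job-code-sample config map
assert CMAP_NAME in manager.list_configmaps()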
+################################################################################ + +import time + +from configmaps import ConfigmapsManager +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + ConfigMapVolume, + EnvironmentVariables, + HeadNodeSpec, + RayJobRequest, + ServiceType, + Template, + Toleration, + TolerationEffect, + TolerationOperation, + WorkerNodeSpec, +) + + +def test_templates(): + """ + Test template + """ + # create API server + apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + # cleanup + _, _ = apis.delete_compute_template(ns="default", name="default-template") + # create + toleration = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) + template = Template(name="default-template", namespace="default", cpu=2, memory=8, tolerations=[toleration]) + status, error = apis.create_compute_template(template) + assert status == 200 + assert error is None + # duplicate create should fail + status, error = apis.create_compute_template(template) + assert status != 200 + assert error is not None + print(f"\nstatus {status}, error code: {str(error)}") + # get + status, error, t = apis.get_compute_template(ns="default", name="default-template") + assert status == 200 + assert error is None + assert template.to_string() == t.to_string() + # list + status, error, template_array = apis.list_compute_templates() + assert status == 200 + assert error is None + assert template.to_string() == template_array[0].to_string() + # list ns + status, error, template_array = apis.list_compute_templates_namespace(ns="default") + assert status == 200 + assert error is None + assert template.to_string() == template_array[0].to_string() + # delete + status, error = apis.delete_compute_template(ns="default", name="default-template") + assert status == 200 + assert error is None + # duplicate delete should fail + status, error = apis.delete_compute_template(ns="default", name="default-template") + assert status != 200 + assert error is not None + print(f"status: {status}, err = {str(error)}") + + +def test_cluster(): + """ + Test cluster + """ + # create API server + apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + # cleanup + _, _ = apis.delete_compute_template(ns="default", name="default-template") + _, _ = apis.delete_cluster(ns="default", name="test") + # Create configmap + cm_manager = ConfigmapsManager() + cm_manager.delete_code_map() + cm_manager.create_code_map() + # Create template first + template = Template(name="default-template", namespace="default", cpu=2, memory=4) + status, error = apis.create_compute_template(template) + assert status == 200 + assert error is None + # cluster + volume = ConfigMapVolume( + name="code-sample", + mount_path="/home/ray/samples", + source="ray-job-code-sample", + items={"sample_code.py": "sample_code.py"}, + ) + environment = EnvironmentVariables(key_value={"key": "value"}) + head = HeadNodeSpec( + compute_template="default-template", + ray_start_params={"metrics-export-port": "8080", "num-cpus": "0"}, + image="rayproject/ray:2.9.3-py310", + service_type=ServiceType.ClusterIP, + volumes=[volume], + environment=environment, + image_pull_policy="Always", + ) + worker = WorkerNodeSpec( + group_name="small", + compute_template="default-template", + replicas=1, + min_replicas=1, + max_replicas=1, + ray_start_params=DEFAULT_WORKER_START_PARAMS, + image="rayproject/ray:2.9.3-py310", + 
volumes=[volume], + environment=environment, + image_pull_policy="Always", + ) + t_cluster = Cluster( + name="test", + namespace="default", + user="boris", + version="2.9.0", + cluster_spec=ClusterSpec(head_node=head, worker_groups=[worker]), + ) + # create + status, error = apis.create_cluster(t_cluster) + assert status == 200 + assert error is None + # get + status, error, c = apis.get_cluster(ns="default", name="test") + assert status == 200 + assert error is None + print(f"\ngot cluster: {c.to_string()}") + # list + status, error, clusters = apis.list_clusters() + assert status == 200 + assert error is None + assert len(clusters) == 1 + print(f"got cluster: {clusters[0].to_string()}") + # list namespace + status, error, clusters = apis.list_clusters_namespace(ns="default") + assert status == 200 + assert error is None + assert len(clusters) == 1 + print(f"got cluster: {clusters[0].to_string()}") + # get cluster status + status, error, cs = apis.get_cluster_status(ns="default", name="test") + assert status == 200 + assert error is None + print(f"cluster status is {cs}") + # Wait for the cluster to get ready + status, error = apis.wait_cluster_ready(ns="default", name="test") + assert status == 200 + assert error is None + # get endpoints + status, error, endpoint = apis.get_cluster_endpoints(ns="default", name="test") + assert status == 200 + assert error is None + print(f"cluster endpoints is {endpoint}") + # delete cluster + status, error = apis.delete_cluster(ns="default", name="test") + assert status == 200 + assert error is None + # delete template + status, error = apis.delete_compute_template(ns="default", name="default-template") + assert status == 200 + assert error is None + + +def test_job_submission(): + """ + Test job submission + :return: + """ + # create API server + apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + # cleanup + _, _ = apis.delete_compute_template(ns="default", name="default-template") + _, _ = apis.delete_cluster(ns="default", name="test-job") + # Create configmap + cm_manager = ConfigmapsManager() + cm_manager.delete_code_map() + cm_manager.create_code_map() + # Create template first + template = Template(name="default-template", namespace="default", cpu=2, memory=4) + status, error = apis.create_compute_template(template) + assert status == 200 + assert error is None + # cluster + volume = ConfigMapVolume( + name="code-sample", + mount_path="/home/ray/samples", + source="ray-job-code-sample", + items={"sample_code.py": "sample_code.py"}, + ) + environment = EnvironmentVariables(key_value={"key": "value"}) + head = HeadNodeSpec( + compute_template="default-template", + ray_start_params={"metrics-export-port": "8080", "num-cpus": "0"}, + image="rayproject/ray:2.9.3-py310", + service_type=ServiceType.ClusterIP, + volumes=[volume], + environment=environment, + image_pull_policy="IfNotPresent", + ) + worker = WorkerNodeSpec( + group_name="small", + compute_template="default-template", + replicas=1, + min_replicas=1, + max_replicas=1, + ray_start_params=DEFAULT_WORKER_START_PARAMS, + image="rayproject/ray:2.9.3-py310", + volumes=[volume], + environment=environment, + image_pull_policy="IfNotPresent", + ) + t_cluster = Cluster( + name="test-job", + namespace="default", + user="boris", + version="2.9.0", + cluster_spec=ClusterSpec(head_node=head, worker_groups=[worker]), + ) + # create + status, error = apis.create_cluster(t_cluster) + assert status == 200 + assert error is None + # Wait for the cluster to get ready + status, error = 
apis.wait_cluster_ready(ns="default", name="test-job") + assert status == 200 + assert error is None + # submit Ray job + resource_yaml = """ + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: test_counter + """ + job_request = RayJobRequest( + entrypoint="python /home/ray/samples/sample_code.py", runtime_env=resource_yaml, num_cpu=0.5 + ) + # To ensure that Ray cluster HTTP is ready try to get jobs info from the cluster + status, error, job_info_array = apis.list_job_info(ns="default", name="test-job") + assert status == 200 + assert error is None + print("\n initial jobs info") + for inf in job_info_array: + print(f" {inf.to_string()}") + time.sleep(5) + status, error, sid = apis.submit_job(ns="default", name="test-job", job_request=job_request) + assert status == 200 + assert error is None + time.sleep(10) + # get Ray job info + status, error, jinfo = apis.get_job_info(ns="default", name="test-job", sid=sid) + assert status == 200 + assert error is None + print(f"\njobs info {jinfo.to_string()}") + # get Ray jobs info + status, error, job_info_array = apis.list_job_info(ns="default", name="test-job") + assert status == 200 + assert error is None + print("jobs info") + for inf in job_info_array: + print(f" {inf.to_string()}") + # get Ray job log + time.sleep(5) # wait till log is available + status, error, jlog = apis.get_job_log(ns="default", name="test-job", sid=sid) + assert status == 200 + assert error is None + print(f"job log {jlog}") + # stop Ray job + status, error = apis.stop_ray_job(ns="default", name="test-job", sid=sid) + assert status == 200 + assert error is None + # delete Ray job + status, error = apis.delete_ray_job(ns="default", name="test-job", sid=sid) + assert status == 200 + assert error is None + # delete cluster + status, error = apis.delete_cluster(ns="default", name="test-job") + assert status == 200 + assert error is None + # delete template + status, error = apis.delete_compute_template(ns="default", name="default-template") + assert status == 200 + assert error is None diff --git a/kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py new file mode 100644 index 000000000..f9a5cfee8 --- /dev/null +++ b/kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py @@ -0,0 +1,90 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
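# Illustrative sketch, not part of the patch: the job lifecycle exercised by
# test_job_submission above, condensed. It assumes the "test-job" cluster and
# its config map already exist, exactly as the test creates them; every call
# returns a status code plus an optional error.
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import RayJobRequest

apis = KubeRayAPIs(server_url="http://localhost:8080/ray")
runtime_env = """
pip:
  - requests==2.26.0
env_vars:
  counter_name: test_counter
"""
request = RayJobRequest(
    entrypoint="python /home/ray/samples/sample_code.py", runtime_env=runtime_env, num_cpu=0.5
)
status, error, sid = apis.submit_job(ns="default", name="test-job", job_request=request)
status, error, info = apis.get_job_info(ns="default", name="test-job", sid=sid)
status, error, log = apis.get_job_log(ns="default", name="test-job", sid=sid)
status, error = apis.stop_ray_job(ns="default", name="test-job", sid=sid)
status, error = apis.delete_ray_job(ns="default", name="test-job", sid=sid)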
+################################################################################ + +from configmaps import ConfigmapsManager +from kfp_support.api_server_client.params import ConfigMapVolume +from kfp_support.workflow_support.runtime_utils import RayRemoteJobs + + +def test_ray_remote_jobs(): + """ + Test the full cycle of job submission + :return: + """ + # This shows how to create volumes dictionary + volumes = [ + ConfigMapVolume( + name="code-sample", + mount_path="/home/ray/samples", + source="ray-job-code-sample", + items={"sample_code.py": "sample_code.py"}, + ) + ] + dct_volumes = {"volumes": [v.to_dict() for v in volumes]} + + head_node = { + "cpu": 2, + "memory": 4, + "image": "rayproject/ray:2.9.3-py310", + # Ray start params, just to show + "ray_start_params": {"metrics-export-port": "8080", "num-cpus": "0", "dashboard-host": "0.0.0.0"}, + "image_pull_policy": "Always", + } | dct_volumes + + worker_node = { + "cpu": 2, + "memory": 4, + "image": "rayproject/ray:2.9.3-py310", + "replicas": 1, + "min_replicas": 1, + "max_replicas": 1, + "image_pull_policy": "Always", + } | dct_volumes + + # Create configmap for testing + cm_manager = ConfigmapsManager() + cm_manager.delete_code_map() + cm_manager.create_code_map() + + # create cluster + remote_jobs = RayRemoteJobs(server_url="http://localhost:8080/ray") + status, error = remote_jobs.create_ray_cluster( + name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] + ) + print(f"Created cluster - status: {status}, error: {error}") + assert status == 200 + assert error is None + # submitting ray job + runtime_env = """ + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: test_counter + """ + status, error, submission = remote_jobs.submit_job( + name="job-test", + namespace="default", + request={}, + runtime_env=runtime_env, + executor="/home/ray/samples/sample_code.py", + ) + print(f"submit job - status: {status}, error: {error}, submission id {submission}") + assert status == 200 + assert error is None + # print execution log + remote_jobs.follow_execution(name="job-test", namespace="default", submission_id=submission, print_timeout=20) + # cleanup + status, error = remote_jobs.delete_ray_cluster(name="job-test", namespace="default") + print(f"Deleted cluster - status: {status}, error: {error}") + assert status == 200 + assert error is None diff --git a/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py b/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py index f63bb0638..a64154237 100644 --- a/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py +++ b/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py @@ -1,7 +1,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ONE_WEEK_SEC +from kfp_support.workflow_support.runtime_utils import ONE_WEEK_SEC # Components diff --git a/transforms/code/code_quality/Makefile b/transforms/code/code_quality/Makefile index 923d29185..b2ddc7269 100644 --- a/transforms/code/code_quality/Makefile +++ b/transforms/code/code_quality/Makefile @@ -43,21 +43,21 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) 
workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py index d6a9938e7..a3d650c57 100644 --- a/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py @@ -15,7 +15,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/code/code_quality/kfp_ray/v2/Makefile b/transforms/code/code_quality/kfp_ray/v2/Makefile new file mode 100644 index 000000000..bd34d6f3c --- /dev/null +++ b/transforms/code/code_quality/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=code_quality_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=code_quality_wf.py \ No newline at end of file diff --git a/transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py new file mode 100644 index 000000000..9de9a9e55 --- /dev/null +++ b/transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py @@ -0,0 +1,174 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# NOTE: This file is auto generated by Pipeline Generator. + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) +from kubernetes import client as k8s_client + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "code_quality_transform.py" +PREFIX: str = "" + +task_image = "quay.io/dataprep1/data-prep-kit/code_quality:0.3.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1" +# compute execution parameters. 
Here different tranforms might need different implementations. As +# a result, insted of creating a component we are creating it in place here. +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image +) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "code_quality" + + +# Pipeline to invoke execution on remote resource +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for code quality task", +) +def code_quality( + # Ray cluster + ray_name: str = "code_quality-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "",\ + "image": "' + + task_image + + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "",\ + "image": "' + + task_image + + '" }', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/code_quality/input/', 'output_folder': 'test/code_quality/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "runtime_pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # code quality parameters + cq_contents_column_name: str = "contents", + cq_language_column_name: str = "language", + cq_tokenizer: str = "codeparrot/codeparrot", + cq_hf_token: str = "None", + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute Code Quality transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - 
actor options + :param runtime_pipeline_id - pipeline id + :param cq_contents_column_name - Name of the column holds the data to process + :param cq_language_column_name - Name of the column holds the programming language details + :param cq_tokenizer - Name or path to the tokenizer + :param cq_hf_token - Huggingface auth token to download and use the tokenizer + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + additional_params=additional_params, + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.output, + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "cq_contents_column_name": cq_contents_column_name, + "cq_language_column_name": cq_language_column_name, + "cq_tokenizer": cq_tokenizer, + "cq_hf_token": cq_hf_token, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(code_quality, __file__.replace(".py", ".yaml")) diff --git a/transforms/code/malware/Makefile b/transforms/code/malware/Makefile index 923d29185..3e277f0d7 100644 --- a/transforms/code/malware/Makefile +++ b/transforms/code/malware/Makefile @@ -43,21 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements - + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/code/malware/kfp_ray/v1/malware_wf.py b/transforms/code/malware/kfp_ray/v1/malware_wf.py index 50489e83a..71c5a63f5 100644 --- a/transforms/code/malware/kfp_ray/v1/malware_wf.py +++ 
b/transforms/code/malware/kfp_ray/v1/malware_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/code/proglang_select/Makefile b/transforms/code/proglang_select/Makefile index bfd98404d..3e277f0d7 100644 --- a/transforms/code/proglang_select/Makefile +++ b/transforms/code/proglang_select/Makefile @@ -43,20 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py index 2680666f6..99663693d 100644 --- a/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/code/proglang_select/kfp_ray/v2/Makefile b/transforms/code/proglang_select/kfp_ray/v2/Makefile new file mode 100644 index 000000000..3cf6c4084 --- /dev/null +++ b/transforms/code/proglang_select/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=proglang_select_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=proglang_select_wf.py diff --git a/transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py new file mode 100644 index 000000000..b9bca1cfc --- /dev/null +++ b/transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py @@ -0,0 +1,165 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "proglang_select_transform.py" + +task_image = "quay.io/dataprep1/data-prep-kit/proglang_select:0.3.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1" + +# compute execution parameters. Here different tranforms might need different implementations. As +# a result, insted of creating a component we are creating it in place here. +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image +) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent_multi_s3.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "proglang_select" +PREFIX: str = "proglang_select" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for select language", +) +def lang_select( + ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/proglang_select/input/', 'output_folder': 'test/proglang_select/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # Proglang match parameters + proglang_select_allowed_langs_file: str = "test/proglang_select/languages/allowed-code-languages.txt", + proglang_select_language_column: str = "language", + proglang_select_s3_access_secret: str = "s3-secret", + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +) -> None: + """ + Pipeline to execute NOOP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param 
ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - httpt retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param proglang_select_allowed_langs_file - file to store allowed languages + :param proglang_select_language_column - name of select language annotation column + :param proglang_select_s3_access_secret - block list access secret + (here we are assuming that select language info is in S3, but potentially in the different bucket) + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + additional_params=additional_params, + # note that the parameters below are specific for NOOP transform + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.output, + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "runtime_code_location": runtime_code_location, + "proglang_select_allowed_langs_file": proglang_select_allowed_langs_file, + "proglang_select_language_column": proglang_select_language_column, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + prefix=PREFIX, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component(execute_job, proglang_select_s3_access_secret, prefix=PREFIX) + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + 
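# Illustrative note, not part of the patch: unlike the other v2 pipelines added
# in this patch, lang_select loads executeRayJobComponent_multi_s3.yaml and
# attaches two S3 secrets to the execute step - the data secret and, under
# PREFIX, the secret guarding the allowed-languages file:
#
#   ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
#   ComponentUtils.set_s3_env_vars_to_component(execute_job, proglang_select_s3_access_secret, prefix=PREFIX)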
+ +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(lang_select, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index bfd98404d..3e277f0d7 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -43,20 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py index 3b2943e41..40d87cb33 100644 --- a/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/doc_id/kfp_ray/v2/Makefile b/transforms/universal/doc_id/kfp_ray/v2/Makefile new file mode 100644 index 000000000..3cf6c4084 --- /dev/null +++ b/transforms/universal/doc_id/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=proglang_select_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=proglang_select_wf.py diff --git a/transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py new file mode 100644 index 000000000..493dae400 --- /dev/null +++ b/transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py @@ -0,0 +1,163 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
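# Illustrative sketch, not part of the patch: ray_head_options and
# ray_worker_options are plain JSON strings built around the task image, so
# they can be checked locally with the standard library. The values below
# mirror this pipeline's defaults.
import json

task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.3.0"
ray_head_options = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }'
head = json.loads(ray_head_options)
assert head["cpu"] == 1 and head["image"] == task_image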
+################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) + + +task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.3.0" + +# the name of the job script +EXEC_SCRIPT_NAME: str = "doc_id_transform.py" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" + +# compute execution parameters. Here different tranforms might need different implementations. As +# a result, instead of creating a component we are creating it in place here. +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image +) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "doc_id" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for doc_id", +) +def doc_id( + # Ray cluster + ray_name: str = "doc_id-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/doc_id/input/', 'output_folder': 'test/doc_id/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # doc id parameters + doc_id_doc_column: str = "contents", + doc_id_hash_column: str = "hash_column", + doc_id_int_column: str = "int_id_column", + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute NOOP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for 
cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param doc_id_doc_column - document column + :param doc_id_hash_column - hash id column + :param doc_id_int_column - integer id column + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + additional_params=additional_params, + # note that the parameters below are specific for NOOP transform + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.output, + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "runtime_code_location": runtime_code_location, + "doc_id_doc_column": doc_id_doc_column, + "doc_id_hash_column": doc_id_hash_column, + "doc_id_int_column": doc_id_int_column, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(doc_id, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/ededup/Makefile b/transforms/universal/ededup/Makefile index 182af234c..3e277f0d7 100644 --- a/transforms/universal/ededup/Makefile +++ b/transforms/universal/ededup/Makefile @@ -43,20 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C 
kfp_ray/v1 workflow-reconcile-requirements \ No newline at end of file + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py b/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py index e008b1119..1d11e1ed0 100644 --- a/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py index 529a6ace3..8d2fc6180 100644 --- a/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py +++ b/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py @@ -35,7 +35,7 @@ def ededup_compute_execution_params( from data_processing.data_access import DataAccessS3 from data_processing.utils import GB, KB - from kfp_support.workflow_support.utils import KFPUtils + from kfp_support.workflow_support.runtime_utils import KFPUtils EXECUTION_OF_KB_DOC = 0.00025 diff --git a/transforms/universal/ededup/kfp_ray/v2/Makefile b/transforms/universal/ededup/kfp_ray/v2/Makefile new file mode 100644 index 000000000..98b0b5332 --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=ededup_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=ededup_wf.py diff --git a/transforms/universal/ededup/kfp_ray/v2/ededup_wf.py b/transforms/universal/ededup/kfp_ray/v2/ededup_wf.py new file mode 100644 index 000000000..6c8fc4e0d --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/v2/ededup_wf.py @@ -0,0 +1,165 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
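# Illustrative sketch, not part of the patch: data_s3_config is passed with
# single quotes and normalized to JSON by the compute-params code, which
# applies replace("'", '"') before KFPUtils.load_from_json. The stdlib-only
# equivalent, using this pipeline's default value:
import json

data_s3_config = "{'input_folder': 'test/ededup/input/', 'output_folder': 'test/ededup/output'}"
cfg = json.loads(data_s3_config.replace("'", '"'))
assert cfg["output_folder"] == "test/ededup/output"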
+################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) +from src.ededup_compute_execution_params import ededup_compute_execution_params + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "ededup_transform.py" + +task_image = "quay.io/dataprep1/data-prep-kit/ededup:0.3.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" + +# compute execution parameters +compute_exec_params_op = comp.func_to_container_op(func=ededup_compute_execution_params, base_image=base_kfp_image) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "ededup" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for ededup", +) +def ededup( + # Ray cluster + ray_name: str = "ededup-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access. 
checkpointing is not supported by dedup + data_s3_config: str = "{'input_folder': 'test/ededup/input/', 'output_folder': 'test/ededup/output'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # ededup + ededup_hash_cpu: float = 0.5, + ededup_doc_column: str = "contents", + # data sampling + ededup_n_samples: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute EDEDUP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param ededup_hash_cpu - number of CPUs per hash + :param ededup_doc_column - key for accessing data + :param ededup_n_samples - number of samples for parameters computation + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + params={"s3_config": data_s3_config, "hash_cpu": ededup_hash_cpu}, + n_samples=ededup_n_samples, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + 
additional_params=additional_params, + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.outputs["workers"], + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "runtime_code_location": runtime_code_location, + "ededup_doc_column": ededup_doc_column, + "ededup_hash_cpu": ededup_hash_cpu, + "ededup_num_hashes": compute_exec_params.outputs["hashes"], + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(ededup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py new file mode 100644 index 000000000..5304def12 --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py @@ -0,0 +1,98 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
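# Illustrative worked example, not part of the patch: the sizing math
# implemented below, with made-up inputs (16M sampled docs, hash_cpu=0.5,
# actor num_cpus=0.8, a 2 x 16-CPU cluster). GB is assumed here to be 1024**3;
# the memory checks performed by the real code are skipped in this sketch.
import math

GB = 1024 * 1024 * 1024
number_of_docs = 16_000_000
cluster_cpu = 2 * 16 * 0.85                                    # 85% of cluster CPUs considered usable
n_hashes = math.ceil(number_of_docs * 32 / GB)                 # -> 1
n_workers = int((0.85 * cluster_cpu - n_hashes * 0.5) / 0.8)   # -> 28
print(f"hashes {n_hashes}, workers {n_workers}")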
+################################################################################ + +from typing import Any, NamedTuple + + +def ededup_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # actor's resource requirements + params: dict[str, Any], # exact dedup specific parameters + n_samples: int = 10, # number of samples to use +) -> NamedTuple("Output", [("workers", int), ("hashes", int)]): + """ + Compute exact dedup execution parameters + :param worker_options: cluster parameters + :param actor_options: actor request requirements + :param n_samples: number of samples to use + :param params: exact dedup specific parameters containing the following keys: + s3_config - s3 config + hash_cpu - hash cpu requirements + :return: json string, containing computed number of workers and hashes + """ + # required import + import json + import math + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB, KB + from kfp_support.workflow_support.runtime_utils import KFPUtils + + EXECUTION_OF_KB_DOC = 0.00025 + + # Get cluster parameters + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_memory = w_options["replicas"] * w_options["memory"] + print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") + cluster_cpu *= 0.85 + cluster_memory *= 0.85 + # get actor requirements + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + actor_cpu = a_options["num_cpus"] + print(f"actor required cpu {actor_cpu}") + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling = data_access.sample_input_data(n_samples=n_samples) + avg_doc_size = sampling.get("average doc size KB") + number_of_docs = sampling.get("estimated number of docs") + avg_table_size = sampling.get("average table size MB") / KB + # compute number of hashes + n_hashes = math.ceil(number_of_docs * 32 / GB) + print(f"Estimated Required hashes {n_hashes}") + print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") + hash_cpu: float = float(params.get("hash_cpu")) + required_hash_cpu = n_hashes * hash_cpu + required_hash_mem = n_hashes * 2 + if required_hash_cpu > cluster_cpu or required_hash_mem > cluster_memory: + print( + f"Cluster is too small - hashes required cpus {required_hash_cpu}; " + f"hashes required memory {required_hash_mem}" + ) + sys.exit(1) + # Define number of workers + n_workers = int((0.85 * cluster_cpu - required_hash_cpu) / actor_cpu) + print(f"Number of workers - {n_workers}") + if n_workers < 2: + print(f"Cluster is too small - estimated number of workers {n_workers}") + sys.exit(1) + # Limit amount of workers and processors to prevent S3 saturation + if n_workers > 1000: + n_workers = 1000 + # validate that we have enough memory + r_mem = required_hash_mem * 2 + avg_table_size * 4 * n_workers + print(f"Required execution memory {r_mem} GB") + if r_mem > cluster_memory: + print(f"Not enough memory to run de duping, required {r_mem}, available 
{cluster_memory}") + print(f"Try to increase the size of the cluster or increase size of the cpu per worker") + sys.exit(1) + print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") + # return json.dumps({"workers": n_workers, "hashes": n_hashes}) + return (n_workers, n_hashes) + # return (1, 1) diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index 182af234c..3e277f0d7 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -43,20 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements \ No newline at end of file + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py index 22eea51bf..603bae5d1 100644 --- a/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py index a9f8b8d66..3f332432d 100644 --- a/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py @@ -45,7 +45,7 @@ def fdedup_compute_execution_params( from data_processing.data_access import DataAccessS3 from data_processing.utils import GB, KB - from kfp_support.workflow_support.utils import KFPUtils + from kfp_support.workflow_support.runtime_utils import KFPUtils from scipy.integrate import quad as integrate EXECUTION_OF_KB_DOC = 0.003 diff --git a/transforms/universal/fdedup/kfp_ray/v2/Makefile b/transforms/universal/fdedup/kfp_ray/v2/Makefile new file mode 100644 index 000000000..22ef10ef6 --- /dev/null +++ b/transforms/universal/fdedup/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=fdedup_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) 
.transforms_workflows.reconcile-requirements PIPELINE_FILE=fdedup_wf.py diff --git a/transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py new file mode 100644 index 000000000..d27d5b2ea --- /dev/null +++ b/transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py @@ -0,0 +1,216 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) +from src.fdedup_compute_execution_params import fdedup_compute_execution_params + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "fdedup_transform.py" + +task_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.3.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" + +# compute execution parameters +compute_exec_params_op = comp.func_to_container_op(func=fdedup_compute_execution_params, base_image=base_kfp_image) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "fdedup" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for fdedup", +) +def fdedup( + # Ray cluster + ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access. 
checkpointing is not supported by dedup + data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # columns used + fdedup_doc_column: str = "contents", + fdedup_id_column: str = "int_id_column", + fdedup_cluster_column: str = "cluster", + # infrastructure + fdedup_bucket_cpu: float = 0.5, + fdedup_doc_cpu: float = 0.5, + fdedup_mhash_cpu: float = 0.5, + # fuzzy parameters + fdedup_num_permutations: int = 64, + fdedup_threshold: float = 0.8, + fdedup_shingles_size: int = 5, + fdedup_delimiters: str = " ", + # Random delay between reads + fdedup_random_delay_limit: int = 5, + # snapshotting + fdedup_snapshot_delay: int = 1, + fdedup_use_doc_snapshot: bool = False, + fdedup_use_bucket_snapshot: bool = False, + # data sampling + fdedup_n_samples: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute FDEDUP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param fdedup_doc_column - document column name + :param fdedup_id_column - integer document id column name + :param fdedup_cluster_column - cluster column name + :param fdedup_bucket_cpu - number of CPUs per bucket hash + :param fdedup_doc_cpu - number of CPUs per doc hash + :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_num_permutations - number of permutations + :param fdedup_threshold - threshold + :param fdedup_shingles_size - number of words in shingle + :param fdedup_delimiters - delimiter for splitting document + :param fdedup_random_delay_limit - delay between reads to reduce S3 load. 
+ A random number between 0 and random_delay_limit is used + :param fdedup_snapshot_delay - delay between restoring individual actors + :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots + :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_n_samples - number of samples for parameters computation + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + params={ + "threshold": fdedup_threshold, + "num_permutations": fdedup_num_permutations, + "s3_config": data_s3_config, + "bucket_cpu": fdedup_bucket_cpu, + "doc_cpu": fdedup_doc_cpu, + "minhash_cpu": fdedup_mhash_cpu, + }, + n_samples=fdedup_n_samples, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + additional_params=additional_params, + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.outputs["workers"], + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "runtime_code_location": runtime_code_location, + "fdedup_doc_column": fdedup_doc_column, + "fdedup_id_column": fdedup_id_column, + "fdedup_cluster_column": fdedup_cluster_column, + "fdedup_bucket_cpu": fdedup_bucket_cpu, + "fdedup_doc_cpu": fdedup_doc_cpu, + "fdedup_mhash_cpu": fdedup_mhash_cpu, + "fdedup_num_doc_actors": compute_exec_params.outputs["docs"], + "fdedup_num_bucket_actors": compute_exec_params.outputs["buckets"], + "fdedup_num_minhash_actors": compute_exec_params.outputs["min_hashes"], + "fdedup_num_preprocessors": compute_exec_params.outputs["preprocessors"], + "fdedup_num_permutations": fdedup_num_permutations, + "fdedup_threshold": fdedup_threshold, + "fdedup_shingles_size": fdedup_shingles_size, + "fdedup_delimiters": fdedup_delimiters, + "fdedup_random_delay_limit": fdedup_random_delay_limit, + "fdedup_snapshot_delay": fdedup_snapshot_delay, + "fdedup_use_doc_snapshot": fdedup_use_doc_snapshot, + "fdedup_use_bucket_snapshot": fdedup_use_bucket_snapshot, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + 
compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py new file mode 100644 index 000000000..f511784c4 --- /dev/null +++ b/transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py @@ -0,0 +1,178 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any, NamedTuple + + +def fdedup_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # actor's resource requirements + params: dict[str, Any], # fuzzy dedup specific parameters + n_samples: int = 10, # number of samples to use +) -> NamedTuple( + "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] +): + """ + Compute fuzzy dedup execution parameters + :param worker_options: cluster parameters + :param actor_options: actor request requirements + :param n_samples: number of samples to use + :param params: fuzzy dedup specific parameters containing the following keys: + threshold - threshold for fuzzy computations + num_permutations - number of permutation + s3_config - s3 config + bucket_cpu - bucket actor cpu requirements + minhash_cpu - minhash actor cpu requirements + doc_cpu - doc actor cpu requirements + :return: json string, containing + workers - number of workers + preprocessors - number of preprocessors + docs - number of doc actors + buckets - number of bucket actors + min_hashes - number of minhash actors + """ + import math + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB, KB + from kfp_support.workflow_support.runtime_utils import KFPUtils + from scipy.integrate import quad as integrate + + EXECUTION_OF_KB_DOC = 0.003 + + def fuzzy_optimal_param( + threshold: float, + num_perm: int, + false_positive_weight: float, + false_negative_weight: float, + ) -> tuple[int, int]: + """ + Computes parameters for fuzzy dedup + :param threshold: filtering threshold + :param num_perm: number of permutations + :param false_positive_weight: false positive weight + :param false_negative_weight: false negative weight + :return: number of buckets and bucket length + """ + + def _false_positive_probability(ths: float, b: int, r: int) -> float: + """ + Compute false positive probability + :param ths: filtering threshold + :param b: permutation + :param r: rel permutation + :return: probability + """ + _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) + a, err = integrate(_probability, 0.0, ths) + return a + + def _false_negative_probability(ths: float, b: int, r: int) -> float: + """ + Compute false negative probability + :param ths: filtering threshold + :param b: permutation + :param r: rel permutation + :return: probability + """ + _probability = lambda s: 1 - (1 
- (1 - s ** float(r)) ** float(b)) + a, err = integrate(_probability, ths, 1.0) + return a + + min_error = float("inf") + opt = (0, 0) + for perm in range(1, num_perm + 1): + max_r = int(num_perm / perm) + for rel in range(1, max_r + 1): + fp = _false_positive_probability(threshold, perm, rel) + fn = _false_negative_probability(threshold, perm, rel) + error = fp * false_positive_weight + fn * false_negative_weight + if error < min_error: + min_error = error + opt = (perm, rel) + return opt + + # fuzzy parameters + num_buckets, length_bucket = fuzzy_optimal_param( + threshold=float(params.get("threshold")), + num_perm=int(params.get("num_permutations")), + false_positive_weight=0.5, + false_negative_weight=0.5, + ) + print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") + # Get cluster parameters + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_memory = w_options["replicas"] * w_options["memory"] + print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") + cluster_cpu *= 0.85 + cluster_memory *= 0.85 + # get actor requirements + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + actor_cpu = a_options["num_cpus"] + print(f"actor required cpu {actor_cpu}") + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling = data_access.sample_input_data(n_samples=n_samples) + avg_doc_size = sampling.get("average doc size KB") + number_of_docs = sampling.get("estimated number of docs") + avg_table_size = sampling.get("average table size MB") / KB + # we are creating more buckets actors, so that we get better parallelization for bucket processing + b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) + d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) + m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) + # compute cpu requirements + bucket_cpu = float(params.get("bucket_cpu")) + min_hash_cpu = float(params.get("minhash_cpu")) + doc_cpu = float(params.get("doc_cpu")) + # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount + # of CPUs + n_preprocessors = int( + (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * min_hash_cpu - d_actors * doc_cpu) / actor_cpu + ) + if n_preprocessors < 0: + print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") + print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") + print("Try to increase the size of the cluster") + sys.exit(1) + # compute the amount of workers + n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) + # Ensure that we do not overwhelm S3 + if n_workers > 2000: + n_workers = 2000 + print( + f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " + f"minhash actors {m_actors}, document actors {d_actors}" + ) + + # Make sure that we have enough memory + r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) + print(f"Required execution memory {r_mem} GB") + if r_mem > cluster_memory: + print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") + print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") + sys.exit(1) + + print( + f"Required cpu : " + f"{b_actors * bucket_cpu + m_actors * min_hash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" + ) + + projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 + print(f"Projected execution time {projected_execution} min") + return (n_workers, n_preprocessors, d_actors, b_actors, m_actors) diff --git a/transforms/universal/filter/Makefile b/transforms/universal/filter/Makefile index 182af234c..3e277f0d7 100644 --- a/transforms/universal/filter/Makefile +++ b/transforms/universal/filter/Makefile @@ -43,20 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements \ No newline at end of file + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/filter/kfp_ray/v1/filter_wf.py b/transforms/universal/filter/kfp_ray/v1/filter_wf.py index 9b5a970c0..648606ecd 100644 --- a/transforms/universal/filter/kfp_ray/v1/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/v1/filter_wf.py @@ -15,7 +15,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/filter/kfp_ray/v2/Makefile b/transforms/universal/filter/kfp_ray/v2/Makefile new file mode 100644 index 000000000..c64f90af8 --- /dev/null +++ b/transforms/universal/filter/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + 
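The fuzzy_optimal_param search above, which minimizes a weighted sum of false-positive and false-negative probabilities over all band/row splits of num_permutations, can be exercised on its own, which is handy when tuning fdedup_threshold and fdedup_num_permutations. A self-contained version of the same search, assuming scipy is installed:

from scipy.integrate import quad as integrate


def fuzzy_optimal_param(threshold: float, num_perm: int,
                        false_positive_weight: float = 0.5,
                        false_negative_weight: float = 0.5) -> tuple[int, int]:
    """Pick the (num_buckets, bucket_length) split minimizing weighted FP/FN error."""

    def _false_positive(ths: float, b: int, r: int) -> float:
        prob = lambda s: 1 - (1 - s ** float(r)) ** float(b)
        area, _ = integrate(prob, 0.0, ths)
        return area

    def _false_negative(ths: float, b: int, r: int) -> float:
        prob = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
        area, _ = integrate(prob, ths, 1.0)
        return area

    min_error, opt = float("inf"), (0, 0)
    for b in range(1, num_perm + 1):                 # candidate number of buckets (bands)
        for r in range(1, int(num_perm / b) + 1):    # candidate bucket length (rows per band)
            error = (_false_positive(threshold, b, r) * false_positive_weight
                     + _false_negative(threshold, b, r) * false_negative_weight)
            if error < min_error:
                min_error, opt = error, (b, r)
    return opt


# pipeline defaults: threshold 0.8, 64 permutations
print(fuzzy_optimal_param(0.8, 64))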
+SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=filter_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=filter_wf.py diff --git a/transforms/universal/filter/kfp_ray/v2/filter_wf.py b/transforms/universal/filter/kfp_ray/v2/filter_wf.py new file mode 100644 index 000000000..11cf20b9b --- /dev/null +++ b/transforms/universal/filter/kfp_ray/v2/filter_wf.py @@ -0,0 +1,167 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# NOTE: This file is auto generated by Pipeline Generator. + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "filter_transform.py" +PREFIX: str = "" + +task_image = "quay.io/dataprep1/data-prep-kit/filter:0.3.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" +# compute execution parameters. Here different tranforms might need different implementations. As +# a result, insted of creating a component we are creating it in place here. +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image +) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
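filter_wf.py still builds its in-place execution-parameter component with the KFP v1 comp.func_to_container_op helper; under the KFP v2 SDK this series is moving to, the same inline component is usually declared with the @dsl.component decorator, as noop_wf.py further down already does. A sketch of that form, reusing the same base image:

from kfp import dsl

base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"


@dsl.component(base_image=base_kfp_image)
def compute_exec_params_op(worker_options: str, actor_options: str) -> str:
    # imported inside the body so it resolves in the component image at run time
    from kfp_support.workflow_support.runtime_utils import ComponentUtils

    return ComponentUtils.default_compute_execution_params(worker_options, actor_options)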
+TASK_NAME: str = "filter" + + +# Pipeline to invoke execution on remote resource +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for filtering task", +) +def filtering( + # Ray cluster + ray_name: str = "filter-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/filter/input/', 'output_folder': 'test/filter/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # filtering parameters + filter_criteria_list: str = "['docq_total_words > 100 AND docq_total_words < 200', 'ibmkenlm_docq_perplex_score < 230']", + filter_logical_operator: str = "AND", + filter_columns_to_drop: str = "['extra', 'cluster']", + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute Filtering transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param filter_criteria_list - list of filter criteria (in SQL WHERE clause format) + :param filter_logical_operator - logical operator (AND or OR) that joins filter criteria + :param filter_columns_to_drop - list of columns to drop after filtering + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + 
worker_options=ray_worker_options, + actor_options=runtime_actor_options, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + additional_params=additional_params, + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.output, + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "runtime_code_location": runtime_code_location, + "filter_criteria_list": filter_criteria_list, + "filter_logical_operator": filter_logical_operator, + "filter_columns_to_drop": filter_columns_to_drop, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(filtering, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 182af234c..02fd06dc2 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -51,12 +51,12 @@ workflow-build: .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements \ No newline at end of file + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py index 320fe204d..dd8eaa513 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/noop/kfp_ray/v1/noop_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_wf.py index e505ea900..872e98238 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v1/noop_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/noop/kfp_ray/v2/Makefile 
b/transforms/universal/noop/kfp_ray/v2/Makefile new file mode 100644 index 000000000..1a49cbd49 --- /dev/null +++ b/transforms/universal/noop/kfp_ray/v2/Makefile @@ -0,0 +1,32 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + @for file in $(YAML_WF); do \ + $(MAKE) $$file; \ + done + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=noop_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py new file mode 100644 index 000000000..b3aba7cdb --- /dev/null +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -0,0 +1,164 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) +import uuid + +# FIXME: create a component to get run id +RUN_ID = uuid.uuid4().hex + +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" + +# the name of the job script +EXEC_SCRIPT_NAME: str = "noop_transform.py" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0-kfp-v21" + +# compute execution parameters. Here different tranforms might need different implementations. As +# a result, instead of creating a component we are creating it in place here. +@dsl.component(base_image=base_kfp_image) +def compute_exec_params_op(worker_options: str, actor_options: str) -> str: + from kfp_support.workflow_support.runtime_utils import ComponentUtils + + return ComponentUtils.default_compute_execution_params(worker_options, actor_options) + + +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
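The compile-time uuid above (the subject of the FIXME) bakes a single RUN_ID into the compiled YAML, so every run of that YAML shares it. KFP v2 exposes run-scoped placeholders on dsl; if dsl.PIPELINE_JOB_ID_PLACEHOLDER is available in the installed SDK, the id could be resolved per run instead, roughly:

from kfp import dsl

# sketch only: the placeholder is a string the KFP v2 backend substitutes for each run,
# e.g. passed as run_id=RUN_ID when calling create_ray_op / execute_ray_jobs_op
RUN_ID = dsl.PIPELINE_JOB_ID_PLACEHOLDER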
+TASK_NAME: str = "noop" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for noop", +) +def noop( + # Ray cluster + ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # noop parameters + noop_sleep_sec: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute NOOP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param noop_sleep_sec - noop sleep time + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60, image_pull_policy="Always") + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params +# compute_exec_params = compute_exec_params_op( + # worker_options=ray_worker_options, + # actor_options=runtime_actor_options, + # ) + # ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2,image_pull_policy="Always") + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=RUN_ID, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC 
* 2, image_pull_policy="Always") + #ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=RUN_ID, + additional_params=additional_params, + # note that the parameters below are specific for NOOP transform + exec_params={ + "data_s3_config": "{'input_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/input/', 'output_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/output/'}", + "data_max_files": -1, + "data_num_samples": -1, + "runtime_num_workers": "1", + "runtime_worker_options": "{'num_cpus': 0.8}", + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": RUN_ID, + "runtime_code_location": "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + "noop_sleep_sec": 10, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC,image_pull_policy="Always") + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) +# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index bfd98404d..3e277f0d7 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -43,20 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C kfp_ray/v1 workflow-venv + $(MAKE) -C $(PIPELINE_PATH) workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build + $(MAKE) -C $(PIPELINE_PATH) workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C kfp_ray/v1 workflow-test + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C kfp_ray/v1 workflow-upload + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C kfp_ray/v1 workflow-reconcile-requirements + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py index f366bb91d..86ba9e0f1 100644 --- a/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, diff --git a/transforms/universal/tokenization/kfp_ray/v2/Makefile b/transforms/universal/tokenization/kfp_ray/v2/Makefile new file mode 100644 index 000000000..232c8b44a --- /dev/null +++ b/transforms/universal/tokenization/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=tokenization_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline 
TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=tokenization_wf.py diff --git a/transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py new file mode 100644 index 000000000..bed08e80a --- /dev/null +++ b/transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py @@ -0,0 +1,171 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "tokenization_transform.py" + +task_image = "quay.io/dataprep1/data-prep-kit/tokenization:0.2.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" +# compute execution parameters. Use default one for now. +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image +) +# create Ray cluster +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
+TASK_NAME: str = "tokenization" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for tokenization", +) +def tokenization( + # Ray cluster + ray_name: str = "tkn-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/tokenization/ds01/input/', 'output_folder': 'test/tokenization/ds01/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # tokenizer parameters + tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", + tkn_doc_id_column: str = "document_id", + tkn_doc_content_column: str = "contents", + tkn_text_lang: str = "en", + tkn_tokenizer_args: str = "cache_dir=/tmp/hf", + tkn_chunk_size: int = 0, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute tokenization transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param tkn_tokenizer - Tokenizer used for tokenization + :param tkn_tokenizer_args - Arguments for tokenizer. 
+ :param tkn_doc_id_column - Column contains document id which values should be unique across dataset + :param tkn_doc_content_column - Column contains document content + :param tkn_text_lang - Specify language used in the text content for better text splitting if needed + :param tkn_chunk_size - Specify >0 value to tokenize each row/text in chunks of characters (rounded in words) + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=dsl.RUN_ID_PLACEHOLDER, + additional_params=additional_params, + exec_params={ + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": compute_exec_params.output, + "runtime_worker_options": runtime_actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, + "runtime_code_location": runtime_code_location, + "tkn_tokenizer": tkn_tokenizer, + "tkn_tokenizer_args": tkn_tokenizer_args, + "tkn_doc_id_column": tkn_doc_id_column, + "tkn_doc_content_column": tkn_doc_content_column, + "tkn_text_lang": tkn_text_lang, + "tkn_chunk_size": tkn_chunk_size, + }, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # Configure the pipeline level to one week (in seconds) + dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(tokenization, __file__.replace(".py", ".yaml")) From ea8e8c4f05c7861b51f5ba1cab093bd4e9cf0c72 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 30 May 2024 05:59:39 +0300 Subject: [PATCH 05/64] add missing files Signed-off-by: Alexey Roytman --- transforms/code/malware/kfp_ray/v2/Makefile | 25 +++ .../code/malware/kfp_ray/v2/malware_wf.py | 162 ++++++++++++++++++ .../universal/filter/src/local_pipeline.py | 51 ++++++ 3 files changed, 238 insertions(+) create mode 100644 transforms/code/malware/kfp_ray/v2/Makefile create mode 100644 transforms/code/malware/kfp_ray/v2/malware_wf.py create mode 100644 transforms/universal/filter/src/local_pipeline.py diff --git a/transforms/code/malware/kfp_ray/v2/Makefile b/transforms/code/malware/kfp_ray/v2/Makefile new file mode 100644 index 000000000..8bf51274c --- /dev/null +++ b/transforms/code/malware/kfp_ray/v2/Makefile @@ -0,0 +1,25 @@ +REPOROOT=${CURDIR}/../../../../../ 
+WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../../ray/ + +YAML_FILE=malware_wf.yaml + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) ${YAML_FILE} + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=malware_wf.py diff --git a/transforms/code/malware/kfp_ray/v2/malware_wf.py b/transforms/code/malware/kfp_ray/v2/malware_wf.py new file mode 100644 index 000000000..3f0e3d1d4 --- /dev/null +++ b/transforms/code/malware/kfp_ray/v2/malware_wf.py @@ -0,0 +1,162 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from kfp_support.workflow_support.runtime_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, +) + + +# the name of the job script +EXEC_SCRIPT_NAME: str = "malware_transform.py" + +task_image = "quay.io/dataprep1/data-prep-kit/malware:0.4.0" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1" + +# path to kfp component specifications files +component_spec_path = "../../../../../kfp/kfp_ray_components/" + +# compute execution parameters. Here different tranforms might need different implementations. As +# a result, instead of creating a component we are creating it in place here. +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image +) +# create Ray cluster +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
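The component_spec_path above is relative to the working directory, so compilation only succeeds when invoked from this folder. A sketch that resolves the same specs relative to the file instead, assuming the kfp/kfp_ray_components layout at the repository root (five levels up, matching REPOROOT in this Makefile):

import os

import kfp.components as comp

# resolve the shared component specs relative to this file rather than the CWD
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), *([".."] * 5)))
component_spec_path = os.path.join(REPO_ROOT, "kfp", "kfp_ray_components")

create_ray_op = comp.load_component_from_file(
    os.path.join(component_spec_path, "createRayClusterComponent.yaml")
)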
+TASK_NAME: str = "malware"
+
+
+@dsl.pipeline(
+    name=TASK_NAME + "-ray-pipeline",
+    description="Pipeline for malware",
+)
+def malware(
+    ray_name: str = "malware-kfp-ray",  # name of Ray cluster
+    ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }',
+    ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, '
+    '"image_pull_secret": "", "image": "' + task_image + '"}',
+    server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
+    # data access
+    data_s3_config: str = "{'input_folder': 'test/malware/input', 'output_folder': 'test/malware/output'}",
+    data_s3_access_secret: str = "s3-secret",
+    data_max_files: int = -1,
+    data_num_samples: int = -1,
+    # orchestrator
+    runtime_actor_options: str = "{'num_cpus': 0.8}",
+    runtime_pipeline_id: str = "pipeline_id",
+    runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}",
+    # malware
+    malware_input_column: str = "contents",
+    malware_output_column: str = "virus_detection",
+    # additional parameters
+    additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}',
+):
+    """
+    Pipeline to execute malware transform
+    :param ray_name: name of the Ray cluster
+    :param ray_head_options: head node options, containing the following:
+        cpu - number of cpus
+        memory - memory
+        image - image to use
+        image_pull_secret - image pull secret
+    :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following:
+        replicas - number of replicas to create
+        max_replicas - max number of replicas
+        min_replicas - min number of replicas
+        cpu - number of cpus
+        memory - memory
+        image - image to use
+        image_pull_secret - image pull secret
+    :param server_url - server url
+    :param additional_params: additional (support) parameters, containing the following:
+        wait_interval - wait interval for API server, sec
+        wait_cluster_ready_tmout - time to wait for cluster ready, sec
+        wait_cluster_up_tmout - time to wait for cluster up, sec
+        wait_job_ready_tmout - time to wait for job ready, sec
+        wait_print_tmout - time between prints, sec
+        http_retries - http retries for API server calls
+    :param data_s3_config - s3 configuration
+    :param data_s3_access_secret - s3 access secret
+    :param data_max_files - max files to process
+    :param data_num_samples - num samples to process
+    :param runtime_actor_options - actor options
+    :param runtime_pipeline_id - pipeline id
+    :param runtime_code_location - code location
+    :param malware_input_column - input column name
+    :param malware_output_column - output column name
+    :return: None
+    """
+    # create clean_up task
+    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url)
+    ComponentUtils.add_settings_to_component(clean_up_task, 60)
+    # pipeline definition
+    with dsl.ExitHandler(clean_up_task):
+        # compute execution params
+        compute_exec_params = compute_exec_params_op(
+            worker_options=ray_worker_options,
+            actor_options=runtime_actor_options,
+        )
+        ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2)
+        # start Ray cluster
+        ray_cluster = create_ray_op(
+            ray_name=ray_name,
+            run_id=dsl.RUN_ID_PLACEHOLDER,
+            ray_head_options=ray_head_options,
+            ray_worker_options=ray_worker_options,
+            server_url=server_url,
+            additional_params=additional_params,
+        )
+        ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2)
+        ray_cluster.after(compute_exec_params)
+        # Execute job
+        execute_job = execute_ray_jobs_op(
+            ray_name=ray_name,
+            run_id=dsl.RUN_ID_PLACEHOLDER,
+            additional_params=additional_params,
+            # note that the parameters below are specific for malware transform
+            exec_params={
+                "data_s3_config": data_s3_config,
+                "data_max_files": data_max_files,
+                "data_num_samples": data_num_samples,
+                "runtime_num_workers": compute_exec_params.output,
+                "runtime_worker_options": runtime_actor_options,
+                "runtime_pipeline_id": runtime_pipeline_id,
+                "runtime_job_id": dsl.RUN_ID_PLACEHOLDER,
+                "runtime_code_location": runtime_code_location,
+                "malware_input_column": malware_input_column,
+                "malware_output_column": malware_output_column,
+            },
+            exec_script_name=EXEC_SCRIPT_NAME,
+            server_url=server_url,
+        )
+        ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC)
+        ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
+        execute_job.after(ray_cluster)
+
+    # Configure the pipeline level to one week (in seconds)
+    dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC)
+
+
+if __name__ == "__main__":
+    # Compiling the pipeline
+    compiler.Compiler().compile(malware, __file__.replace(".py", ".yaml"))
diff --git a/transforms/universal/filter/src/local_pipeline.py b/transforms/universal/filter/src/local_pipeline.py
new file mode 100644
index 000000000..9a77e780b
--- /dev/null
+++ b/transforms/universal/filter/src/local_pipeline.py
@@ -0,0 +1,51 @@
+import os
+import sys
+
+from data_processing.data_access import DataAccessLocal
+sys.path.append('../../noop/src')
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../..')))
+from noop_transform import NOOPTransform
+
+from filter_transform import (
+    FilterTransform,
+    filter_columns_to_drop_key,
+    filter_criteria_key,
+    filter_logical_operator_key,
+)
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+
+
+filter_criteria = [
+    "docq_total_words > 100 AND docq_total_words < 200",
+    "ibmkenlm_docq_perplex_score < 230",
+]
+filter_logical_operator = "AND"
+filter_columns_to_drop = ["extra", "cluster"]
+
+filter_params = {
+    filter_criteria_key: filter_criteria,
+    filter_columns_to_drop_key: filter_columns_to_drop,
+    filter_logical_operator_key: filter_logical_operator,
+}
+
+if __name__ == "__main__":
+    # Here we show how to run outside of ray
+    # A local DataAccess is used here only to read the input parquet table.
+    data_access = DataAccessLocal(local_conf)
+    # Create and configure the transform.
+    transform = FilterTransform(filter_params)
+    # Use the local data access to read a parquet table.
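+    # The call below assumes ../test-data/input/test1.parquet exists relative to this
+    # file; transform() returns a list of output tables plus a metadata dictionary,
+    # which is why the row count afterwards is taken from table_list[0].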
+ table = data_access.get_table(os.path.join(input_folder, "test1.parquet")) + print(f"input table has {table.num_rows} rows") + # Transform the table + table_list, metadata = transform.transform(table) + print(f"\noutput table has {table_list[0].num_rows} rows") + print(f"output metadata : {metadata}") + From b6e280ec66eea2fa94725cb66c4459fad265c5ff Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 30 May 2024 10:20:51 +0300 Subject: [PATCH 06/64] remove kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/ Signed-off-by: Alexey Roytman --- .../kfp_support/workflow_support_v2/README.md | 36 -- .../workflow_support_v2/__init__.py | 0 .../comp_utils/__init__.py | 3 - .../comp_utils/component.py | 54 -- .../workflow_support_v2/utils/__init__.py | 8 - .../utils/workflow_utils.py | 557 ------------------ 6 files changed, 658 deletions(-) delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md deleted file mode 100644 index 472c39136..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Workflow Utils for KFPv2 - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. 
-* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py deleted file mode 100644 index 9297ede66..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.components_utils.component import ( - CompileComponentUtils -) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py deleted file mode 100644 index adaa971c1..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py +++ /dev/null @@ -1,54 +0,0 @@ -import kfp.dsl as dsl -from kfp import kubernetes -from typing import Dict - -RUN_NAME = "KFP_RUN_NAME" - -class CompileComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - @staticmethod - def add_settings_to_component( - task: dsl.PipelineTask, - timeout: int, - image_pull_policy: str = "IfNotPresent", - cache_strategy: bool = False, - ) -> None: - """ - Add settings to kfp task - :param task: kfp task - :param timeout: timeout to set to the component in seconds - :param image_pull_policy: pull policy to set to the component - :param cache_strategy: cache strategy - """ - - kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, - field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # Set cashing - task.set_caching_options(enable_caching=cache_strategy) - # image pull policy - kubernetes.set_image_pull_policy(task, image_pull_policy) - # Set the timeout for the task to one day (in seconds) - kubernetes.set_timeout(task, seconds=timeout) - - @staticmethod - def set_s3_env_vars_to_component( - task: dsl.PipelineTask, - secret: str = '', - env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, - prefix: str = None, - ) -> None: - """ - Set S3 env variables to KFP component - :param task: kfp task - :param secret: secret name with the S3 credentials - :param env2key: dict with mapping each env variable to a key in the secret - :param prefix: prefix to add to env name - """ - - if prefix is not None: - for env_name, _ in env2key.items(): - env2key[prefix + "_" + env_name] = env2key.pop(env_name) - kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py 
b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py deleted file mode 100644 index 3a6ab1263..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from kfp_support.workflow_support.runtime_utils.workflow_utils import ( - KFPUtils, - RayRemoteJobs, - ComponentUtils, - ONE_HOUR_SEC, - ONE_DAY_SEC, - ONE_WEEK_SEC, -) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py deleted file mode 100644 index 7328c740d..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py +++ /dev/null @@ -1,557 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import datetime -import json -import os -import re -import sys -import time -from typing import Any, Optional - -from data_processing.data_access import DataAccess -from data_processing.utils import get_logger -import kfp_server_api -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( - DEFAULT_HEAD_START_PARAMS, - DEFAULT_WORKER_START_PARAMS, - Cluster, - ClusterSpec, - HeadNodeSpec, - RayJobRequest, - Template, - WorkerNodeSpec, - environment_variables_decoder, - volume_decoder, -) -from ray.job_submission import JobStatus - -logger = get_logger(__name__) - -ONE_HOUR_SEC = 60 * 60 -ONE_DAY_SEC = ONE_HOUR_SEC * 24 -ONE_WEEK_SEC = ONE_DAY_SEC * 7 - -class KFPUtils: - """ - Helper utilities for KFP implementations - """ - - @staticmethod - def credentials( - access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" - ) -> tuple[str, str, str]: - """ - Get credentials from the environment - :param access_key: environment variable for access key - :param secret_key: environment variable for secret key - :param endpoint: environment variable for S3 endpoint - :return: - """ - s3_key = os.getenv(access_key, None) - s3_secret = os.getenv(secret_key, None) - s3_endpoint = os.getenv(endpoint, None) - if s3_key is None or s3_secret is None or s3_endpoint is None: - logger.warning("Failed to load s3 credentials") - return s3_key, s3_secret, s3_endpoint - - @staticmethod - def get_namespace() -> str: - """ - Get k8 namespace that we are running it - :return: - """ - ns = "" - try: - file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") - except Exception as e: - logger.warning( - f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" - ) - else: - with file: - ns = file.read() - return ns - - @staticmethod - def runtime_name(ray_name: str = "", run_id: str = "") -> str: - """ - Get unique runtime name - :param ray_name: - :param run_id: - :return: runtime name - """ - # K8s objects cannot contain special characters, except '_', All characters should be 
in lower case. - if ray_name != "": - ray_name = ray_name.replace("_", "-").lower() - pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. - ray_name = re.sub(pattern, "", ray_name) - else: - ray_name = "a" - # the return value plus namespace name will be the name of the Ray Route, - # which length is restricted to 64 characters, - # therefore we restrict the return name by 15 character. - if run_id != "": - return f"{ray_name[:9]}-{run_id[:5]}" - return ray_name[:15] - - @staticmethod - def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: - res = f"python {executor} " - for key, value in d.items(): - if isinstance(value, str): - res += f'--{key}="{value}" ' - else: - res += f"--{key}={value} " - return res - - # Load a string that represents a json to python dictionary - @staticmethod - def load_from_json(js: str) -> dict[str, Any]: - try: - return json.loads(js) - except Exception as e: - logger.warning(f"Failed to load parameters {js} with error {e}") - sys.exit(1) - -class RayRemoteJobs: - """ - class supporting Ray remote jobs - """ - - ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - default_image: str = "rayproject/ray:2.9.3-py310", - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initialization - :param server_url: API server URL. Default value is assuming running inside the cluster - :param default_image - default Ray image - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.api_server_client = KubeRayAPIs( - server_url=server_url, http_retries=http_retries, wait_interval=wait_interval - ) - self.default_image = default_image - - def create_ray_cluster( - self, - name: str, - namespace: str, - head_node: dict[str, Any], - worker_nodes: list[dict[str, Any]], - wait_cluster_ready: int = -1, - ) -> tuple[int, str]: - """ - Create Ray cluster - :param name: name, _ are not allowed in the name - :param namespace: namespace - :param head_node: head node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for head node - service_account - service account to use (has to be created) - environment - dictionary of head node environment - annotations: dictionary of head node annotation - labels: dictionary of head node labels - - :param worker_nodes: an array of worker node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - max_replicas - max replicas for this worker group - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - replicas - number of replicas to create for this group (default 1) - min_replicas - min number of replicas for this group (default 0) - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for this group - service_account - service account to use (has to be created) - environment - dictionary of node of this group environment - annotations: dictionary of node of this group annotation - labels: dictionary of node of this group labels - :param 
wait_cluster_ready - time to wait for cluster ready sec (-1 forever) - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # start with templates - # head_node - cpus = head_node.get("cpu", 1) - memory = head_node.get("memory", 1) - gpus = head_node.get("gpu", 0) - accelerator = head_node.get("gpu_accelerator", None) - head_node_template_name = f"{name}-head-template" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) - head_template = Template( - name=head_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(head_template) - if status != 200: - return status, error - worker_template_names = [""] * len(worker_nodes) - index = 0 - # For every worker group - for worker_node in worker_nodes: - cpus = worker_node.get("cpu", 1) - memory = worker_node.get("memory", 1) - gpus = worker_node.get("gpu", 0) - accelerator = worker_node.get("gpu_accelerator", None) - worker_node_template_name = f"{name}-worker-template-{index}" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) - worker_template = Template( - name=worker_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(worker_template) - if status != 200: - return status, error - worker_template_names[index] = worker_node_template_name - index += 1 - # Build head node spec - image = head_node.get("image", self.default_image) - image_pull_secret = head_node.get("image_pull_secret", None) - ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) - volumes_dict = head_node.get("volumes", None) - service_account = head_node.get("service_account", None) - environment_dict = head_node.get("environment", None) - annotations = head_node.get("annotations", None) - labels = head_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - head_node_spec = HeadNodeSpec( - compute_template=head_node_template_name, - image=image, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - ) - # build worker nodes - worker_groups = [] - index = 0 - for worker_node in worker_nodes: - max_replicas = worker_node.get("max_replicas", 1) - replicas = worker_node.get("replicas", 1) - min_replicas = worker_node.get("min_replicas", 0) - image = worker_node.get("image", self.default_image) - image_pull_secret = worker_node.get("image_pull_secret", None) - ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) - volumes_dict = worker_node.get("volumes", None) - service_account = worker_node.get("service_account", None) - environment_dict = worker_node.get("environment", None) - annotations = worker_node.get("annotations", None) - labels = worker_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = 
environment_variables_decoder(environment_dict) - worker_groups.append( - WorkerNodeSpec( - group_name=f"worker-group-{index}", - compute_template=worker_template_names[index], - image=image, - max_replicas=max_replicas, - replicas=replicas, - min_replicas=min_replicas, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - ) - ) - index += 1 - # Build cluster spec - cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) - # Build cluster - cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) - status, error = self.api_server_client.create_cluster(cluster) - if status != 200: - return status, error - # Wait for cluster ready - return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) - - def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: - """ - Clean up Ray cluster and supporting template - :param name: cluster name - :param namespace: cluster namespace - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # delete cluster - status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) - if status != 200: - return status, error - # clean up templates - status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) - if status != 200: - return status, error - for template in template_array: - if template.name.startswith(name): - status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) - if status != 200: - return status, error - return status, error - - def submit_job( - self, - name: str, - namespace: str, - request: dict[str, Any], - runtime_env: str = None, - executor: str = "transformer_launcher.py", - ) -> tuple[int, str, str]: - """ - Submit job for execution - :param name: cluster name - :param namespace: cluster namespace - :param request: dictionary of the remote job request - :param runtime_env: runtime environment string - :param executor: python file to execute - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - submission id - """ - # Build job request - job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) - if runtime_env is not None: - job_request.runtime_env = runtime_env - return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) - - def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: - """ - Get job status - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - status - job status - """ - # get job info - status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - return status, error, "" - return status, error, info.status - - @staticmethod - def _print_log(log: str, previous_log_len: int) -> None: - """ - Prints the delta between current and previous logs - :param log: current log - :param previous_log_len: previous log length - :return: None - """ - l_to_print = log[previous_log_len:] - if 
len(l_to_print) > 0: - l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) - print(l_to_print) - - def follow_execution( - self, - name: str, - namespace: str, - submission_id: str, - data_access: DataAccess = None, - job_ready_timeout: int = 600, - print_timeout: int = 120, - ) -> None: - """ - Follow remote job execution - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :param data_access - data access class - :param job_ready_timeout: timeout to wait for fob to become ready - :param print_timeout: print interval - :return: None - """ - # Wait for job to start running - job_status = JobStatus.PENDING - while job_status != JobStatus.RUNNING and job_ready_timeout > 0: - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: - break - time.sleep(self.api_server_client.wait_interval) - job_ready_timeout -= self.api_server_client.wait_interval - logger.info(f"job status is {job_status}") - if job_ready_timeout <= 0: - logger.warning("timed out waiting for job become ready, exiting") - sys.exit(1) - # While job is running print log - previous_log_len = 0 - # At this point job could succeeded, failed, stop or running. So print log regardless - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - # continue printing log, while job is running - while job_status == JobStatus.RUNNING: - time.sleep(print_timeout) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - # Print the final log and execution status - # Sleep here to avoid racing conditions - time.sleep(2) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - logger.info(f"Job completed with execution status {status}") - if data_access is None: - return - # Here data access is either S3 or lakehouse both of which contain self.output_folder - try: - output_folder = data_access.output_folder - except Exception as e: - logger.warning(f"failed to get output folder {e}") - return - output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" - execution_log_path = f"{output_folder}execution.log" - logger.info(f"saving execution log to {execution_log_path}") - data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) - - -class ComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - # @staticmethod - # def add_settings_to_component( - # task: dsl.PipelineTask, - # timeout: int, - # image_pull_policy: str = "IfNotPresent", - # cache_strategy: bool = False, - # ) -> None: - # """ - # Add settings to kfp task - # :param task: kfp task - # :param timeout: timeout to set to the component in seconds - # :param image_pull_policy: pull policy to set to the component 
- # :param cache_strategy: cache strategy - # """ - # - # kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # # Set cashing - # task.set_caching_options(enable_caching=cache_strategy) - # # image pull policy - # kubernetes.set_image_pull_policy(task, image_pull_policy) - # # Set the timeout for the task to one day (in seconds) - # kubernetes.set_timeout(task, seconds=timeout) - - - @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / a_options.get("memory", 1)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) From 997a1ed0320827f631355a2c922995b405412d18 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 30 May 2024 00:20:23 +0300 Subject: [PATCH 07/64] Update kfp_support_lib_v2. 
Signed-off-by: Revital Sur --- .make.defaults | 1 + kfp/kfp_ray_components/Makefile | 8 +- .../compile_utils/__init__.py | 3 - .../compile_utils/component.py | 101 ---- .../runtime_utils/__init__.py | 2 - .../runtime_utils/remote_jobs_utils.py | 527 ----------------- .../kfp_support/workflow_support_v2/README.md | 36 -- .../workflow_support_v2/__init__.py | 0 .../comp_utils/__init__.py | 3 - .../comp_utils/component.py | 54 -- .../workflow_support_v2/utils/__init__.py | 8 - .../utils/workflow_utils.py | 557 ------------------ .../compile_utils/__init__.py | 3 - .../runtime_utils/__init__.py | 2 - .../runtime_utils/kfp_utils.py | 113 ---- .../workflow_support/utils/__init__.py | 4 + .../components_utils.py} | 32 +- .../workflow_support/utils}/kfp_utils.py | 0 .../workflow_support/utils/pipeline_utils.py | 173 ++++++ .../utils/pipelines_tests_utils.py | 75 +++ .../remote_jobs_utils.py | 2 +- transforms/.make.transforms_workflows | 12 + transforms/universal/noop/Makefile | 31 +- .../universal/noop/kfp_ray/v2/noop_wf.py | 15 +- 24 files changed, 336 insertions(+), 1426 deletions(-) delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py rename kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/{compile_utils/component.py => utils/components_utils.py} (78%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils => kfp_support_lib_v2/src/kfp_support/workflow_support/utils}/kfp_utils.py (100%) create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py rename kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/{runtime_utils => utils}/remote_jobs_utils.py (99%) diff --git a/.make.defaults b/.make.defaults index d1d065015..47504d95e 100644 --- a/.make.defaults +++ b/.make.defaults @@ -191,6 +191,7 @@ __check_defined = \ .defaults.image:: # Must be called with a DOCKER_IMAGE= settings. 
@# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE) $(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) \ + -f $(DOCKER_FILE) \ --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 30ef36f5a..454c141ba 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -44,13 +44,19 @@ image: Dockerfile Dockerfile_v2 requirements.txt .PHONY: reconcile-requirements reconcile-requirements: +ifeq ($(KFPv2), 1) + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" createRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" deleteRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent.yaml + sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent_multi_s3.yaml +else @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml - # TODO remove it for KFPv2 sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml +endif .PHONY: load-image load-image: diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py deleted file mode 100644 index bbe1476fb..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.compile_utils.component import ( - ComponentUtils -) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py deleted file mode 100644 index 1f66bf59f..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py +++ /dev/null @@ -1,101 +0,0 @@ -import kfp.dsl as dsl -from kfp import kubernetes -from typing import Dict - -RUN_NAME = "KFP_RUN_NAME" - -class ComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - @staticmethod - def add_settings_to_component( - task: dsl.PipelineTask, - timeout: int, - image_pull_policy: str = "IfNotPresent", - cache_strategy: bool = False, - ) -> None: - """ - Add settings to kfp task - :param task: kfp task - :param timeout: timeout to set to the component in seconds - :param image_pull_policy: pull policy to set to the component - :param cache_strategy: cache strategy - """ - - kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, - field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - 
# Set cashing - task.set_caching_options(enable_caching=cache_strategy) - # image pull policy - kubernetes.set_image_pull_policy(task, image_pull_policy) - # Set the timeout for the task to one day (in seconds) - kubernetes.set_timeout(task, seconds=timeout) - - @staticmethod - def set_s3_env_vars_to_component( - task: dsl.PipelineTask, - secret: str = '', - env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, - prefix: str = None, - ) -> None: - """ - Set S3 env variables to KFP component - :param task: kfp task - :param secret: secret name with the S3 credentials - :param env2key: dict with mapping each env variable to a key in the secret - :param prefix: prefix to add to env name - """ - - if prefix is not None: - for env_name, _ in env2key.items(): - env2key[prefix + "_" + env_name] = env2key.pop(env_name) - kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) - - @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py deleted file mode 100644 index d2301bd0a..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils -from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py deleted file mode 100644 index 39d4d9e64..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py +++ /dev/null @@ -1,527 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import re -import sys -import time -from typing import Any - -from data_processing.data_access import DataAccess, DataAccessFactory -from data_processing.utils import ParamsUtils, get_logger -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( - DEFAULT_HEAD_START_PARAMS, - DEFAULT_WORKER_START_PARAMS, - Cluster, - ClusterSpec, - HeadNodeSpec, - RayJobRequest, - Template, - WorkerNodeSpec, - environment_variables_decoder, - volume_decoder, -) -from kfp_support.workflow_support.runtime_utils import KFPUtils -from ray.job_submission import JobStatus - - -logger = get_logger(__name__) - - -class RayRemoteJobs: - """ - class supporting Ray remote jobs - """ - - ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - default_image: str = "rayproject/ray:2.9.3-py310", - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initialization - :param server_url: API server URL. 
Default value is assuming running inside the cluster - :param default_image - default Ray image - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.api_server_client = KubeRayAPIs( - server_url=server_url, http_retries=http_retries, wait_interval=wait_interval - ) - self.default_image = default_image - - def create_ray_cluster( - self, - name: str, - namespace: str, - head_node: dict[str, Any], - worker_nodes: list[dict[str, Any]], - wait_cluster_ready: int = -1, - ) -> tuple[int, str]: - """ - Create Ray cluster - :param name: name, _ are not allowed in the name - :param namespace: namespace - :param head_node: head node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for head node - service_account - service account to use (has to be created) - environment - dictionary of head node environment - annotations: dictionary of head node annotation - labels: dictionary of head node labels - image_pull_policy: image pull policy, default IfNotPresent - - :param worker_nodes: an array of worker node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - max_replicas - max replicas for this worker group - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - replicas - number of replicas to create for this group (default 1) - min_replicas - min number of replicas for this group (default 0) - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for this group - service_account - service account to use (has to be created) - environment - dictionary of node of this group environment - annotations: dictionary of node of this group annotation - labels: dictionary of node of this group labels - image_pull_policy: image pull policy, default IfNotPresent - - :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # start with templates - # head_node - cpus = head_node.get("cpu", 1) - memory = head_node.get("memory", 1) - gpus = head_node.get("gpu", 0) - accelerator = head_node.get("gpu_accelerator", None) - head_node_template_name = f"{name}-head-template" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) - head_template = Template( - name=head_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(head_template) - if status != 200: - return status, error - worker_template_names = [""] * len(worker_nodes) - index = 0 - # For every worker group - for worker_node in worker_nodes: - cpus = worker_node.get("cpu", 1) - memory = worker_node.get("memory", 1) - gpus = worker_node.get("gpu", 0) - accelerator = worker_node.get("gpu_accelerator", None) - worker_node_template_name = f"{name}-worker-template-{index}" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) - worker_template = Template( - name=worker_node_template_name, - 
namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(worker_template) - if status != 200: - return status, error - worker_template_names[index] = worker_node_template_name - index += 1 - # Build head node spec - image = head_node.get("image", self.default_image) - image_pull_secret = head_node.get("image_pull_secret", None) - image_pull_policy = head_node.get("image_pull_policy", None) - ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) - volumes_dict = head_node.get("volumes", None) - service_account = head_node.get("service_account", None) - environment_dict = head_node.get("environment", None) - annotations = head_node.get("annotations", None) - labels = head_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - head_node_spec = HeadNodeSpec( - compute_template=head_node_template_name, - image=image, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - image_pull_policy=image_pull_policy, - ) - # build worker nodes - worker_groups = [] - index = 0 - for worker_node in worker_nodes: - max_replicas = worker_node.get("max_replicas", 1) - replicas = worker_node.get("replicas", 1) - min_replicas = worker_node.get("min_replicas", 0) - image = worker_node.get("image", self.default_image) - image_pull_secret = worker_node.get("image_pull_secret", None) - image_pull_policy = head_node.get("image_pull_policy", None) - ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) - volumes_dict = worker_node.get("volumes", None) - service_account = worker_node.get("service_account", None) - environment_dict = worker_node.get("environment", None) - annotations = worker_node.get("annotations", None) - labels = worker_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - worker_groups.append( - WorkerNodeSpec( - group_name=f"worker-group-{index}", - compute_template=worker_template_names[index], - image=image, - max_replicas=max_replicas, - replicas=replicas, - min_replicas=min_replicas, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - image_pull_policy=image_pull_policy, - ) - ) - index += 1 - # Build cluster spec - cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) - # Build cluster - cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) - status, error = self.api_server_client.create_cluster(cluster) - if status != 200: - return status, error - # Wait for cluster ready - return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) - - def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: - """ - Clean up Ray cluster and supporting template - :param name: cluster name - :param namespace: cluster 
namespace - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # delete cluster - status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) - if status != 200: - return status, error - # clean up templates - status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) - if status != 200: - return status, error - for template in template_array: - if template.name.startswith(name): - status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) - if status != 200: - return status, error - return status, error - - def submit_job( - self, - name: str, - namespace: str, - request: dict[str, Any], - runtime_env: str = None, - executor: str = "transformer_launcher.py", - ) -> tuple[int, str, str]: - """ - Submit job for execution - :param name: cluster name - :param namespace: cluster namespace - :param request: dictionary of the remote job request - :param runtime_env: runtime environment string - :param executor: python file to execute - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - submission id - """ - # Although the cluster is ready, the service web server might not be ready yet at this point. - # To ensure that it is ready, trying to get jobs info from the cluster. Even if it fails - # couple of times, its harmless - _, _, _ = self.api_server_client.list_job_info(ns=namespace, name=name) - time.sleep(5) - # Build job request - job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) - if runtime_env is not None: - job_request.runtime_env = runtime_env - return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) - - def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: - """ - Get job status - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - status - job status - """ - # get job info - status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - return status, error, "" - return status, error, info.status - - @staticmethod - def _print_log(log: str, previous_log_len: int) -> None: - """ - Prints the delta between current and previous logs - :param log: current log - :param previous_log_len: previous log length - :return: None - """ - l_to_print = log[previous_log_len:] - if len(l_to_print) > 0: - l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) - print(l_to_print) - - def follow_execution( - self, - name: str, - namespace: str, - submission_id: str, - data_access: DataAccess = None, - job_ready_timeout: int = 600, - print_timeout: int = 120, - ) -> None: - """ - Follow remote job execution - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :param data_access - data access class - :param job_ready_timeout: timeout to wait for fob to become ready - :param print_timeout: print interval - :return: None - """ - # Wait for job to start running - job_status = JobStatus.PENDING - while job_status != JobStatus.RUNNING and job_ready_timeout > 0: - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, 
submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: - break - time.sleep(self.api_server_client.wait_interval) - job_ready_timeout -= self.api_server_client.wait_interval - logger.info(f"job status is {job_status}") - if job_ready_timeout <= 0: - logger.warning("timed out waiting for job become ready, exiting") - sys.exit(1) - # While job is running print log - previous_log_len = 0 - # At this point job could succeeded, failed, stop or running. So print log regardless - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - # continue printing log, while job is running - while job_status == JobStatus.RUNNING: - time.sleep(print_timeout) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - # Print the final log and execution status - # Sleep here to avoid racing conditions - time.sleep(2) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - logger.info(f"Job completed with execution status {job_status}") - if job_status != JobStatus.SUCCEEDED: - sys.exit(1) - if data_access is None: - return - # Here data access is either S3 or lakehouse both of which contain self.output_folder - try: - output_folder = data_access.get_output_folder() - except Exception as e: - logger.warning(f"failed to get output folder {e}") - return - output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" - execution_log_path = f"{output_folder}execution.log" - logger.info(f"saving execution log to {execution_log_path}") - data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) - - -def _execute_remote_job( - name: str, - ns: str, - script: str, - params: dict[str, Any], - data_access_params: dict[str, Any], - additional_params: dict[str, Any], - remote_jobs: RayRemoteJobs, -) -> None: - """ - Execute remote job on Ray cluster - :param name: cluster name - :param ns: execution/cluster namespace - :param additional_params: additional parameters for the job - :param data_access_params: data access parameters - :param params: job execution parameters (specific for a specific transform, - generated by the transform workflow) - :param script: script to run (has to be present in the image) - :param remote_jobs: remote jobs execution support class - :return: - """ - - status, error, submission = remote_jobs.submit_job(name=name, namespace=ns, request=params, executor=script) - if status != 200: - logger.error(f"Failed to submit job - status: {status}, error: {error}") - exit(1) - - logger.info(f"submitted job successfully, submission id {submission}") - # create data access - data_factory = DataAccessFactory() - data_factory.apply_input_params(args=data_access_params) - data_access = data_factory.create_data_access() - # print execution log - remote_jobs.follow_execution( - name=name, - namespace=ns, - 
submission_id=submission, - data_access=data_access, - print_timeout=additional_params.get("wait_print_tmout", 120), - job_ready_timeout=additional_params.get("wait_job_ready_tmout", 600), - ) - - -def execute_ray_jobs( - name: str, # name of Ray cluster - additional_params: dict[str, Any], - e_params: dict[str, Any], - exec_script_name: str, - server_url: str, -) -> None: - """ - Execute Ray jobs on a cluster periodically printing execution log. Completes when all Ray job complete. - All of the jobs will be executed, although some of the jobs may fail. - :param name: cluster name - :param additional_params: additional parameters for the job - :param e_params: job execution parameters (specific for a specific transform, - generated by the transform workflow) - :param exec_script_name: script to run (has to be present in the image) - :param server_url: API server url - :return: None - """ - # prepare for execution - ns = KFPUtils.get_namespace() - if ns == "": - logger.warning(f"Failed to get namespace") - sys.exit(1) - # create remote jobs class - remote_jobs = RayRemoteJobs( - server_url=server_url, - http_retries=additional_params.get("http_retries", 5), - wait_interval=additional_params.get("wait_interval", 2), - ) - # find config parameter - config = ParamsUtils.get_config_parameter(e_params) - if config is None: - exit(1) - # get config value - config_value = KFPUtils.load_from_json(e_params[config].replace("'", '"')) - s3_creds = KFPUtils.load_from_json(e_params["data_s3_cred"].replace("'", '"')) - if type(config_value) is not list: - # single request - return _execute_remote_job( - name=name, - ns=ns, - script=exec_script_name, - data_access_params={config: config_value, "data_s3_cred": s3_creds}, - params=e_params, - additional_params=additional_params, - remote_jobs=remote_jobs, - ) - # remove config key from the dictionary - launch_params = dict(e_params) - del launch_params[config] - # Loop through all configuration - n_launches = 0 - for conf in config_value: - # populate individual config and launch - launch_params[config] = ParamsUtils.convert_to_ast(d=conf) - try: - _execute_remote_job( - name=name, - ns=ns, - script=exec_script_name, - data_access_params={config: conf, "data_s3_cred": s3_creds}, - params=launch_params, - additional_params=additional_params, - remote_jobs=remote_jobs, - ) - n_launches += 1 - except SystemExit: - logger.warning(f"Failed to execute job for configuration {conf}") - continue - - if n_launches == 0: - logger.warning("All executions failed") - sys.exit(1) - else: - logger.info(f"{n_launches} ot of {len(config_value)} succeeded") diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md deleted file mode 100644 index 472c39136..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Workflow Utils for KFPv2 - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* 
dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. -* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py deleted file mode 100644 index 9297ede66..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.components_utils.component import ( - CompileComponentUtils -) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py deleted file mode 100644 index adaa971c1..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py +++ /dev/null @@ -1,54 +0,0 @@ -import kfp.dsl as dsl -from kfp import kubernetes -from typing import Dict - -RUN_NAME = "KFP_RUN_NAME" - -class CompileComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - @staticmethod - def add_settings_to_component( - task: dsl.PipelineTask, - timeout: int, - image_pull_policy: str = "IfNotPresent", - cache_strategy: bool = False, - ) -> None: - """ - Add settings to kfp task - :param task: kfp task - :param timeout: timeout to set to the component in seconds - :param image_pull_policy: pull policy to set to the component - :param cache_strategy: cache strategy - """ - - kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, - 
field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # Set cashing - task.set_caching_options(enable_caching=cache_strategy) - # image pull policy - kubernetes.set_image_pull_policy(task, image_pull_policy) - # Set the timeout for the task to one day (in seconds) - kubernetes.set_timeout(task, seconds=timeout) - - @staticmethod - def set_s3_env_vars_to_component( - task: dsl.PipelineTask, - secret: str = '', - env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, - prefix: str = None, - ) -> None: - """ - Set S3 env variables to KFP component - :param task: kfp task - :param secret: secret name with the S3 credentials - :param env2key: dict with mapping each env variable to a key in the secret - :param prefix: prefix to add to env name - """ - - if prefix is not None: - for env_name, _ in env2key.items(): - env2key[prefix + "_" + env_name] = env2key.pop(env_name) - kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py deleted file mode 100644 index 3a6ab1263..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from kfp_support.workflow_support.runtime_utils.workflow_utils import ( - KFPUtils, - RayRemoteJobs, - ComponentUtils, - ONE_HOUR_SEC, - ONE_DAY_SEC, - ONE_WEEK_SEC, -) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py deleted file mode 100644 index 7328c740d..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py +++ /dev/null @@ -1,557 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import datetime -import json -import os -import re -import sys -import time -from typing import Any, Optional - -from data_processing.data_access import DataAccess -from data_processing.utils import get_logger -import kfp_server_api -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( - DEFAULT_HEAD_START_PARAMS, - DEFAULT_WORKER_START_PARAMS, - Cluster, - ClusterSpec, - HeadNodeSpec, - RayJobRequest, - Template, - WorkerNodeSpec, - environment_variables_decoder, - volume_decoder, -) -from ray.job_submission import JobStatus - -logger = get_logger(__name__) - -ONE_HOUR_SEC = 60 * 60 -ONE_DAY_SEC = ONE_HOUR_SEC * 24 -ONE_WEEK_SEC = ONE_DAY_SEC * 7 - -class KFPUtils: - """ - Helper utilities for KFP implementations - """ - - @staticmethod - def credentials( - access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" - ) -> tuple[str, str, str]: - """ - Get credentials from the environment - :param access_key: environment variable for access key - :param secret_key: environment variable for secret key - :param endpoint: environment variable for S3 endpoint - :return: - """ - s3_key = os.getenv(access_key, None) - s3_secret = os.getenv(secret_key, None) - s3_endpoint = os.getenv(endpoint, None) - if s3_key is None or s3_secret is None or s3_endpoint is None: - logger.warning("Failed to load s3 credentials") - return s3_key, s3_secret, s3_endpoint - - @staticmethod - def get_namespace() -> str: - """ - Get k8 namespace that we are running it - :return: - """ - ns = "" - try: - file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") - except Exception as e: - logger.warning( - f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" - ) - else: - with file: - ns = file.read() - return ns - - @staticmethod - def runtime_name(ray_name: str = "", run_id: str = "") -> str: - """ - Get unique runtime name - :param ray_name: - :param run_id: - :return: runtime name - """ - # K8s objects cannot contain special characters, except '_', All characters should be in lower case. - if ray_name != "": - ray_name = ray_name.replace("_", "-").lower() - pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. - ray_name = re.sub(pattern, "", ray_name) - else: - ray_name = "a" - # the return value plus namespace name will be the name of the Ray Route, - # which length is restricted to 64 characters, - # therefore we restrict the return name by 15 character. 
- if run_id != "": - return f"{ray_name[:9]}-{run_id[:5]}" - return ray_name[:15] - - @staticmethod - def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: - res = f"python {executor} " - for key, value in d.items(): - if isinstance(value, str): - res += f'--{key}="{value}" ' - else: - res += f"--{key}={value} " - return res - - # Load a string that represents a json to python dictionary - @staticmethod - def load_from_json(js: str) -> dict[str, Any]: - try: - return json.loads(js) - except Exception as e: - logger.warning(f"Failed to load parameters {js} with error {e}") - sys.exit(1) - -class RayRemoteJobs: - """ - class supporting Ray remote jobs - """ - - ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - default_image: str = "rayproject/ray:2.9.3-py310", - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initialization - :param server_url: API server URL. Default value is assuming running inside the cluster - :param default_image - default Ray image - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.api_server_client = KubeRayAPIs( - server_url=server_url, http_retries=http_retries, wait_interval=wait_interval - ) - self.default_image = default_image - - def create_ray_cluster( - self, - name: str, - namespace: str, - head_node: dict[str, Any], - worker_nodes: list[dict[str, Any]], - wait_cluster_ready: int = -1, - ) -> tuple[int, str]: - """ - Create Ray cluster - :param name: name, _ are not allowed in the name - :param namespace: namespace - :param head_node: head node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for head node - service_account - service account to use (has to be created) - environment - dictionary of head node environment - annotations: dictionary of head node annotation - labels: dictionary of head node labels - - :param worker_nodes: an array of worker node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - max_replicas - max replicas for this worker group - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - replicas - number of replicas to create for this group (default 1) - min_replicas - min number of replicas for this group (default 0) - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for this group - service_account - service account to use (has to be created) - environment - dictionary of node of this group environment - annotations: dictionary of node of this group annotation - labels: dictionary of node of this group labels - :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # start with templates - # head_node - cpus = head_node.get("cpu", 1) - memory = head_node.get("memory", 1) - gpus = head_node.get("gpu", 0) - accelerator = head_node.get("gpu_accelerator", None) - head_node_template_name = f"{name}-head-template" - 
_, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) - head_template = Template( - name=head_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(head_template) - if status != 200: - return status, error - worker_template_names = [""] * len(worker_nodes) - index = 0 - # For every worker group - for worker_node in worker_nodes: - cpus = worker_node.get("cpu", 1) - memory = worker_node.get("memory", 1) - gpus = worker_node.get("gpu", 0) - accelerator = worker_node.get("gpu_accelerator", None) - worker_node_template_name = f"{name}-worker-template-{index}" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) - worker_template = Template( - name=worker_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(worker_template) - if status != 200: - return status, error - worker_template_names[index] = worker_node_template_name - index += 1 - # Build head node spec - image = head_node.get("image", self.default_image) - image_pull_secret = head_node.get("image_pull_secret", None) - ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) - volumes_dict = head_node.get("volumes", None) - service_account = head_node.get("service_account", None) - environment_dict = head_node.get("environment", None) - annotations = head_node.get("annotations", None) - labels = head_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - head_node_spec = HeadNodeSpec( - compute_template=head_node_template_name, - image=image, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - ) - # build worker nodes - worker_groups = [] - index = 0 - for worker_node in worker_nodes: - max_replicas = worker_node.get("max_replicas", 1) - replicas = worker_node.get("replicas", 1) - min_replicas = worker_node.get("min_replicas", 0) - image = worker_node.get("image", self.default_image) - image_pull_secret = worker_node.get("image_pull_secret", None) - ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) - volumes_dict = worker_node.get("volumes", None) - service_account = worker_node.get("service_account", None) - environment_dict = worker_node.get("environment", None) - annotations = worker_node.get("annotations", None) - labels = worker_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - worker_groups.append( - WorkerNodeSpec( - group_name=f"worker-group-{index}", - compute_template=worker_template_names[index], - image=image, - max_replicas=max_replicas, - replicas=replicas, - min_replicas=min_replicas, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - 
labels=labels, - ) - ) - index += 1 - # Build cluster spec - cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) - # Build cluster - cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) - status, error = self.api_server_client.create_cluster(cluster) - if status != 200: - return status, error - # Wait for cluster ready - return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) - - def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: - """ - Clean up Ray cluster and supporting template - :param name: cluster name - :param namespace: cluster namespace - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # delete cluster - status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) - if status != 200: - return status, error - # clean up templates - status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) - if status != 200: - return status, error - for template in template_array: - if template.name.startswith(name): - status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) - if status != 200: - return status, error - return status, error - - def submit_job( - self, - name: str, - namespace: str, - request: dict[str, Any], - runtime_env: str = None, - executor: str = "transformer_launcher.py", - ) -> tuple[int, str, str]: - """ - Submit job for execution - :param name: cluster name - :param namespace: cluster namespace - :param request: dictionary of the remote job request - :param runtime_env: runtime environment string - :param executor: python file to execute - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - submission id - """ - # Build job request - job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) - if runtime_env is not None: - job_request.runtime_env = runtime_env - return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) - - def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: - """ - Get job status - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - status - job status - """ - # get job info - status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - return status, error, "" - return status, error, info.status - - @staticmethod - def _print_log(log: str, previous_log_len: int) -> None: - """ - Prints the delta between current and previous logs - :param log: current log - :param previous_log_len: previous log length - :return: None - """ - l_to_print = log[previous_log_len:] - if len(l_to_print) > 0: - l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) - print(l_to_print) - - def follow_execution( - self, - name: str, - namespace: str, - submission_id: str, - data_access: DataAccess = None, - job_ready_timeout: int = 600, - print_timeout: int = 120, - ) -> None: - """ - Follow remote job execution - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :param data_access - 
data access class - :param job_ready_timeout: timeout to wait for fob to become ready - :param print_timeout: print interval - :return: None - """ - # Wait for job to start running - job_status = JobStatus.PENDING - while job_status != JobStatus.RUNNING and job_ready_timeout > 0: - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: - break - time.sleep(self.api_server_client.wait_interval) - job_ready_timeout -= self.api_server_client.wait_interval - logger.info(f"job status is {job_status}") - if job_ready_timeout <= 0: - logger.warning("timed out waiting for job become ready, exiting") - sys.exit(1) - # While job is running print log - previous_log_len = 0 - # At this point job could succeeded, failed, stop or running. So print log regardless - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - # continue printing log, while job is running - while job_status == JobStatus.RUNNING: - time.sleep(print_timeout) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - # Print the final log and execution status - # Sleep here to avoid racing conditions - time.sleep(2) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - logger.info(f"Job completed with execution status {status}") - if data_access is None: - return - # Here data access is either S3 or lakehouse both of which contain self.output_folder - try: - output_folder = data_access.output_folder - except Exception as e: - logger.warning(f"failed to get output folder {e}") - return - output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" - execution_log_path = f"{output_folder}execution.log" - logger.info(f"saving execution log to {execution_log_path}") - data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) - - -class ComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - # @staticmethod - # def add_settings_to_component( - # task: dsl.PipelineTask, - # timeout: int, - # image_pull_policy: str = "IfNotPresent", - # cache_strategy: bool = False, - # ) -> None: - # """ - # Add settings to kfp task - # :param task: kfp task - # :param timeout: timeout to set to the component in seconds - # :param image_pull_policy: pull policy to set to the component - # :param cache_strategy: cache strategy - # """ - # - # kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # # Set cashing - # task.set_caching_options(enable_caching=cache_strategy) - # # image pull policy - # kubernetes.set_image_pull_policy(task, image_pull_policy) - # # Set the timeout for the task to one day (in seconds) - # kubernetes.set_timeout(task, seconds=timeout) - - 
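
The commented-out block above applies run-name, caching, pull-policy and timeout settings to a KFP v2 task. A minimal sketch (not part of the patch) of how such settings might be applied inside a KFP v2 pipeline, assuming kfp==2.7.0 and kfp-kubernetes as pinned in kfp/requirements.env; the `echo` component and pipeline name are hypothetical:

```python
# Illustrative sketch only: applying the task settings shown in the commented-out
# helper above inside a KFP v2 pipeline. The `echo` component is made up.
from kfp import dsl, kubernetes

RUN_NAME = "KFP_RUN_NAME"

@dsl.component(base_image="python:3.10")
def echo(msg: str) -> str:
    return msg

@dsl.pipeline(name="task-settings-demo")
def demo(msg: str = "hello"):
    task = echo(msg=msg)
    # expose the KFP run name to the container, as the helper does
    kubernetes.use_field_path_as_env(
        task,
        env_name=RUN_NAME,
        field_path="metadata.annotations['pipelines.kubeflow.org/run_name']",
    )
    # disable caching, pin the pull policy and bound the execution time
    task.set_caching_options(enable_caching=False)
    kubernetes.set_image_pull_policy(task, "IfNotPresent")
    kubernetes.set_timeout(task, seconds=60 * 60 * 24)  # ONE_DAY_SEC
```
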
- @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / a_options.get("memory", 1)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py deleted file mode 100644 index bbe1476fb..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.compile_utils.component import ( - ComponentUtils -) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py deleted file mode 100644 index d2301bd0a..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils -from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py deleted file mode 100644 index ef00b0e92..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py +++ /dev/null @@ -1,113 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
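
The actor-count heuristic above (85% of cluster CPU and memory, optionally capped by GPUs) can be made concrete with a small, dependency-free restatement; the worker and actor options below are made up for illustration and this is not the library function itself:

```python
# Simplified restatement of the actor-count heuristic above, for illustration only.
def estimate_actors(worker: dict, actor: dict) -> int:
    cluster_cpu = worker["replicas"] * worker["cpu"]
    cluster_mem = worker["replicas"] * worker["memory"]
    # keep ~15% headroom for Ray itself, as the implementation above does
    n_by_cpu = int(cluster_cpu * 0.85 / actor.get("num_cpus", 0.5))
    n_by_mem = int(cluster_mem * 0.85 / actor.get("memory", 1))
    n = min(n_by_cpu, n_by_mem)
    gpu_per_actor = actor.get("num_gpus", 0)
    if gpu_per_actor > 0:
        cluster_gpu = worker["replicas"] * worker.get("gpu", 0.0)
        n = min(n, int(cluster_gpu / gpu_per_actor))
    return n

# 5 workers x (2 CPU, 8 GB memory); each actor asks for 0.5 CPU and 1 GB
print(estimate_actors({"replicas": 5, "cpu": 2, "memory": 8},
                      {"num_cpus": 0.5, "memory": 1}))  # -> 17 (CPU-bound)
```
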
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import json -import os -import re -import sys -from typing import Any - -from data_processing.utils import get_logger - - -logger = get_logger(__name__) - - -class KFPUtils: - """ - Helper utilities for KFP implementations - """ - - @staticmethod - def credentials( - access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" - ) -> tuple[str, str, str]: - """ - Get credentials from the environment - :param access_key: environment variable for access key - :param secret_key: environment variable for secret key - :param endpoint: environment variable for S3 endpoint - :return: - """ - s3_key = os.getenv(access_key, None) - s3_secret = os.getenv(secret_key, None) - s3_endpoint = os.getenv(endpoint, None) - if s3_key is None or s3_secret is None or s3_endpoint is None: - logger.warning("Failed to load s3 credentials") - return s3_key, s3_secret, s3_endpoint - - @staticmethod - def get_namespace() -> str: - """ - Get k8 namespace that we are running it - :return: - """ - ns = "" - try: - file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") - except Exception as e: - logger.warning( - f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" - ) - else: - with file: - ns = file.read() - return ns - - @staticmethod - def runtime_name(ray_name: str = "", run_id: str = "") -> str: - """ - Get unique runtime name - :param ray_name: - :param run_id: - :return: runtime name - """ - # K8s objects cannot contain special characters, except '_', All characters should be in lower case. - if ray_name != "": - ray_name = ray_name.replace("_", "-").lower() - pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. - ray_name = re.sub(pattern, "", ray_name) - else: - ray_name = "a" - # the return value plus namespace name will be the name of the Ray Route, - # which length is restricted to 64 characters, - # therefore we restrict the return name by 15 character. 
- if run_id != "": - return f"{ray_name[:9]}-{run_id[:5]}" - return ray_name[:15] - - @staticmethod - def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: - res = f"python {executor} " - for key, value in d.items(): - if str(value) != "": - if isinstance(value, str): - if '"' in value: - logger.warning(f"can't parse inputs with double quotation marks, please use single quotation marks instead") - res += f'--{key}="{value}" ' - elif isinstance(value, bool): - if value: - res += f"--{key} " - else: - res += f"--{key}={value} " - - logger.info(f"request to execute: {res}") - return res - - # Load a string that represents a json to python dictionary - @staticmethod - def load_from_json(js: str) -> dict[str, Any]: - try: - return json.loads(js) - except Exception as e: - logger.warning(f"Failed to load parameters {js} with error {e}") - sys.exit(1) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py new file mode 100644 index 000000000..166032380 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py @@ -0,0 +1,4 @@ +from kfp_support.workflow_support.utils.kfp_utils import KFPUtils +from kfp_support.workflow_support.utils.pipeline_utils import PipelinesUtils +from kfp_support.workflow_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC +from kfp_support.workflow_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py similarity index 78% rename from kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py rename to kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py index 1f66bf59f..71583b8f2 100644 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py @@ -1,15 +1,37 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
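
The command string that `dict_to_req` builds for the Ray job entrypoint can be illustrated with a small standalone mirror of the logic above; the parameter names and executor are made up for the example, and the real helper additionally logs the request and warns about double quotes in values:

```python
# Illustration only: a minimal mirror of KFPUtils.dict_to_req above.
from typing import Any

def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str:
    res = f"python {executor} "
    for key, value in d.items():
        if str(value) == "":
            continue                      # empty values are skipped
        if isinstance(value, str):
            res += f'--{key}="{value}" '  # strings are double-quoted
        elif isinstance(value, bool):
            if value:
                res += f"--{key} "        # True becomes a bare flag
        else:
            res += f"--{key}={value} "
    return res

print(dict_to_req({"data_max_files": -1, "runtime_pipeline_id": "pid", "verbose": True},
                  executor="noop_transform.py"))
# prints: python noop_transform.py --data_max_files=-1 --runtime_pipeline_id="pid" --verbose
```
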
+################################################################################ + +import os import kfp.dsl as dsl from kfp import kubernetes from typing import Dict RUN_NAME = "KFP_RUN_NAME" +from data_processing.utils import get_logger +from kubernetes import client as k8s_client + + +logger = get_logger(__name__) + +ONE_HOUR_SEC = 60 * 60 +ONE_DAY_SEC = ONE_HOUR_SEC * 24 +ONE_WEEK_SEC = ONE_DAY_SEC * 7 + class ComponentUtils: """ Class containing methods supporting building pipelines """ - @staticmethod def add_settings_to_component( task: dsl.PipelineTask, timeout: int, @@ -55,8 +77,8 @@ def set_s3_env_vars_to_component( @staticmethod def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor ) -> str: """ This is the most simplistic transform execution parameters computation @@ -67,7 +89,7 @@ def default_compute_execution_params( import sys from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils + from kfp_support.workflow_support.utils import KFPUtils logger = get_logger(__name__) @@ -98,4 +120,4 @@ def default_compute_execution_params( ) sys.exit(1) - return str(n_actors) \ No newline at end of file + return str(n_actors) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/kfp_utils.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py rename to kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/kfp_utils.py diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py new file mode 100644 index 000000000..714205129 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py @@ -0,0 +1,173 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +import time +from typing import Any, Optional + +from data_processing.utils import get_logger +import kfp_server_api + +from kfp import Client + + +logger = get_logger(__name__) + + +class PipelinesUtils: + """ + Helper class for pipeline management + """ + + def __init__(self, host: str = "http://localhost:8080"): + """ + Initialization + :param host: host to connect to + """ + self.kfp_client = Client(host=host) + + def upload_pipeline( + self, + pipeline_package_path: str = None, + pipeline_name: str = None, + overwrite: bool = False, + description: str = None, + ) -> kfp_server_api.V2beta1Pipeline: + """ + Uploads the pipeline + :param pipeline_package_path: Local path to the pipeline package. + :param pipeline_name: Optional. 
Name of the pipeline to be shown in the UI + :param overwrite: Optional. If pipeline exists, delete it before creating a new one. + :param description: Optional. Description of the pipeline to be shown in the UI. + :return: Server response object containing pipeline id and other information. + """ + if overwrite: + pipeline = self.get_pipeline_by_name(name=pipeline_name) + if pipeline is not None: + try: + logger.info(f"pipeline {pipeline_name} already exists. Trying to delete it.") + self.kfp_client.delete_pipeline(pipeline_id=pipeline.id) + except Exception as e: + logger.warning(f"Exception deleting pipeline {e} before uploading") + return None + try: + pipeline = self.kfp_client.upload_pipeline( + pipeline_package_path=pipeline_package_path, pipeline_name=pipeline_name, description=description + ) + except Exception as e: + logger.warning(f"Exception uploading pipeline {e}") + return None + if pipeline is None: + logger.warning(f"Failed to upload pipeline {pipeline_name}.") + return None + logger.info("Pipeline uploaded") + return pipeline + + def delete_pipeline(self, pipeline_id): + """ + Delete pipeline. + :param pipeline_id: id of the pipeline. + :return + Returns: + Object. If the method is called asynchronously, returns the request thread. + Raises: + kfp_server_api.ApiException: If pipeline is not found. + """ + return self.kfp_client.delete_pipeline(pipeline_id) + + def start_pipeline( + self, + pipeline: kfp_server_api.V2beta1Pipeline, + experiment: kfp_server_api.V2beta1Experiment, + params: Optional[dict[str, Any]], + ) -> str: + """ + Start a specified pipeline. + :param pipeline: pipeline definition + :param experiment: experiment to use + :param params: pipeline parameters + :return: the id of the run object + """ + job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + try: + run_id = self.kfp_client.run_pipeline( + experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params + ) + logger.info(f"Pipeline run {job_name} submitted") + return run_id.id + except Exception as e: + logger.warning(f"Exception starting pipeline {e}") + return None + + def get_experiment_by_name(self, name: str = "Default") -> kfp_server_api.V2beta1Experiment: + """ + Get experiment by name + :param name: name + :return: experiment + """ + try: + return self.kfp_client.get_experiment(experiment_name=name) + except Exception as e: + logger.warning(f"Exception getting experiment {e}") + return None + + def get_pipeline_by_name(self, name: str, np: int = 100) -> kfp_server_api.V2beta1Pipeline: + """ + Given pipeline name, return the pipeline + :param name: pipeline name + :param np: page size for pipeline query. For large clusters with many pipelines, you might need to + increase this number + :return: pipeline + """ + try: + # Get all pipelines + pipelines = self.kfp_client.list_pipelines(page_size=np).pipelines + required = list(filter(lambda p: name in p.name, pipelines)) + if len(required) != 1: + logger.warning(f"Failure to get pipeline. 
Number of pipelines with name {name} is {len(required)}") + return None + return required[0] + + except Exception as e: + logger.warning(f"Exception getting pipeline {e}") + return None + + def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 600) -> tuple[str, str]: + """ + Waits for a pipeline run to complete + :param run_id: run id + :param timeout: timeout (sec) (-1 wait forever) + :param wait: internal wait (sec) + :return: Completion status and an error message if such exists + """ + try: + if timeout > 0: + end = time.time() + timeout + else: + end = 2**63 - 1 + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: + time.sleep(wait) + if (end - time.time()) < 0: + return "failed", f"Execution is taking too long" + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + logger.info(f"Got pipeline execution status {status}") + + if status.lower() in ["succeeded", "completed"]: + return status, "" + return status, run_details.run.error + + except Exception as e: + logger.warning(f"Failed waiting pipeline completion {e}") + return "failed", str(e) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py new file mode 100644 index 000000000..1e7ff9cf7 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py @@ -0,0 +1,75 @@ +import os +import sys + +from data_processing.utils import get_logger, str2bool + +from . import PipelinesUtils + + +logger = get_logger(__name__) + + +def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/", overwrite: bool = True): + """ + Upload and run a single pipeline + + :param pipeline_package_path: Local path to the pipeline package. + :param endpoint: endpoint to kfp service. + :return the pipeline name as it appears in the kfp GUI. 
+ """ + tmout: int = 800 + wait: int = 60 + file_name = os.path.basename(pipeline_package_path) + pipeline_name = os.path.splitext(file_name)[0] + utils = PipelinesUtils(host=endpoint) + pipeline = utils.upload_pipeline( + pipeline_package_path=pipeline_package_path, + pipeline_name=pipeline_name, + overwrite=overwrite, + ) + if pipeline is None: + return None + experiment = utils.get_experiment_by_name() + run_id = utils.start_pipeline(pipeline, experiment, params=[]) + status, error = utils.wait_pipeline_completion(run_id=run_id, timeout=tmout, wait=wait) + if status.lower() not in ["succeeded", "completed"]: + # Execution failed + logger.warning(f"Pipeline {pipeline_name} failed with error {error} and status {status}") + return None + logger.info(f"Pipeline {pipeline_name} successfully completed") + return pipeline_name + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Run sanity test") + parser.add_argument("-c", "--command", type=str, choices=["upload", "sanity-test"]) + parser.add_argument("-e", "--endpoint", type=str, default="http://localhost:8080/") + parser.add_argument("-p", "--pipeline_package_path", type=str, default="") + parser.add_argument("-o", "--overwrite", type=str, default="True") + + args = parser.parse_args() + match args.command: + case "upload": + file_name = os.path.basename(args.pipeline_package_path) + pipeline_name = os.path.splitext(file_name)[0] + utils = PipelinesUtils(host=args.endpoint) + pipeline = utils.upload_pipeline( + pipeline_package_path=args.pipeline_package_path, + pipeline_name=pipeline_name, + overwrite=str2bool(args.overwrite), + ) + if pipeline is None: + sys.exit(1) + case "sanity-test": + run = run_test( + endpoint=args.endpoint, + pipeline_package_path=args.pipeline_package_path, + overwrite=str2bool(args.overwrite), + ) + if run is None: + sys.exit(1) + case _: + logger.warning("Unsupported command") + exit(1) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/remote_jobs_utils.py similarity index 99% rename from kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py rename to kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/remote_jobs_utils.py index 39d4d9e64..40b26c7a1 100644 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/remote_jobs_utils.py @@ -30,7 +30,7 @@ environment_variables_decoder, volume_decoder, ) -from kfp_support.workflow_support.runtime_utils import KFPUtils +from kfp_support.workflow_support.utils import KFPUtils from ray.job_submission import JobStatus diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index 0e1ba3540..586053de4 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -6,6 +6,7 @@ include ${REPOROOT}/kfp/requirements.env include ${REPOROOT}/.make.defaults USE_DEV_IMAGES ?= 1 +KFPv2 ?= 1 define set_env_var $(eval export $(1)=$(2)) @@ -21,7 +22,12 @@ endef export DOCKER_IMAGE_VERSION=$$(echo $$line |cut -d "=" -f 2); \ sed -i.back "s/data-prep-kit\/$$DOCKER_IMAGE_NAME:.*/data-prep-kit\/$$DOCKER_IMAGE_NAME:$$DOCKER_IMAGE_VERSION\"/" $$PIPELINE_FILE ;\ done < ${REPOROOT}/.make.versions +ifeq ($(KFPv2), 1) + @sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION_v2}\"/" 
${PIPELINE_FILE} +else @sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION}\"/" ${PIPELINE_FILE} +endif + .PHONY: .transforms_workflows.compile-pipeline .transforms_workflows.compile-pipeline: @@ -53,6 +59,12 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen pip install -e $(REPOROOT)/kfp/kfp_support_lib/; @# Help: Create the virtual environment common to all workflows + pip install -e $(DPK_RAY_LIB_DIR) +ifeq ($(KFPv2), 1) + . ${WORKFLOW_VENV_ACTIVATE} && pip install -e $(REPOROOT)/kfp/kfp_support_lib_v2/ +else + . ${WORKFLOW_VENV_ACTIVATE} && pip install -e $(REPOROOT)/kfp/kfp_support_lib/ +endif #TODO KFPv2 ${VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 02fd06dc2..502eea306 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -1,4 +1,6 @@ REPOROOT=../../.. + +KFPv2 ?= 1 # Use make help, to see the available rules include $(REPOROOT)/.make.defaults @@ -43,20 +45,41 @@ load-image:: .PHONY: workflow-venv workflow-venv: +ifeq ($(KFPv2), 1) + $(MAKE) -C kfp_ray/v2 workflow-venv +else $(MAKE) -C kfp_ray/v1 workflow-venv +endif .PHONY: workflow-build workflow-build: - $(MAKE) -C kfp_ray/v1 workflow-build +ifeq ($(KFPv2), 1) + $(MAKE) -C kfp_ray/v2 workflow-build +else + $(MAKE) -C kfp_ray/v1 workflow-build +endif + .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test +ifeq ($(KFPv2), 1) + $(MAKE) -C kfp_ray/v2 workflow-build +else + $(MAKE) -C kfp_ray/v1 workflow-test +endif .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload +ifeq ($(KFPv2), 1) + $(MAKE) -C kfp_ray/v2 workflow-upload +else + $(MAKE) -C kfp_ray/v1 workflow-upload +endif .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements +ifeq ($(KFPv2), 1) + $(MAKE) -C kfp_ray/v2 reconcile-requirements +else + $(MAKE) -C kfp_ray/v1 reconcile-requirements +endif diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index b3aba7cdb..03610a1bb 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( +from kfp_support.workflow_support.utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, @@ -29,7 +29,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0-kfp-v21" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1-kfp-v21" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
@@ -40,13 +40,16 @@ def compute_exec_params_op(worker_options: str, actor_options: str) -> str: return ComponentUtils.default_compute_execution_params(worker_options, actor_options) +# path to kfp component specifications files +component_spec_path = "../../../../../kfp/kfp_ray_components/" + # create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") + TASK_NAME: str = "noop" From df5ff64fa42339c86dfe8d048e1c0a989b46a1a4 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 30 May 2024 00:55:58 +0300 Subject: [PATCH 08/64] Fixes. Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 454c141ba..927a4356b 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -45,10 +45,10 @@ image: Dockerfile Dockerfile_v2 requirements.txt .PHONY: reconcile-requirements reconcile-requirements: ifeq ($(KFPv2), 1) - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" createRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" deleteRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent.yaml - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent_multi_s3.yaml + sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" createRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" deleteRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent.yaml + sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent_multi_s3.yaml else @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml From 228a8a5b17c78a4e5e2373877d5940eb387c5190 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 30 May 2024 13:24:32 +0300 Subject: [PATCH 09/64] More fixes. 
Signed-off-by: Revital Sur --- .make.defaults | 20 +- .../kfp_support_lib_v2/README.md | 68 -- .../kfp_support_lib_v2/pyproject.toml | 47 -- .../kfp_support/api_server_client/README.md | 4 - .../kfp_support/api_server_client/__init__.py | 1 - .../api_server_client/kuberay_apis.py | 636 ------------------ .../api_server_client/params/__init__.py | 53 -- .../api_server_client/params/cluster.py | 475 ------------- .../params/environmentvariables.py | 158 ----- .../api_server_client/params/headnode.py | 202 ------ .../api_server_client/params/jobsubmission.py | 163 ----- .../api_server_client/params/templates.py | 224 ------ .../api_server_client/params/volumes.py | 449 ------------- .../api_server_client/params/workernode.py | 206 ------ .../kfp_support/workflow_support/README.md | 45 -- .../src/create_ray_cluster.py | 3 +- .../src/delete_ray_cluster.py | 3 +- kfp/kfp_ray_components/src/execute_ray_job.py | 3 +- transforms/.make.transforms_workflows | 2 +- .../universal/noop/kfp_ray/v2/noop_wf.py | 4 +- 20 files changed, 16 insertions(+), 2750 deletions(-) delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md diff --git a/.make.defaults b/.make.defaults index 47504d95e..9efe778ad 100644 --- a/.make.defaults +++ b/.make.defaults @@ -209,7 +209,7 @@ __check_defined = \ cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} # Build and image using the local Dockerfile and make the data-processing-lib/python -# available in the current directory for use by the Dockerfile (i.e. to install the library). +# available in the current directory for use by the Dockerfile (i.e. to install the library). .PHONY: .defaults.python-lib-src-image .defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings. 
@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source @@ -242,8 +242,8 @@ __check_defined = \ .defaults.python-lib-src-venv:: .defaults.venv .defaults.install-python-lib-src-venv .PHONY: .defaults.install-python-lib-src-venv -.defaults.install-python-lib-src-venv:: - @# Help: Install Python data processing library source into existing venv +.defaults.install-python-lib-src-venv:: + @# Help: Install Python data processing library source into existing venv @echo Installing Python data processing library source to existing venv @source venv/bin/activate; \ pip install pytest; \ @@ -256,15 +256,15 @@ __check_defined = \ echo Installed source from Python processing library for `which $(PYTHON)`; \ else \ echo ERROR installing source into `which $(PYTHON)`; \ - fi + fi .PHONY: .defaults.ray-lib-src-venv .defaults.ray-lib-src-venv:: .defaults.venv .defaults.install-ray-lib-src-venv -# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source. +# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source. .PHONY: .defaults.install-ray-lib-src-venv -.defaults.install-ray-lib-src-venv:: - @# Help: Install Ray and Python data processing library source into existing venv +.defaults.install-ray-lib-src-venv:: + @# Help: Install Ray and Python data processing library source into existing venv @echo Installing Ray and Python data processing library source to existing venv @source venv/bin/activate; \ pip install pytest; \ @@ -284,10 +284,10 @@ __check_defined = \ .PHONY: .defaults.spark-lib-src-venv .defaults.spark-lib-src-venv:: .defaults.venv .defaults.install-spark-lib-src-venv -# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source. +# Install the python-based lib BEFORE spark assuming spark depends on the same version as python source. .PHONY: .defaults.install-spark-lib-src-venv -.defaults.install-spark-lib-src-venv:: - @# Help: Install Spark and Python data processing library source into existing venv +.defaults.install-spark-lib-src-venv:: + @# Help: Install Spark and Python data processing library source into existing venv @echo Installing Spark and Python data processing library source to existing venv @source venv/bin/activate; \ pip install pytest; \ diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/README.md deleted file mode 100644 index 86f3f4360..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# KFP support library - -This provides support for implementing KFP pipelines automating transform's execution. -It comprises 2 main modules - -* [api server client](src/kfp_support/api_server_client/README.md) -* [workflow support](src/kfp_support/workflow_support/README.md) - -## Development - -### Requirements -1. python 3.10 or later -2. git command line tools -3. [pre-commit](https://pre-commit.com/) -4. twine (pip install twine) - * but on Mac you may have to include a dir in your PATH, such as `export PATH=$PATH:/Library/Frameworks/Python.framework/Versions/3.10/bin` - -### Git -Simple clone the repo and set up the pre-commit hooks. 
-```shell -git clone git@github.com:IBM/data-prep-kit.git -cd kfp/kfp_support_lib -pre-commit install -``` -If you don't have pre-commit, you can install from [here](https://pre-commit.com/) - -## Library Artifact Build and Publish - -The process of creating a release for `fm_data_processing_kfp` package involves the following steps: - -cd to the package directory. - -update the version in [requirements.env](../requirements.env) file. - -run `make build` and `make publish`. - -## Testing - -To run the package tests perform the following: - -To begin with, establish a Kind cluster and deploy all required components by executing the makfefile command in the main directory of this repository. As an alternative, you can manually execute the instructions provided in the [README.md](../../kind/README.md) file. - -```bash -make setup -``` - -The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. - -```bash -make build -``` - -lastly, execute the tests: - -```bash -make test -``` - -### Cleanup - -It is advisable to execute the following command prior to running `make test` once more. This will ensure that any -previous test runs resources are removed before starting new tests. - -```bash -kubectl delete workflows -n kubeflow --all -``` - - diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml b/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml deleted file mode 100644 index f995d60d7..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml +++ /dev/null @@ -1,47 +0,0 @@ -[project] -name = "data_prep_toolkit_kfp_v2" -version = "0.1.1" -requires-python = ">=3.10" -description = "Data Preparation Kit Library. KFP v2 support" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, - { name = "Alexey Roytman", email = "roytman@il.ibm.com" }, - { name = "Mohammad Nassar", email = "Mohammad.Nassar@ibm.com" }, - { name = "Revital Eres", email = "eres@il.ibm.com" }, -] -dependencies = [ - "kfp==2.7.0", - "kfp-kubernetes==1.2.0", - "requests", - "data-prep-toolkit==0.1.1", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", -] - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/kfp_support"] - -[tool.pytest.ini_options] -addopts = "--cov --cov-report term-missing --cov-fail-under 10" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md deleted file mode 100644 index 423f743a1..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# KubeRay API server APIs - -This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) -Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git 
a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py deleted file mode 100644 index 60cbbc2f2..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py deleted file mode 100644 index 270815e77..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py +++ /dev/null @@ -1,636 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import time - -import requests -from data_processing.utils import get_logger -from kfp_support.api_server_client.params import ( - Cluster, - RayJobInfo, - RayJobRequest, - Template, - cluster_decoder, - clusters_decoder, - template_decoder, - templates_decoder, -) - - -logger = get_logger(__name__) - - -_headers = {"Content-Type": "application/json", "accept": "application/json"} - -CONNECT_TIMEOUT = 50 -READ_TIMEOUT = 50 -TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) - - -class KubeRayAPIs: - """ - This class implements KubeRay APIs based on the API server. 
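
As a minimal, hedged usage sketch of the `KubeRayAPIs` client defined in the removed `kuberay_apis.py` above (illustration only, not code from this repository), the following constructs the client with the documented defaults and lists compute templates:

```python
from kfp_support.api_server_client import KubeRayAPIs

# Build the client; these keyword arguments mirror the constructor shown above.
apis = KubeRayAPIs(
    server_url="http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
    http_retries=5,
    wait_interval=2,
)

# list_compute_templates() returns (status, message, templates); message is None on success.
status, message, templates = apis.list_compute_templates()
if status // 100 == 2:
    for template in templates or []:
        print(template.to_string())
else:
    print(f"list_compute_templates failed, status={status}, message={message}")
```
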
- To create a class, the following parameters are required: - base - the URL of the API server (default is set to the standalone API server) - wait interval - the amount of sec to wait between checking for cluster ready - """ - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - token: str = None, - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initializer - :param server_url: API server url - default assuming running it inside the cluster - :param token: token, only used for API server with security enabled - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.server_url = server_url - if token is not None: - _headers["Authorization"] = token - self.wait_interval = wait_interval - self.api_base = "/apis/v1/" - self.http_retries = http_retries - - def list_compute_templates(self) -> tuple[int, str, list[Template]]: - """ - List compute templates across all namespaces of the k8 cluster - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + "compute_templates" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) - else: - logger.warning(f"Failed to list compute templates, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list compute templates, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Template]]: - """ - List compute templates across for a given namespaces of the k8 cluster - :param ns: namespace to query - :return: return tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) - else: - logger.warning( - f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list compute templates for namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def get_compute_template(self, ns: str, name: str) -> tuple[int, str, Template]: - """ - get a compute template - :param ns: namespace - :param name: template name - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return 
response.status_code, None, template_decoder(response.json()) - else: - logger.warning( - f"Failed to get compute template {name} for namespace {ns}, status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get compute template {name} for namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def create_compute_template(self, template: Template) -> tuple[int, str]: - """ - Create a compute template - :param template - definition of a template - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{template.namespace}/compute_templates" - for i in range(self.http_retries): - try: - response = requests.post(url, json=template.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning(f"Failed to create compute template, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to create compute template, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def delete_compute_template(self, ns: str, name: str) -> tuple[int, str]: - """ - delete a compute template - :param ns: namespace - :param name: template name - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - elif response.status_code == 404: - # not found - no need to retry - return response.status_code, response.json()["message"] - else: - logger.warning(f"Failed to delete compute template, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to delete compute template, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def list_clusters(self) -> tuple[int, str, list[Cluster]]: - """ - List clusters across all namespaces of the k8 cluster - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of clusters - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + "clusters" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, clusters_decoder(response.json()) - else: - logger.warning(f"Failed to list cluster, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list cluster, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_clusters_namespace(self, ns: str) -> tuple[int, str, list[Cluster]]: - """ - List clusters across 
for a given namespaces of the k8 cluster - :param ns: namespace to query - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of clusters - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, clusters_decoder(response.json()) - else: - logger.warning(f"Failed to list clusters in namespace {ns}, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list clusters in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def get_cluster(self, ns: str, name: str) -> tuple[int, str, Cluster]: - """ - get cluster - :param ns: namespace - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - clusters definition - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, cluster_decoder(response.json()) - else: - logger.warning(f"Failed to get cluster {name} in namespace {ns}, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def create_cluster(self, cluster: Cluster) -> tuple[int, str]: - """ - create cluster - :param cluster: cluster definition - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{cluster.namespace}/clusters" - for i in range(self.http_retries): - try: - response = requests.post(url, json=cluster.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning(f"Failed to create cluster , status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to create cluster , exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def get_cluster_status(self, ns: str, name: str) -> tuple[int, str, str]: - """ - get cluster status - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - cluster status - """ - # Execute HTTP request - status, error, cluster = self.get_cluster(ns=ns, name=name) - # Check execution status - if status // 100 != 2: - return status, error, None - cluster_status = "creating" - if cluster.cluster_status is not None: - cluster_status = cluster.cluster_status - return status, None, cluster_status - - def wait_cluster_ready(self, ns: str, name: str, 
wait: int = -1) -> tuple[int, str]: - """ - wait for cluster to be ready - :param ns: namespace of the cluster - :param name: name of the cluster - :param wait: wait time (-1 waits forever) - :returns: A tuple containing - http return code - message - only returned if http return code is not equal to 200 - cluster status - """ - current_wait = 0 - while True: - status, error, c_status = self.get_cluster_status(ns=ns, name=name) - # Check execution status - if status // 100 != 2: - return status, error - if c_status == "ready": - return status, None - if current_wait > wait > 0: - return 408, f"Timed out waiting for cluster ready in {current_wait} sec" - time.sleep(self.wait_interval) - current_wait += self.wait_interval - - def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int, str, str]: - """ - get cluster endpoint - :param ns: namespace of the cluster - :param name: name of the cluster - :param wait: wait time (-1 waits forever) for cluster to be ready - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - endpoint (service for dashboard endpoint) - """ - # Ensure that the cluster is ready - status, error = self.wait_cluster_ready(ns=ns, name=name, wait=wait) - if status // 100 != 2: - return status, error, None - # Get cluster - status, error, cluster = self.get_cluster(ns=ns, name=name) - if status // 100 != 2: - return status, error, None - return status, None, f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" - - def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: - """ - delete cluster - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers) - if response.status_code // 100 == 2: - return response.status_code, None - elif response.status_code == 404: - # not found - no need to retry - return response.status_code, response.json()["message"] - else: - logger.warning(f"Failed to delete cluster , status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to delete cluster , exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def submit_job(self, ns: str, name: str, job_request: RayJobRequest) -> tuple[int, str, str]: - """ - submit Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param job_request: job submission - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" - for i in range(self.http_retries): - try: - response = requests.post(url, json=job_request.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, response.json()["submissionId"] - else: - logger.warning( - f"Failed to submit job to the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = 
response.json()["message"] - except Exception as e: - logger.warning(f"Failed to submit job to the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(5) - return status, message, None - - def get_job_info(self, ns: str, name: str, sid: str) -> tuple[int, str, RayJobInfo]: - """ - get Ray job details - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - RayJobInfo object - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, RayJobInfo(response.json()) - else: - logger.warning( - f"Failed to get job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]]: - """ - list Ray job details - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of RayJobInfo object - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - job_info_array = response.json().get("submissions", None) - if job_info_array is None: - return response.status_code, None, [] - else: - return response.status_code, None, [RayJobInfo(i) for i in job_info_array] - else: - logger.warning( - f"Failed to list jobs from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list jobs from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(5) - return status, message, [] - - def get_job_log(self, ns: str, name: str, sid: str) -> tuple[int, str, str]: - """ - get Ray job log - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - log - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/log/{sid}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, response.json().get("log", "") - else: - logger.warning( - f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - 
except Exception as e: - logger.warning( - f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, exception : {e}" - ) - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def stop_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: - """ - stop Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.post(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning( - f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def delete_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: - """ - delete Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning( - f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py deleted file mode 100644 index e5a7d70fa..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -from kfp_support.api_server_client.params.templates import ( - TolerationOperation, - TolerationEffect, - Toleration, - Template, - toleration_decoder, - template_decoder, - templates_decoder, -) -from kfp_support.api_server_client.params.volumes import ( - HostPath, - MountPropagationMode, - AccessMode, - BaseVolume, - HostPathVolume, - PVCVolume, - EphemeralVolume, - EmptyDirVolume, - ConfigMapVolume, - SecretVolume, - volume_decoder, -) -from kfp_support.api_server_client.params.environmentvariables import ( - EnvVarSource, - EnvVarFrom, - EnvironmentVariables, - env_var_from_decoder, - environment_variables_decoder, -) -from kfp_support.api_server_client.params.headnode import ( - ServiceType, - HeadNodeSpec, - DEFAULT_HEAD_START_PARAMS, - head_node_spec_decoder, -) -from 
kfp_support.api_server_client.params.workernode import ( - WorkerNodeSpec, - DEFAULT_WORKER_START_PARAMS, - worker_node_spec_decoder, -) -from kfp_support.api_server_client.params.cluster import ( - Environment, - AutoscalerOptions, - ClusterSpec, - ClusterEvent, - Cluster, - UpscalingMode, - autoscaling_decoder, - cluster_spec_decoder, - cluster_decoder, - clusters_decoder, -) -from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py deleted file mode 100644 index 922a14bef..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py +++ /dev/null @@ -1,475 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - HeadNodeSpec, - WorkerNodeSpec, - environment_variables_decoder, - head_node_spec_decoder, - volume_decoder, - worker_node_spec_decoder, -) - - -class Environment(enum.Enum): - """ - Environment definitions - """ - - DEV = 0 # development - TESTING = 1 # testing - STAGING = 2 # staging - PRODUCTION = 3 # production - - -class UpscalingMode(enum.Enum): - """ - Enumeration of autoscaling mode - """ - - Conservative = ( - "Conservative" # Rate-limited; the number of pending worker pods is at most the size of the Ray cluster - ) - Default = "Default" # no rate limitations - Aggressive = "Aggressive" # same as default - - -class AutoscalerOptions: - """ - AutoscalerOptions is used to define Ray cluster autoscaling. - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create autoscaling options specification: gets the following parameters: - idle_timeout - optional, number of seconds to wait before scaling down a worker pod which is not using Ray - resources. Default 60sec (one minute). - upscaling_mode - required autoscaler upscaling mode - image - optional, allows to override the autoscaler's container image - image_pull_policy - optional, allows to override the autoscaler's container image pull policy - cpus - optional, CPUs requirements for autoscaler - default "500m" - memory - optional, memory requirements for autoscaler - default "512Mi" - environment - optional, environment variables for autoscaler container - volumes - optional, a list of volumes to attach to autoscaler container. - This is needed for enabling TLS for the autoscaler container. 
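
To illustrate the autoscaler options listed in this docstring, a hypothetical `AutoscalerOptions` instance might be assembled as below; the timeout and resource values are invented for illustration:

```python
from kfp_support.api_server_client.params import AutoscalerOptions, UpscalingMode

# Conservative upscaling, remove idle workers after five minutes, and give the
# autoscaler container a modest CPU/memory footprint (placeholder values).
autoscaler = AutoscalerOptions(
    upscaling_mode=UpscalingMode.Conservative,
    idle_tmout=300,     # seconds before an idle worker pod is scaled down
    cpus="500m",
    memory="512Mi",
)

print(autoscaler.to_string())
# to_dict() yields the API payload, e.g.
# {'upscalingMode': 'Conservative', 'idleTimeoutSeconds': 300, 'cpu': '500m', 'memory': '512Mi'}
print(autoscaler.to_dict())
```
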
- """ - - def __init__( - self, - upscaling_mode: UpscalingMode = UpscalingMode.Default, - idle_tmout: int = None, - image: str = None, - image_pull_policy: str = None, - cpus: str = None, - memory: str = None, - environment: EnvironmentVariables = None, - volumes: list[BaseVolume] = None, - ): - """ - Initialization - :param upscaling_mode: upscale mode - :param idle_tmout: idle timeout - :param image: image - :param image_pull_policy: image pull policy - :param cpus: cpu requirement for autoscaling - :param memory: memory requirement for autoscaling - :param environment: autoscaler environment - :param volumes: volumes for autoscaler - """ - self.upscaling_mode = upscaling_mode - self.idle_tmout = idle_tmout - self.image = image - self.image_pull_policy = image_pull_policy - self.cpus = cpus - self.memory = memory - self.environment = environment - self.volumes = volumes - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of the head node - """ - val = f"upscaling_mode = {self.upscaling_mode}" - if self.idle_tmout is not None: - val += f", idle_timeout = {self.idle_tmout}" - if self.image is not None: - val += f", image = {self.image}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.cpus is not None: - val += f", cpus = {self.cpus}" - if self.memory is not None: - val += f", memory = {self.memory}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of the head node - """ - dct = {"upscalingMode": self.upscaling_mode.value} - if self.idle_tmout is not None: - dct["idleTimeoutSeconds"] = self.idle_tmout - if self.image is not None: - dct["image"] = self.image - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.cpus is not None: - dct["cpu"] = self.cpus - if self.memory is not None: - dct["memory"] = self.memory - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["envs"] = self.environment.to_dict() - return dct - - -class ClusterSpec: - """ - ClusterSpec is used to define Ray cluster. - It provides APIs to create, stringify, convert to dict and json. 
- - Methods: - - Create cluster spec from: gets the following parameters: - head_group_spec - required, specification of the head node - worker_group_spec - optional, list of worker group specs - autoscaler_options - optional, autoscaling options - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__( - self, - head_node: HeadNodeSpec, - worker_groups: list[WorkerNodeSpec] = None, - autoscaling_options: AutoscalerOptions = None, - ): - """ - Initialization - :param head_node - head node definition - :param worker_groups - worker group definition - :param autoscaling_options - autoscaler options - """ - self.head_node = head_node - self.worker_groups = worker_groups - self.autoscaling_options = autoscaling_options - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of cluster spec - """ - val = f"head_group_spec: {self.head_node.to_string()}" - if self.worker_groups is not None: - val += "\nworker groups: " - for w in self.worker_groups: - val += f"\nworker_group_spec = {w.to_string()}]" - if self.autoscaling_options is not None: - val += f"\nautoscaling options = {self.autoscaling_options.to_string()}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: Dictionary representation of cluster spec - """ - dst = {"headGroupSpec": self.head_node.to_dict()} - if self.worker_groups is not None: - dst["workerGroupSpec"] = [w.to_dict() for w in self.worker_groups] - if self.autoscaling_options is not None: - dst["enableInTreeAutoscaling"] = True - dst["autoscalerOptions"] = self.autoscaling_options.to_dict() - return dst - - -class ClusterEvent: - """ - Cluster event is used to define events emitted during cluster creation. - It provides APIs to create and stringify. Its output only data, so we do not need to implement to_dict - - Methods: - - Create event: gets the dictionary with the following parameters: - id - unique Event Id - name - human readable event name - created_at - event creation time - first_timestamp - first time the event occur - last_timestamp - last time the event occur - reason - reason for the transition into the object's current status - message - human-readable description of the status of this operation - type - type of this event (Normal, Warning), new types could be added in the future - count - number of times this event has occurred - """ - - def __init__(self, dst: dict[str, Any]): - """ - Initialization from dictionary - :param dst: dictionary representation of cluster event - """ - self.id = dst.get("id", "") - self.name = dst.get("name", "") - self.created_at = dst.get("created_at", "") - self.first_timestamp = dst.get("first_timestamp", "") - self.last_timestamp = dst.get("last_timestamp", "") - self.reason = dst.get("reason", "") - self.message = dst.get("message", "") - self.type = dst.get("type", "") - self.count = dst.get("count", "0") - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of cluster event - """ - return ( - f"id = {self.id}, name = {self.name}, created_at = {self.created_at}, " - f"first_timestamp = {self.first_timestamp}, last_timestamp = {self.last_timestamp}," - f"reason = {self.reason}, message = {self.message}, type = {self.type}, count = {self.count}" - ) - - -class Cluster: - """ - Cluster is used to define Ray cluster. - It provides APIs to create, stringify, convert to dict and json. 
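
The following hedged sketch shows how the removed wrapper classes fit together: a `HeadNodeSpec` feeds a `ClusterSpec`, which is wrapped in a `Cluster` and submitted through `KubeRayAPIs`. Every name, namespace and image below is a placeholder, not a value from this repository:

```python
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import Cluster, ClusterSpec, HeadNodeSpec

head = HeadNodeSpec(
    compute_template="default-template",   # placeholder compute template name
    image="rayproject/ray:2.9.3-py310",    # placeholder head-node image
)
cluster = Cluster(
    name="example-cluster",                # placeholder cluster name
    namespace="kuberay",                   # placeholder namespace
    user="example-user",
    version="2.9.3",
    cluster_spec=ClusterSpec(head_node=head),
)

apis = KubeRayAPIs()                       # default in-cluster API server URL
status, message = apis.create_cluster(cluster)
if status // 100 == 2:
    # Block until the head node reports "ready", or give up after 5 minutes.
    status, message = apis.wait_cluster_ready(ns="kuberay", name="example-cluster", wait=300)
print(status, message)
```
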
- - Methods: - - Create env variable from: gets the following parameters: - name - required, unique (per namespace) cluster name - namespace - required, cluster's namespace (should exist) - user - required, user who owns the cluster - version - required, Ray cluster version - typically Ray version - deployment_environment - optional (see Environment) - cluster_spec - required, ray cluster configuration - annotations - optional, annotations, for example, "kubernetes.io/ingress.class" to define Ingress class - cluster_environment - optional, cluster environment variables - created_at - output, cluster creation ts - deleted_at - output, cluster deletion ts - cluster_status - output, cluster status - events - output, cluster events - service_endpoint - output, cluster service endpoints - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__( - self, - name: str, - namespace: str, - user: str, - version: str, - cluster_spec: ClusterSpec, - deployment_environment: Environment = None, - annotations: dict[str, str] = None, - cluster_environment: EnvironmentVariables = None, - created_at: str = None, - deleted_at: str = None, - cluster_status: str = None, - events: list[ClusterEvent] = None, - service_endpoint: dict[str, str] = None, - ): - """ - Initialization - :param name: cluster name - :param namespace: cluster namespace - :param user: user name - :param version: version - :param cluster_spec: cluster spec - :param deployment_environment: cluster deployment environment - :param annotations: cluster annotations - :param cluster_environment: cluster environment - :param created_at: created at - :param deleted_at: deleted at - :param cluster_status: status - :param events: cluster events - :param service_endpoint: service endpoint - """ - self.name = name - self.namespace = namespace - self.user = user - self.version = version - self.cluster_spec = cluster_spec - self.environment = deployment_environment - self.annotations = annotations - self.envs = cluster_environment - self.created_at = created_at - self.deleted_at = deleted_at - self.cluster_status = cluster_status - self.events = events - self.service_endpoint = service_endpoint - - def to_string(self) -> str: - """ - convert to string representation - :return: string representation of cluster - """ - val = ( - f"name: {self.name}, namespace = {self.namespace}, user = {self.user}, version = {self.version} " - f"cluster_spec = {self.cluster_spec.to_string()}" - ) - if self.environment is not None: - val += f"deployment environment = {self.environment.name}" - if self.annotations is not None: - val += f" ,annotations = {str(self.annotations)}" - if self.envs is not None: - val = val + f",cluster environment = {self.envs.to_string()}" - val += "\ncluster output\n" - if self.created_at is not None: - val += f" ,created_at = {self.created_at}" - if self.deleted_at is not None: - val += f" ,deleted_at = {self.deleted_at}" - if self.cluster_status is not None: - val += f" ,cluster status = {self.cluster_status}" - if self.events is not None: - val = val + ",\n cluster events = [" - first = True - for e in self.events: - if first: - first = False - else: - val += ", " - val = val + "{" + e.to_string() + "}" - val = val + "]" - if self.service_endpoint is not None: - val += f" ,service endpoints = {str(self.service_endpoint)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - convert to dictionary - :return: dictionary representation of cluster - """ - # only 
convert input variables - dst = { - "name": self.name, - "namespace": self.namespace, - "user": self.user, - "version": self.version, - "clusterSpec": self.cluster_spec.to_dict(), - } - if self.environment is not None: - dst["environment"] = self.environment.value - if self.annotations is not None: - dst["annotations"] = self.annotations - if self.envs is not None: - dst["envs"] = self.envs.to_dict() - return dst - - -""" - Creates new cluster from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def autoscaling_decoder(dct: dict[str, Any]) -> AutoscalerOptions: - """ - Create autoscaling options from its dictionary representation - :param dct: dictionary representation of cluster spec - :return: autoscaling options - """ - upscaling_mode = UpscalingMode.Default - if "upscalingMode" in dct: - upscaling_mode = UpscalingMode(dct.get("upscalingMode")) - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("envs")) > 0: - environments = environment_variables_decoder(dct.get("envs")) - return AutoscalerOptions( - upscaling_mode=upscaling_mode, - idle_tmout=dct.get("idleTimeoutSeconds", None), - image=dct.get("image", None), - image_pull_policy=dct.get("imagePullPolicy", None), - cpus=dct.get("cpu", None), - memory=dct.get("memory", None), - environment=environments, - volumes=volumes, - ) - - -def cluster_spec_decoder(dct: dict[str, Any]) -> ClusterSpec: - """ - Create cluster spec from its dictionary representation - :param dct: dictionary representation of cluster spec - :return: cluster spec - """ - workers = None - autoscaling_options = None - if "workerGroupSpec" in dct: - workers = [worker_node_spec_decoder(w) for w in dct["workerGroupSpec"]] - if "enableInTreeAutoscaling" in dct and dct.get("enableInTreeAutoscaling"): - autoscaling_options = autoscaling_decoder(dct.get("autoscalerOptions", {})) - return ClusterSpec( - head_node=head_node_spec_decoder(dct.get("headGroupSpec")), - worker_groups=workers, - autoscaling_options=autoscaling_options, - ) - - -def cluster_decoder(dct: dict[str, Any]) -> Cluster: - """ - Create cluster from its dictionary representation - :param dct: dictionary representation of cluster - :return: cluster - """ - environment = None - if "environment" in dct: - environment = Environment(int(dct.get("environment", "0"))) - events = None - if "events" in dct: - events = [ClusterEvent(c) for c in dct["events"]] - envs = None - if "envs" in dct: - envs = environment_variables_decoder(dct.get("envs")) - return Cluster( - name=dct.get("name", ""), - namespace=dct.get("namespace", ""), - user=dct.get("user", ""), - version=dct.get("version", ""), - cluster_spec=cluster_spec_decoder(dct.get("clusterSpec")), - deployment_environment=environment, - annotations=dct.get("annotations"), - cluster_environment=envs, - created_at=dct.get("createdAt"), - deleted_at=dct.get("deletedAt"), - cluster_status=dct.get("clusterState"), - events=events, - service_endpoint=dct.get("serviceEndpoint"), - ) - - -def clusters_decoder(dct: dict[str, any]) -> list[Cluster]: - """ - Create list of clusters from its dictionary representation - :param dct: dictionary representation of a list of clusters - :return: list of clusters - """ - return [cluster_decoder(cluster) for cluster in dct["clusters"]] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py deleted file mode 100644 index d1056f6f6..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py +++ /dev/null @@ -1,158 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class EnvVarSource(enum.Enum): - """ - Enumeration of environment sources - """ - - CONFIGMAP = 0 # config map - SECRET = 1 # secret - RESOURCE_FIELD = 2 # resource field - FIELD = 3 # field - - -class EnvVarFrom: - """ - EnvVarFrom is used to define an environment variable from one of the sources (EnvarSource). - It provides APIs to create, stringify, convert to dict and json. - - Methods: - - Create env variable from: gets the following parameters: - Source required - source of environment variable - name required name for config map or secret, container name for resource, path for field - key required Key for config map or secret, resource name for resource - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, source: EnvVarSource, name: str, key: str): - """ - Initialize - :param source - source - :param name source name - :param key source key - """ - self.source = source - self.name = name - self.key = key - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of environment from - """ - return f"source = {self.source.name}, name = {self.name}, key = {self.key}" - - def to_dict(self) -> dict[str, Any]: - """ - convert to dictionary - :return: dictionary representation of environment from - """ - return {"source": self.source.value, "name": self.name, "key": self.key} - - -class EnvironmentVariables: - """ - EnvironmentVariables is used to define environment variables. - It provides APIs to create, stringify, convert to dict and json. 
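
A short illustrative sketch of the two environment classes described here; the secret name and key are invented:

```python
from kfp_support.api_server_client.params import (
    EnvironmentVariables,
    EnvVarFrom,
    EnvVarSource,
)

envs = EnvironmentVariables(
    key_value={"RAY_LOG_LEVEL": "INFO"},          # plain key/value environment variables
    from_ref={
        # Value pulled from a Kubernetes secret (names are placeholders).
        "AWS_SECRET_ACCESS_KEY": EnvVarFrom(
            source=EnvVarSource.SECRET,
            name="s3-credentials",
            key="secret_key",
        )
    },
)

# Serialized form used in cluster and head-node payloads:
# {'values': {...}, 'valuesFrom': {'AWS_SECRET_ACCESS_KEY': {'source': 1, 'name': ..., 'key': ...}}}
print(envs.to_dict())
```
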
- - Methods: - - Create env variable from: gets the following parameters: - key_value - optional, dictionary of key/value environment variables - from_ref - optional, dictionary of reference environment variables - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, key_value: dict[str, str] = None, from_ref: dict[str, EnvVarFrom] = None): - """ - Initialization - :param key_value: dictionary of key/value pairs for environment variables - :param from_ref: dictionary of key/value pairs for environment from variables - """ - self.key_val = key_value - self.from_ref = from_ref - - def to_string(self) -> str: - """ - convert to string - :return: string representation of environment variables - """ - val = "" - if self.key_val is not None: - val = f"values = {str(self.key_val)}" - if self.from_ref is not None: - if val != "": - val += " , " - val += "valuesFrom = {" - first = True - for k, v in self.from_ref.items(): - if not first: - val += ", " - else: - first = False - val += f"{k} = [{v.to_string()}]" - val += "}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of environment variables - """ - dst = {} - if self.key_val is not None: - dst["values"] = self.key_val - if self.from_ref is not None: - fr = {} - for k, v in self.from_ref.items(): - fr[k] = v.to_dict() - dst["valuesFrom"] = fr - return dst - - -""" - Creates new environment variable from from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: - """ - Create environment from from dictionary - :param dct: dictionary representations of environment from - :return: environment from - """ - return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) - - -def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: - """ - Create environment variables from from dictionary - :param dct: dictionary representations of environment variables - :return: environment variables - """ - keyvalues = None - fr = None - if "values" in dct: - keyvalues = dct.get("values") - if "valuesFrom" in dct: - from_ref = dct.get("valuesFrom") - fr = {} - for k, v in from_ref.items(): - fr[k] = env_var_from_decoder(v) - return EnvironmentVariables(key_value=keyvalues, from_ref=fr) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py deleted file mode 100644 index 7a9d4120f..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py +++ /dev/null @@ -1,202 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import enum -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - environment_variables_decoder, - volume_decoder, -) - - -DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} - - -class ServiceType(enum.Enum): - """ - Enumeration of head node service types - """ - - ClusterIP = "ClusterIP" # cluster IP - NodePort = "NodePort" # node port - LoadBalancer = "LoadBalancer" # load balancer - - -class HeadNodeSpec: - """ - HeadNodeSpec is used to define Ray cluster head node configuration. - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create head node specification: gets the following parameters: - compute_template - required, the computeTemplate of head node group - ray_start_params - required, Ray start parameters - image - optional, image used for head node - service_type - optional (ServiceType), service type foe headnode - enable_ingress - optional, allow to enable ingress for dashboard - volumes - optional, a list of volumes to attach to head node - service_account - optional, a service account (has to exist) to run head node - image_pull_secret - optional, secret to pull head node image from registry - environment - optional, environment variables for head pod - annotations - optional, annotations for head node - labels - optional, labels for head node - image_pull_policy - optional, head node pull image policy. Default IfNotPresent - """ - - def __init__( - self, - compute_template: str, - image: str, - ray_start_params: dict[str, str] = DEFAULT_HEAD_START_PARAMS, - service_type: ServiceType = ServiceType.ClusterIP, - enable_ingress: bool = False, - volumes: list[BaseVolume] = None, - service_account: str = None, - image_pull_secret: str = None, - environment: EnvironmentVariables = None, - annotations: dict[str, str] = None, - labels: dict[str, str] = None, - image_pull_policy: str = None, - ): - """ - Initialization - :param compute_template: compute template - :param ray_start_params: ray start parameters - :param image: node image - :param service_type: service type - :param enable_ingress: enable ingress flag - :param volumes: volumes for head node - :param service_account: service account - :param image_pull_secret: image pull secret - :param environment: head node environment - :param annotations: head node annotation - :param labels: labels - :param image_pull_policy: image pull policy - """ - - self.compute_template = compute_template - self.ray_start_params = ray_start_params - self.ray_start_params.update(DEFAULT_HEAD_START_PARAMS) - self.image = image - self.service_type = service_type - self.enable_ingress = enable_ingress - self.volumes = volumes - self.service_account = service_account - self.image_pull_secret = image_pull_secret - self.environment = environment - self.annotations = annotations - self.labels = labels - self.image_pull_policy = image_pull_policy - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of the head node - """ - val = f"compute template = {self.compute_template}, ray start params = {str(self.ray_start_params)}" - if self.image is not None: - val += f", image = {self.image}" - if self.service_type is not None: - val += f", service_type = {self.service_type.name}" - if self.enable_ingress: - val += ", enable_ingress = True" - if self.service_account is not None: - val += f", 
service_account = {self.service_account}" - if self.image_pull_secret is not None: - val += f", image_pull_secret = {self.image_pull_secret}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - if self.annotations is not None: - val = val + f",\n annotations = {str(self.annotations)}" - if self.labels is not None: - val = val + f",\n labels = {str(self.labels)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of the head node - """ - dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} - if self.image is not None: - dct["image"] = self.image - if self.service_type is not None: - dct["serviceType"] = self.service_type.value - if self.enable_ingress: - dct["enableIngress"] = True - if self.service_account is not None: - dct["service_account"] = self.service_account - if self.image_pull_secret is not None: - dct["image_pull_secret"] = self.image_pull_secret - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["environment"] = self.environment.to_dict() - if self.annotations is not None: - dct["annotations"] = self.annotations - if self.labels is not None: - dct["labels"] = self.labels - return dct - - -""" - Creates new head node from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def head_node_spec_decoder(dct: dict[str, Any]) -> HeadNodeSpec: - """ - Create head node spec from dictionary - :param dct: dictionary representation of head node spec - :return: Head node spec - """ - service_type = None - if "serviceType" in dct: - service_type = ServiceType(dct.get("serviceType", "ClusterIP")) - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("environment")) > 0: - environments = environment_variables_decoder(dct.get("environment")) - return HeadNodeSpec( - compute_template=dct.get("computeTemplate"), - ray_start_params=dct.get("rayStartParams"), - image=dct.get("image"), - service_type=service_type, - enable_ingress=dct.get("enableIngress", False), - volumes=volumes, - service_account=dct.get("service_account", None), - image_pull_secret=dct.get("imagePullSecret", None), - image_pull_policy=dct.get("imagePullPolicy", None), - environment=environments, - annotations=dct.get("annotations", None), - labels=dct.get("labels", None), - ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py deleted file mode 100644 index a0b2bfcb0..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py +++ /dev/null @@ -1,163 +0,0 @@ -# (C) Copyright IBM Corp. 2024. 
-# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import datetime -from typing import Any - - -class RayJobRequest: - """ - RayJobRequest used to define job to be submitted to a Ray cluster - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create RayJobRequest: gets the following parameters: - entrypoint - required, the command to start a job on the cluster - submission_id - optional, submission id for the job submission - runtime_env - optional, yaml string specifying job runtime environment - metadata - optional, dictionary of the submission metadata - num_cpus - optional, number of cpus for job execution - num_gpus - optional, number of gpus for job execution - resources - optional, dictionary of the resources for job execution - """ - - def __init__( - self, - entrypoint: str, - submission_id: str = None, - runtime_env: str = None, - metadata: dict[str, str] = None, - num_cpu: float = -1.0, - num_gpu: float = -1.0, - resources: dict[str, str] = None, - ): - """ - Initialization see https://docs.ray.io/en/latest/cluster/running-applications/job-submission/api.html - :param entrypoint: entrypoint - :param submission_id: submission id - :param runtime_env: runtime environment - :param metadata: submission metadata - :param num_cpu: job number cpus - :param num_gpu: job number gpus - :param resources: job custom resources - """ - self.entrypoint = entrypoint - self.submission_id = submission_id - self.runtime_env = runtime_env - self.metadata = metadata - self.num_cpu = num_cpu - self.num_gpu = num_gpu - self.resources = resources - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of job submission - """ - val = f"entrypoint = {self.entrypoint}" - if self.submission_id is not None: - val += f", submission_id = {self.submission_id}" - if self.num_cpu > 0: - val += f", num_cpu = {self.num_cpu}" - if self.num_gpu > 0: - val += f", num_gpu = {self.num_gpu}" - if self.runtime_env is not None: - val += f", runtime_env = {self.runtime_env}" - if self.metadata is not None: - val += f", metadata = {self.metadata}" - if self.resources is not None: - val += f", resources = {self.resources}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of job submission - """ - dct = {"entrypoint": self.entrypoint} - if self.submission_id is not None: - dct["submissionId"] = self.submission_id - if self.runtime_env is not None: - dct["runtimeEnv"] = self.runtime_env - if self.metadata is not None: - dct["metadata"] = self.metadata - if self.num_cpu > 0: - dct["numCpus"] = self.num_cpu - if self.num_gpu > 0: - dct["numGpus"] = self.num_gpu - if self.resources is not None: - dct["resources"] = self.resources - return dct - - -class RayJobInfo: - """ - RayJobInfo used to define information about the job in a Ray cluster - It provides APIs to create and stringify. 
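
Tying the job-submission pieces together, a hypothetical submission built from `RayJobRequest` and the `KubeRayAPIs` client might look like the following; the namespace, cluster name and entrypoint are placeholders:

```python
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import RayJobRequest

apis = KubeRayAPIs()                       # default in-cluster API server URL

job = RayJobRequest(
    entrypoint="python ray_job.py",        # placeholder entrypoint
    num_cpu=1.0,
    metadata={"submitted_by": "example"},
)

status, message, submission_id = apis.submit_job(
    ns="kuberay", name="example-cluster", job_request=job
)
if status // 100 == 2:
    # Fetch job details for the returned submission id.
    status, message, info = apis.get_job_info(
        ns="kuberay", name="example-cluster", sid=submission_id
    )
    print(info.to_string() if info else message)
```
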
Its output only data, so we do not need to implement to_dict - - Methods: - - Create RayJobRequest: gets the following parameters: - entrypoint - the command to start a job on the cluster - job_id - job execution id - submission_id - submission id for the job submission - runtime_env - job runtime environment - status - job execution status - message - status message - start_time - job start time - end-time - job ind time - error_type - type of error - metadata - optional, dictionary of the submission metadata - """ - - def __init__(self, dct: dict[str, Any]): - """ - Initialize from dictionary - :param dct: dictionary representation of Ray job info - """ - self.entrypoint = dct.get("entrypoint", "") - self.job_id = dct.get("jobId", "") - self.submission_id = dct.get("submissionId", "") - self.status = dct.get("status", "") - self.message = dct.get("message", None) - self.start_time = int(dct.get("startTime", "0")) - self.end_time = int(dct.get("endTime", "0")) - self.error_type = dct.get("ErrorType", None) - self.metadata = dct.get("Metadata", None) - self.runtime_env = dct.get("runtimeEnv", None) - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of Ray job info - """ - val = ( - f"entrypoint = {self.entrypoint}, job id {self.job_id}, submission id = {self.submission_id}," - f" status = {self.status}" - ) - if self.message is not None: - val += f" message = {self.message}" - if self.start_time > 0: - val += ( - f" start time = " - f"{datetime.datetime.fromtimestamp(self.start_time /1.e3).strftime('%Y-%m-%d %H:%M:%S')}" - ) - if self.end_time > 0: - val += ( - f" end time = " f"{datetime.datetime.fromtimestamp(self.end_time / 1e3).strftime('%Y-%m-%d %H:%M:%S')}" - ) - if self.error_type is not None: - val += f" error type = {self.error_type}" - if self.runtime_env is not None: - val += f" runtime env = {str(self.runtime_env)}" - if self.metadata is not None: - val += f" metadata = {str(self.metadata)}" - return val diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py deleted file mode 100644 index 0ef4c1583..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py +++ /dev/null @@ -1,224 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class TolerationOperation(enum.Enum): - """ - Toleration operation types - """ - - Exists = "Exists" # exists - Equal = "Equal" # equal - - -class TolerationEffect(enum.Enum): - """ - Toleration effect - """ - - NoSchedule = "NoSchedule" # not schedule - PreferNoSchedule = "PreferNoSchedule" # prefer not schedule - NoExecute = "NoExecute" # not execute - - -class Toleration: - """ - Toleration is used by compute template to pick specific nodes for placing pods. 
- It provides APIs to create, stringify and convert to dict. - - Methods: - - Create toleration: gets the following parameters: - key - required, key created by the node's taint - operator - required, operator to apply, supported operators are "Exists" and "Equal" - effect - required, toleration effect supported effects are "NoSchedule", "PreferNoSchedule", "NoExecute" - value - optional, value - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None): - """ - Initialization - :param key: key - :param operator: operator - :param effect: effect - :param value: value - """ - self.key = key - self.operator = operator - self.value = value - self.effect = effect - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of toleration - """ - val = f"key = {self.key}, operator = {self.operator.name}, effect = {self.effect.name}" - if self.value is None: - return val - else: - return val + f", value = {self.value}" - - def to_dict(self) -> dict[str, Any]: - """ - Convert to string - :return: string representation of toleration - """ - dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value} - if self.value is not None: - dct["value"] = self.value - return dct - - -# Here the default gpu-accelerator is "nvidia.com/gpu", that is used for generating limits. -# If it is specified, it has to be in the format that is understood by kubernetes as a valid -# The following devices are currently supported by kubernetes: -# AMD - gpu accelerator amd.com/gpu -# Intel - gpu accelerator gpu.intel.com/i915 -# NVIDIA - gpu accelerator nvidia.com/gpu - - -class Template: - """ - Template is used to define specific nodes configuration. - It provides APIs to create, stringify and convert to dict. 
- - Methods: - - Create templates: gets the following parameters: - name - required, template name - namespace - required, template namespace - cpus - required, template number of cpus - memory - required, template memory (GB) - gpus - optional, number of GPUs, default 0 - gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed - tolerations - optional, tolerations for pod placing, default none - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - - to_json() -> str convert to json string - """ - - def __init__( - self, - name: str, - namespace: str, - cpu: int, - memory: int, - gpu: int = 0, - gpu_accelerator: str = None, - tolerations: list[Toleration] = None, - ): - """ - Initialization - :param name: name - :param namespace: namespace - :param cpu: cpu - :param memory: memory - :param gpu: gpu - :param gpu_accelerator: accelerator type - :param tolerations: tolerations - """ - self.name = name - self.namespace = namespace - self.cpu = cpu - self.memory = memory - self.gpu = gpu - self.gpu_accelerator = gpu_accelerator - self.tolerations = tolerations - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of template - """ - val = f"name = {self.name}, namespace = {self.namespace}, cpu = {self.cpu}, memory = {self.memory}" - if self.gpu > 0: - val = val + f", gpu {self.gpu}" - if self.gpu_accelerator is not None: - val = val + f", gpu accelerator {self.gpu_accelerator}" - if self.tolerations is None: - return val - val = val + ", tolerations [" - first = True - for tol in self.tolerations: - if first: - first = False - val = val + "{" + tol.to_string() + "}" - else: - val = val + ", {" + tol.to_string() + "}" - return val + "]" - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of template - """ - dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} - if self.gpu > 0: - dct["gpu"] = self.gpu - if self.gpu_accelerator is not None: - dct["gpu accelerator"] = self.gpu_accelerator - if self.tolerations is not None: - dct["tolerations"] = [tl.to_dict() for tl in self.tolerations] - return dct - - -""" - Creates new toleration from dictionary, used for unmarshalling json. 
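The Toleration and Template classes above combine into a compute template definition for the API server. A small sketch, importing directly from the params.templates module; the names, sizes and the taint key are invented.

```python
from kfp_support.api_server_client.params.templates import (
    Template,
    Toleration,
    TolerationEffect,
    TolerationOperation,
)

# Tolerate a dedicated GPU node taint (key is illustrative)
gpu_toleration = Toleration(
    key="nvidia.com/gpu",
    operator=TolerationOperation.Exists,
    effect=TolerationEffect.NoSchedule,
)

# A compute template for worker pods: 8 CPUs, 32 GB of memory, one GPU
template = Template(
    name="gpu-worker-template",
    namespace="kuberay",
    cpu=8,
    memory=32,
    gpu=1,                        # gpu_accelerator defaults to nvidia.com/gpu
    tolerations=[gpu_toleration],
)

print(template.to_string())
print(template.to_dict())
```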
Python does not - support multiple constructors, so do it this way -""" - - -def toleration_decoder(dct: dict[str, Any]) -> Toleration: - """ - Create toleration from dictionary - :param dct: dictionary representation of toleration - :return: toleration - """ - return Toleration( - key=dct.get("key"), - operator=TolerationOperation(dct.get("operator", "Exists")), - effect=TolerationEffect(dct.get("effect", "NoSchedule")), - value=dct.get("value"), - ) - - -def template_decoder(dct: dict[str, Any]) -> Template: - """ - Create template from dictionary - :param dct: dictionary representation of template - :return: template - """ - tolerations = None - if "tolerations" in dct: - tolerations = [toleration_decoder(d) for d in dct["tolerations"]] - return Template( - name=dct.get("name"), - namespace=dct.get("namespace"), - cpu=int(dct.get("cpu", "0")), - memory=int(dct.get("memory", "0")), - gpu=int(dct.get("gpu", "0")), - gpu_accelerator=dct.get("gpu_accelerator"), - tolerations=tolerations, - ) - - -def templates_decoder(dct: dict[str, Any]) -> list[Template]: - """ - Create list of template from dictionary - :param dct: dictionary representation of list of template - :return: list of template - """ - return [template_decoder(tmp) for tmp in dct["computeTemplates"]] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py deleted file mode 100644 index fee0e1ea4..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py +++ /dev/null @@ -1,449 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class HostPath(enum.Enum): - """ - Host path enumeration - """ - - DIRECTORY = 0 # directory - FILE = 1 # files - - -class MountPropagationMode(enum.Enum): - """ - Mount propagation enumeration - """ - - NONE = 0 # None - HOSTTOCONTAINER = 1 # host to container - BIDIRECTIONAL = 2 # bi directional - - -class AccessMode(enum.Enum): - """ - Access mode enumeration - """ - - RWO = 0 # read write once - ROX = 1 # read only many - RWX = 2 # read write many - - -class BaseVolume: - """ - KubeRay currently support several types of volumes, including hostPat, PVC, - ephemeral volumes, config maps, secrets and empty dir. All of them use slightly - different parameters. Base Volume is a base class for all different volume types. - """ - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of base volume - """ - raise Exception(f"Base volume cannot be used directly. Pls use one of the derived classes") - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of base volume - """ - raise Exception(f"Base volume cannot be used directly. 
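The decoder functions above rebuild objects from the JSON returned by the API server. A sketch with a hand-written dictionary in that shape; the names and sizes are illustrative.

```python
from kfp_support.api_server_client.params.templates import (
    template_decoder,
    templates_decoder,
)

# A dictionary in the shape the API server returns for a single compute template
template_dict = {
    "name": "default-template",
    "namespace": "kuberay",
    "cpu": 4,
    "memory": 16,
    "tolerations": [
        {"key": "dedicated", "operator": "Equal", "effect": "NoSchedule", "value": "ray"}
    ],
}
restored = template_decoder(template_dict)
print(restored.to_string())

# A list response is unwrapped from the "computeTemplates" field
listing = templates_decoder({"computeTemplates": [template_dict]})
print(len(listing))  # 1
```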
Pls use one of the derived classes") - - -class HostPathVolume(BaseVolume): - """ - This class implements HostPath volume. In addition to name and mount path it requires host - path volume specific parameters: - source - data location on host - hostPathType - host path type: directory (0) or file (1) - mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) - - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - host_path_type: HostPath = None, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param host_path_type: host path type - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.host_path_type = host_path_type - self.volume_type = 1 - self.mount_propagation = mount_propagation - - def to_string(self) -> str: - """ - Convert to string - :return: HostPathVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = hostPath" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - if self.host_path_type is not None: - val += f", host path type = {self.host_path_type.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: HostPathVolume dictionary representation - """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.mount_propagation is not None: - dst["mountPropagationMode"] = self.mount_propagation.value - if self.host_path_type is not None: - dst["hostPathType"] = self.host_path_type.value - return dst - - -class PVCVolume(BaseVolume): - """ - This class implements PVC volume. In addition to name and mount path it requires - PVC volume specific parameters: - source - PVC claim name - read_only - read only flag - mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - read_only: bool = False, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param read_only: read only - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.volume_type = 0 - self.mount_propagation = mount_propagation - self.readonly = read_only - - def to_string(self) -> str: - """ - Convert to string - :return: PVCVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = PVC" - if self.readonly: - val += ", read only = True" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: PVCVolume dictionary representation - """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.readonly: - dst["readOnly"] = True - if self.mount_propagation is not None: - dst["mountPropagationMode"] = self.mount_propagation.value - return dst - - -class EphemeralVolume(BaseVolume): - """ - This class implements Ephemeral volume. 
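A sketch of the two volume types covered so far, HostPathVolume and PVCVolume; the mount paths, host path and claim name are invented.

```python
from kfp_support.api_server_client.params.volumes import (
    HostPath,
    HostPathVolume,
    MountPropagationMode,
    PVCVolume,
)

# Mount a directory from the host node (volumeType 1 in the wire format)
scratch = HostPathVolume(
    name="scratch",
    mount_path="/tmp/scratch",
    source="/data/scratch",
    host_path_type=HostPath.DIRECTORY,
    mount_propagation=MountPropagationMode.NONE,
)

# Mount an existing PersistentVolumeClaim read-only (volumeType 0)
datasets = PVCVolume(
    name="datasets",
    mount_path="/mnt/datasets",
    source="datasets-pvc",   # PVC claim name
    read_only=True,
)

print(scratch.to_dict())
print(datasets.to_dict())
```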
In addition to name and mount path it requires - Ephemeral volume specific parameters: - storage - disk size (valid k8 value, for example 5Gi) - storageClass - storage class - optional, if not specified, use default - accessMode - access mode RWO - optional ReadWriteOnce (0), ReadOnlyMAny (1), ReadWriteMany (2) - mountPropagationMode - optional mount propagation: None (0), host to container (1) or bidirectional (2) - """ - - def __init__( - self, - name: str, - mount_path: str, - storage: str, - storage_class: str = None, - access_mode: AccessMode = None, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param storage: storage - :param storage_class: storage class - :param access_mode: access mode - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.storage = storage - self.volume_type = 2 - self.mount_propagation = mount_propagation - self.storage_class = storage_class - self.access_mode = access_mode - - def to_string(self) -> str: - """ - Convert to string - :return: EphemeralVolume string representation - """ - val = ( - f"name = {self.name}, mount_path = {self.mount_path}, storage = {self.storage} " f"volume type = ephemeral" - ) - if self.storage_class is not None: - val += f", storage class = {self.storage_class}" - if self.access_mode is not None: - val += f", access mode = {self.access_mode.name}" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: EphemeralVolume dictionary representation - """ - dct = { - "name": self.name, - "mountPath": self.mount_path, - "storage": self.storage, - "volumeType": self.volume_type, - } - if self.storage_class is not None: - dct["storageClassName"] = self.storage_class - if self.access_mode is not None: - dct["accessMode"] = self.access_mode.value - if self.mount_propagation is not None: - dct["mountPropagationMode"] = self.mount_propagation.value - return dct - - -class EmptyDirVolume(BaseVolume): - """ - This class implements EmptyDir volume. In addition to name and mount path it requires - Empty Dir specific parameters: - storage - optional max storage size (valid k8 value, for example 5Gi) - """ - - def __init__(self, name: str, mount_path: str, storage: str = None): - """ - Initialization - :param name: name - :param mount_path: mount_path - :param storage: storage - """ - self.name = name - self.mount_path = mount_path - self.storage = storage - self.volume_type = 5 - - def to_string(self) -> str: - """ - Convert to string - :return: EmptyDirVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, volume type = emptyDir" - if self.storage is not None: - val += f", storage = {self.storage}" - return val - - def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} - if self.storage is not None: - dct["storage"] = self.storage - return dct - - -class ConfigMapVolume(BaseVolume): - """ - This class implements ConfigMap volume. 
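EphemeralVolume and EmptyDirVolume can be sketched the same way; the sizes and mount paths below are invented.

```python
from kfp_support.api_server_client.params.volumes import (
    AccessMode,
    EmptyDirVolume,
    EphemeralVolume,
)

# An ephemeral PVC-backed volume: 5Gi, ReadWriteOnce, default storage class
cache = EphemeralVolume(
    name="cache",
    mount_path="/cache",
    storage="5Gi",
    access_mode=AccessMode.RWO,
)

# A plain emptyDir with an optional size cap
tmp = EmptyDirVolume(name="tmp", mount_path="/tmp/work", storage="1Gi")

print(cache.to_dict())   # includes "accessMode": 0 and "volumeType": 2
print(tmp.to_dict())     # "volumeType": 5
```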
In addition to name and mount path it requires - configMap volume specific parameters: - source - required, config map name - items - optional, key/path items (optional) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - items: dict[str, str] = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param items: items - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.items = items - self.volume_type = 3 - - def to_string(self) -> str: - """ - Convert to string - :return: ConfigMapVolume string representation - """ - val = ( - f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = configmap" - ) - if self.items is not None: - val = val + f", items = {str(self.items)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: ConfigMapVolume dictionary representation - """ - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.items is not None: - dct["items"] = self.items - return dct - - -class SecretVolume(BaseVolume): - """ - This class implements Secret volume. In addition to name and mount path it requires - Secret volume specific parameters: - source - required, secret name - items - optional, key/path items (optional) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - items: dict[str, str] = None, - ): - self.name = name - self.mount_path = mount_path - self.source = source - self.items = items - self.volume_type = 4 - - def to_string(self) -> str: - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = secret" - if self.items is not None: - val = val + f", items = {str(self.items)}" - return val - - def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.items is not None: - dct["items"] = self.items - return dct - - -""" - Creates new Volume from dictionary, used for unmarshalling json. 
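Likewise for ConfigMapVolume and SecretVolume; the ConfigMap and Secret names, mount paths and item mappings are invented.

```python
from kfp_support.api_server_client.params.volumes import ConfigMapVolume, SecretVolume

# Project selected keys of a ConfigMap into the pod (volumeType 3)
app_config = ConfigMapVolume(
    name="app-config",
    mount_path="/etc/app",
    source="ray-app-config",                   # ConfigMap name
    items={"settings.yaml": "settings.yaml"},  # key -> path mapping
)

# Mount a Secret the same way (volumeType 4)
s3_creds = SecretVolume(
    name="s3-creds",
    mount_path="/etc/s3",
    source="s3-secret",                        # Secret name
)

print(app_config.to_dict())
print(s3_creds.to_dict())
```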
Python does not - support multiple constructors, so do it this way -""" - - -def volume_decoder(dst: dict[str, Any]) -> BaseVolume: - def _get_mount_propagation() -> MountPropagationMode: - if "mountPropagationMode" in dst: - return MountPropagationMode(int(dst.get("mountPropagationMode", "0"))) - return None - - def _get_host_path() -> HostPath: - if "hostPathType" in dst: - return HostPath(int(dst.get("hostPathType", "0"))) - return None - - def _get_access_mode() -> AccessMode: - if "accessMode" in dst: - return AccessMode(int(dst.get("accessMode", "0"))) - return None - - match dst["volumeType"]: - case 0: - # PVC - return PVCVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - read_only=dst.get("readOnly", False), - mount_propagation=_get_mount_propagation(), - ) - case 1: - # host path - return HostPathVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - host_path_type=_get_host_path(), - mount_propagation=_get_mount_propagation(), - ) - case 2: - # Ephemeral volume - return EphemeralVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - storage=dst.get("storage", ""), - storage_class=dst.get("storageClassName"), - access_mode=_get_access_mode(), - mount_propagation=_get_mount_propagation(), - ) - case 3: - # ConfigMap Volume - return ConfigMapVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - items=dst.get("items"), - ) - case 4: - # Secret Volume - return SecretVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - items=dst.get("items"), - ) - case 5: - # Empty dir volume - return EmptyDirVolume( - name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), storage=dst.get("storage") - ) - case _: - raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py deleted file mode 100644 index ddcf193cc..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py +++ /dev/null @@ -1,206 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - environment_variables_decoder, - volume_decoder, -) - - -DEFAULT_WORKER_START_PARAMS = {"node-ip-address": "$MY_POD_IP"} - - -class WorkerNodeSpec: - """ - WorkerNodeSpec is used to define Ray cluster worker node pool configuration. - It provides APIs to create, stringify and convert to dict. 
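volume_decoder dispatches on the numeric volumeType field (0 PVC, 1 hostPath, 2 ephemeral, 3 configMap, 4 secret, 5 emptyDir). A sketch that decodes a PVC entry as the API server would return it; the field values are invented.

```python
from kfp_support.api_server_client.params.volumes import volume_decoder

# volumeType 0 selects the PVCVolume branch of the decoder
pvc = volume_decoder(
    {
        "volumeType": 0,
        "name": "datasets",
        "mountPath": "/mnt/datasets",
        "source": "datasets-pvc",
        "readOnly": True,
    }
)
print(pvc.to_string())  # "... volume type = PVC, read only = True"
```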
- - Methods: - - Create worker node pool specification: gets the following parameters: - group_name - required, group name of the worker group - compute_template - required, the computeTemplate of worker node group - replicas - required, desired replicas of the worker group - min_replicas - required Min replicas of the worker group, can't be greater than max_replicas - max_replicas - required, max replicas of the worker group - ray_start_params - required, Ray start parameters - image - optional, image used for worker node - volumes - optional, a list of volumes to attach to worker node - service_account - optional, a service account (has to exist) to run worker node - image_pull_secret - optional, secret to pull worker node image from registry - environment - optional, environment variables for worker pod - annotations - optional, annotations for worker node - labels - optional, labels for worker node - image_pull_policy - optional, worker node pull image policy. Default IfNotPresent - """ - - def __init__( - self, - group_name: str, - compute_template: str, - image: str, - max_replicas: int, - replicas: int = 1, - min_replicas: int = 0, - ray_start_params: dict[str, str] = DEFAULT_WORKER_START_PARAMS, - volumes: list[BaseVolume] = None, - service_account: str = None, - image_pull_secret: str = None, - environment: EnvironmentVariables = None, - annotations: dict[str, str] = None, - labels: dict[str, str] = None, - image_pull_policy: str = None, - ): - """ - Initialization - :param group_name: name - :param compute_template: compute template - :param replicas: number of replicas - :param min_replicas: min number of replicas - :param max_replicas: max number of replicas - :param ray_start_params: ray start parameters - :param image: image name - :param volumes: volumes - :param service_account: service account - :param image_pull_secret: image pull secret - :param environment: environment - :param annotations: annotations - :param labels: labels - :param image_pull_policy: image pull policy - """ - # Validate replicas - if min_replicas > replicas: - raise RuntimeError(f"min_replicas {min_replicas} is can't be greater then replicas {replicas} ") - if replicas > max_replicas: - raise RuntimeError(f"replicas {replicas} is can't be greater then max_replicas {max_replicas} ") - - self.group_name = group_name - self.compute_template = compute_template - self.replicas = replicas - self.min_replicas = min_replicas - self.max_replicas = max_replicas - self.ray_start_params = ray_start_params - self.ray_start_params.update(DEFAULT_WORKER_START_PARAMS) - self.image = image - self.volumes = volumes - self.service_account = service_account - self.image_pull_secret = image_pull_secret - self.environment = environment - self.annotations = annotations - self.labels = labels - self.image_pull_policy = image_pull_policy - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of worker node spec - """ - val = ( - f"group_name = {self.group_name}, compute template = {self.compute_template}, " - f"replicas = {self.replicas}, min_replicas = {self.min_replicas}, " - f"max_replicas = {self.max_replicas}, ray start params = {str(self.ray_start_params)}" - ) - if self.image is not None: - val += f", image = {self.image}" - if self.service_account is not None: - val += f", service_account = {self.service_account}" - if self.image_pull_secret is not None: - val += f", image_pull_secret = {self.image_pull_secret}" - if self.image_pull_policy is not None: - val += f", 
image_pull_policy = {self.image_pull_policy}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - if self.annotations is not None: - val = val + f",\n annotations = {str(self.annotations)}" - if self.labels is not None: - val = val + f",\n labels = {str(self.labels)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of worker node spec - """ - dct = { - "groupName": self.group_name, - "computeTemplate": self.compute_template, - "replicas": self.replicas, - "minReplicas": self.min_replicas, - "maxReplicas": self.max_replicas, - "rayStartParams": self.ray_start_params, - } - if self.image is not None: - dct["image"] = self.image - if self.service_account is not None: - dct["service_account"] = self.service_account - if self.image_pull_secret is not None: - dct["imagePullSecret"] = self.image_pull_secret - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["environment"] = self.environment.to_dict() - if self.annotations is not None: - dct["annotations"] = self.annotations - if self.labels is not None: - dct["labels"] = self.labels - return dct - - -""" - Creates new worker node from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def worker_node_spec_decoder(dct: dict[str, Any]) -> WorkerNodeSpec: - """ - Create worker node spec from dictionary - :param dct: dictionary definition of worker node spec - :return: worker node spec - """ - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("environment")) > 0: - environments = environment_variables_decoder(dct.get("environment")) - return WorkerNodeSpec( - group_name=dct.get("groupName"), - compute_template=dct.get("computeTemplate"), - replicas=dct.get("replicas", 0), - min_replicas=dct.get("minReplicas", 0), - max_replicas=dct.get("maxReplicas", 0), - ray_start_params=dct.get("rayStartParams"), - image=dct.get("image"), - volumes=volumes, - service_account=dct.get("service_account", None), - image_pull_secret=dct.get("imagePullSecret", None), - image_pull_policy=dct.get("imagePullPolicy", None), - environment=environments, - annotations=dct.get("annotations", None), - labels=dct.get("labels", None), - ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md deleted file mode 100644 index 4943a0b06..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Workflow Utils for KFP v1 - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* 
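A sketch of building a WorkerNodeSpec against the compute template from the earlier example; the group name, template name and image tag are illustrative.

```python
from kfp_support.api_server_client.params.workernode import WorkerNodeSpec

# A worker pool of 1-5 replicas using a previously created compute template.
# The constructor enforces min_replicas <= replicas <= max_replicas.
workers = WorkerNodeSpec(
    group_name="workers",
    compute_template="gpu-worker-template",
    image="rayproject/ray:2.9.3-py310",   # illustrative image tag
    max_replicas=5,
    replicas=2,
    min_replicas=1,
    image_pull_policy="IfNotPresent",
)

print(workers.to_string())
print(workers.to_dict())
```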
get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## PipelinesUtils - -This class provides some higher level functionality based on the capabilities of the python KFP client, including" -* get_experiment_by_name obtains KFP experiment object based on its name -* get_pipeline_by_name obtains KFP pipeline object based on its name -* start_pipeline start a pipeline represented by pipeline object in experiment represented by experiment object and a -dictionary of parameters. It returns kfp run ID -* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. -* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index dec823e4b..05a850efb 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -12,8 +12,7 @@ import sys -from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs - +from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs def start_ray_cluster( name: str, # name of Ray cluster diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index 85fbf8dde..9bf9dcbad 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -12,8 +12,7 @@ import sys -from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs - +from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs # Cleans and shutdowns the Ray cluster def cleanup_ray_cluster( diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index 8fe53667f..b7c3b5bc0 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ 
b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -10,8 +10,7 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.runtime_utils import KFPUtils, execute_ray_jobs - +from kfp_support.workflow_support.utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": import argparse diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index 586053de4..ddcb02ffd 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -23,7 +23,7 @@ endef sed -i.back "s/data-prep-kit\/$$DOCKER_IMAGE_NAME:.*/data-prep-kit\/$$DOCKER_IMAGE_NAME:$$DOCKER_IMAGE_VERSION\"/" $$PIPELINE_FILE ;\ done < ${REPOROOT}/.make.versions ifeq ($(KFPv2), 1) - @sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION_v2}\"/" ${PIPELINE_FILE} + @sed -i.back "s/kfp-data-processing_v2:.*/kfp-data-processing_v2:${KFP_DOCKER_VERSION_v2}\"/" ${PIPELINE_FILE} else @sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION}\"/" ${PIPELINE_FILE} endif diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index 03610a1bb..db54e0087 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -29,7 +29,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1-kfp-v21" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.1.1-kfp-v21" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
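The comment above notes that the compute-execution-parameters step is defined in place rather than as a shared component. A rough sketch of what such an inline KFP v2 component could look like; the function name, parameters and returned value are hypothetical, not the pipeline's actual logic.

```python
from kfp import dsl


# A lightweight Python component defined next to the pipeline instead of as a
# shared, reusable component (all names and the computation are illustrative).
@dsl.component(base_image="python:3.10")
def compute_exec_params(worker_options: str, actor_options: str) -> str:
    import json

    # Derive the number of actors from the worker options; a real pipeline
    # might compute this from CPU/GPU/memory requirements instead.
    workers = json.loads(worker_options)
    return str(workers.get("replicas", 1) * 2)
```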
@@ -154,7 +154,7 @@ def noop( exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC,image_pull_policy="Always") + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC, image_pull_policy="Always") ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) From 77713d0b84abf6256c54f9f7daa4c276ded01689 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 30 May 2024 17:10:42 +0300 Subject: [PATCH 10/64] revert 3 latest commits --- kfp/kfp_ray_components/Makefile | 8 +- .../kfp_support_lib_v2/README.md | 68 ++ .../kfp_support_lib_v2/pyproject.toml | 47 ++ .../kfp_support/api_server_client/README.md | 4 + .../kfp_support/api_server_client/__init__.py | 1 + .../api_server_client/kuberay_apis.py | 636 ++++++++++++++++++ .../api_server_client/params/__init__.py | 53 ++ .../api_server_client/params/cluster.py | 475 +++++++++++++ .../params/environmentvariables.py | 158 +++++ .../api_server_client/params/headnode.py | 202 ++++++ .../api_server_client/params/jobsubmission.py | 163 +++++ .../api_server_client/params/templates.py | 224 ++++++ .../api_server_client/params/volumes.py | 449 +++++++++++++ .../api_server_client/params/workernode.py | 206 ++++++ .../kfp_support/workflow_support/README.md | 45 ++ .../compile_utils/__init__.py | 3 + .../compile_utils/component.py} | 32 +- .../runtime_utils/__init__.py | 2 + .../runtime_utils}/kfp_utils.py | 0 .../runtime_utils}/remote_jobs_utils.py | 2 +- .../kfp_support/workflow_support_v2/README.md | 36 + .../workflow_support_v2/__init__.py | 0 .../comp_utils/__init__.py | 3 + .../comp_utils/component.py | 54 ++ .../workflow_support_v2/utils/__init__.py | 8 + .../utils/workflow_utils.py | 557 +++++++++++++++ .../src/create_ray_cluster.py | 3 +- .../src/delete_ray_cluster.py | 3 +- kfp/kfp_ray_components/src/execute_ray_job.py | 3 +- .../compile_utils/__init__.py | 3 + .../compile_utils/component.py | 101 +++ .../runtime_utils/__init__.py | 2 + .../runtime_utils/kfp_utils.py | 113 ++++ .../runtime_utils/remote_jobs_utils.py | 527 +++++++++++++++ .../workflow_support/utils/__init__.py | 4 - .../workflow_support/utils/pipeline_utils.py | 173 ----- .../utils/pipelines_tests_utils.py | 75 --- transforms/.make.transforms_workflows | 4 +- transforms/universal/noop/Makefile | 31 +- .../universal/noop/kfp_ray/v2/noop_wf.py | 17 +- 40 files changed, 4167 insertions(+), 328 deletions(-) create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py 
create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py rename kfp/{kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py => kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py} (78%) create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py rename kfp/{kfp_support_lib_v2/src/kfp_support/workflow_support/utils => kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils}/kfp_utils.py (100%) rename kfp/{kfp_support_lib_v2/src/kfp_support/workflow_support/utils => kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils}/remote_jobs_utils.py (99%) create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py create mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py create mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 927a4356b..30ef36f5a 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -44,19 +44,13 @@ image: Dockerfile Dockerfile_v2 requirements.txt .PHONY: reconcile-requirements reconcile-requirements: -ifeq ($(KFPv2), 1) - sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" createRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" deleteRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent.yaml - sed -i.back 
"s/kfp-data-processing_v2*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION_v2}/" executeRayJobComponent_multi_s3.yaml -else @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml + # TODO remove it for KFPv2 sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml -endif .PHONY: load-image load-image: diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/README.md new file mode 100644 index 000000000..86f3f4360 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/README.md @@ -0,0 +1,68 @@ +# KFP support library + +This provides support for implementing KFP pipelines automating transform's execution. +It comprises 2 main modules + +* [api server client](src/kfp_support/api_server_client/README.md) +* [workflow support](src/kfp_support/workflow_support/README.md) + +## Development + +### Requirements +1. python 3.10 or later +2. git command line tools +3. [pre-commit](https://pre-commit.com/) +4. twine (pip install twine) + * but on Mac you may have to include a dir in your PATH, such as `export PATH=$PATH:/Library/Frameworks/Python.framework/Versions/3.10/bin` + +### Git +Simple clone the repo and set up the pre-commit hooks. +```shell +git clone git@github.com:IBM/data-prep-kit.git +cd kfp/kfp_support_lib +pre-commit install +``` +If you don't have pre-commit, you can install from [here](https://pre-commit.com/) + +## Library Artifact Build and Publish + +The process of creating a release for `fm_data_processing_kfp` package involves the following steps: + +cd to the package directory. + +update the version in [requirements.env](../requirements.env) file. + +run `make build` and `make publish`. + +## Testing + +To run the package tests perform the following: + +To begin with, establish a Kind cluster and deploy all required components by executing the makfefile command in the main directory of this repository. As an alternative, you can manually execute the instructions provided in the [README.md](../../kind/README.md) file. + +```bash +make setup +``` + +The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. + +```bash +make build +``` + +lastly, execute the tests: + +```bash +make test +``` + +### Cleanup + +It is advisable to execute the following command prior to running `make test` once more. This will ensure that any +previous test runs resources are removed before starting new tests. + +```bash +kubectl delete workflows -n kubeflow --all +``` + + diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml b/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml new file mode 100644 index 000000000..f995d60d7 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml @@ -0,0 +1,47 @@ +[project] +name = "data_prep_toolkit_kfp_v2" +version = "0.1.1" +requires-python = ">=3.10" +description = "Data Preparation Kit Library. 
KFP v2 support" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Alexey Roytman", email = "roytman@il.ibm.com" }, + { name = "Mohammad Nassar", email = "Mohammad.Nassar@ibm.com" }, + { name = "Revital Eres", email = "eres@il.ibm.com" }, +] +dependencies = [ + "kfp==2.7.0", + "kfp-kubernetes==1.2.0", + "requests", + "data-prep-toolkit==0.1.1", +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", +] + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/kfp_support"] + +[tool.pytest.ini_options] +addopts = "--cov --cov-report term-missing --cov-fail-under 10" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md new file mode 100644 index 000000000..423f743a1 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md @@ -0,0 +1,4 @@ +# KubeRay API server APIs + +This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) +Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py new file mode 100644 index 000000000..60cbbc2f2 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py @@ -0,0 +1 @@ +from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py new file mode 100644 index 000000000..270815e77 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py @@ -0,0 +1,636 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import time + +import requests +from data_processing.utils import get_logger +from kfp_support.api_server_client.params import ( + Cluster, + RayJobInfo, + RayJobRequest, + Template, + cluster_decoder, + clusters_decoder, + template_decoder, + templates_decoder, +) + + +logger = get_logger(__name__) + + +_headers = {"Content-Type": "application/json", "accept": "application/json"} + +CONNECT_TIMEOUT = 50 +READ_TIMEOUT = 50 +TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) + + +class KubeRayAPIs: + """ + This class implements KubeRay APIs based on the API server. + To create a class, the following parameters are required: + base - the URL of the API server (default is set to the standalone API server) + wait interval - the amount of sec to wait between checking for cluster ready + """ + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + token: str = None, + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initializer + :param server_url: API server url - default assuming running it inside the cluster + :param token: token, only used for API server with security enabled + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.server_url = server_url + if token is not None: + _headers["Authorization"] = token + self.wait_interval = wait_interval + self.api_base = "/apis/v1/" + self.http_retries = http_retries + + def list_compute_templates(self) -> tuple[int, str, list[Template]]: + """ + List compute templates across all namespaces of the k8 cluster + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + "compute_templates" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, templates_decoder(response.json()) + else: + logger.warning(f"Failed to list compute templates, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list compute templates, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Template]]: + """ + List compute templates across for a given namespaces of the k8 cluster + :param ns: namespace to query + :return: return tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, templates_decoder(response.json()) + else: + logger.warning( + f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list compute templates for namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) 
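A sketch of using KubeRayAPIs as defined above; the localhost URL stands in for a port-forwarded API server, since the default URL only resolves inside the cluster.

```python
from kfp_support.api_server_client import KubeRayAPIs

# Point the client at the KubeRay API server
apis = KubeRayAPIs(server_url="http://localhost:8888", wait_interval=2)

# Every call returns (http_status, error_message, payload); a 2xx status
# means error_message is None.
status, error, templates = apis.list_compute_templates()
if status // 100 == 2:
    for t in templates:
        print(t.to_string())
else:
    print(f"list failed: {status} {error}")
```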
+ return status, message, None + + def get_compute_template(self, ns: str, name: str) -> tuple[int, str, Template]: + """ + get a compute template + :param ns: namespace + :param name: template name + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + compute templates + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, template_decoder(response.json()) + else: + logger.warning( + f"Failed to get compute template {name} for namespace {ns}, status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get compute template {name} for namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def create_compute_template(self, template: Template) -> tuple[int, str]: + """ + Create a compute template + :param template - definition of a template + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{template.namespace}/compute_templates" + for i in range(self.http_retries): + try: + response = requests.post(url, json=template.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning(f"Failed to create compute template, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to create compute template, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def delete_compute_template(self, ns: str, name: str) -> tuple[int, str]: + """ + delete a compute template + :param ns: namespace + :param name: template name + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + elif response.status_code == 404: + # not found - no need to retry + return response.status_code, response.json()["message"] + else: + logger.warning(f"Failed to delete compute template, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to delete compute template, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def list_clusters(self) -> tuple[int, str, list[Cluster]]: + """ + List clusters across all namespaces of the k8 cluster + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of clusters + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + "clusters" + for i in 
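The compute-template CRUD calls above can be chained as follows; the namespace and template name are invented, and each call returns the HTTP status plus an error message that is None on success.

```python
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params.templates import Template

apis = KubeRayAPIs(server_url="http://localhost:8888")
template = Template(name="default-template", namespace="kuberay", cpu=4, memory=16)

# Create, read back, and delete a compute template
status, error = apis.create_compute_template(template)
assert status // 100 == 2, error

status, error, fetched = apis.get_compute_template(ns="kuberay", name="default-template")
print(fetched.to_string())

status, error = apis.delete_compute_template(ns="kuberay", name="default-template")
```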
range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, clusters_decoder(response.json()) + else: + logger.warning(f"Failed to list cluster, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list cluster, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_clusters_namespace(self, ns: str) -> tuple[int, str, list[Cluster]]: + """ + List clusters across for a given namespaces of the k8 cluster + :param ns: namespace to query + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of clusters + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, clusters_decoder(response.json()) + else: + logger.warning(f"Failed to list clusters in namespace {ns}, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list clusters in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def get_cluster(self, ns: str, name: str) -> tuple[int, str, Cluster]: + """ + get cluster + :param ns: namespace + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + clusters definition + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, cluster_decoder(response.json()) + else: + logger.warning(f"Failed to get cluster {name} in namespace {ns}, status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def create_cluster(self, cluster: Cluster) -> tuple[int, str]: + """ + create cluster + :param cluster: cluster definition + :return: tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{cluster.namespace}/clusters" + for i in range(self.http_retries): + try: + response = requests.post(url, json=cluster.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning(f"Failed to create cluster , status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to create cluster , exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def get_cluster_status(self, ns: str, name: 
str) -> tuple[int, str, str]: + """ + get cluster status + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + cluster status + """ + # Execute HTTP request + status, error, cluster = self.get_cluster(ns=ns, name=name) + # Check execution status + if status // 100 != 2: + return status, error, None + cluster_status = "creating" + if cluster.cluster_status is not None: + cluster_status = cluster.cluster_status + return status, None, cluster_status + + def wait_cluster_ready(self, ns: str, name: str, wait: int = -1) -> tuple[int, str]: + """ + wait for cluster to be ready + :param ns: namespace of the cluster + :param name: name of the cluster + :param wait: wait time (-1 waits forever) + :returns: A tuple containing + http return code + message - only returned if http return code is not equal to 200 + cluster status + """ + current_wait = 0 + while True: + status, error, c_status = self.get_cluster_status(ns=ns, name=name) + # Check execution status + if status // 100 != 2: + return status, error + if c_status == "ready": + return status, None + if current_wait > wait > 0: + return 408, f"Timed out waiting for cluster ready in {current_wait} sec" + time.sleep(self.wait_interval) + current_wait += self.wait_interval + + def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int, str, str]: + """ + get cluster endpoint + :param ns: namespace of the cluster + :param name: name of the cluster + :param wait: wait time (-1 waits forever) for cluster to be ready + :returns: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + endpoint (service for dashboard endpoint) + """ + # Ensure that the cluster is ready + status, error = self.wait_cluster_ready(ns=ns, name=name, wait=wait) + if status // 100 != 2: + return status, error, None + # Get cluster + status, error, cluster = self.get_cluster(ns=ns, name=name) + if status // 100 != 2: + return status, error, None + return status, None, f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" + + def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: + """ + delete cluster + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers) + if response.status_code // 100 == 2: + return response.status_code, None + elif response.status_code == 404: + # not found - no need to retry + return response.status_code, response.json()["message"] + else: + logger.warning(f"Failed to delete cluster , status : {response.status_code}") + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to delete cluster , exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def submit_job(self, ns: str, name: str, job_request: RayJobRequest) -> tuple[int, str, str]: + """ + submit Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param job_request: job submission + :return: a tuple containing + http return code + message - only returned if 
http return code is not equal to 200 + submission id + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" + for i in range(self.http_retries): + try: + response = requests.post(url, json=job_request.to_dict(), headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, response.json()["submissionId"] + else: + logger.warning( + f"Failed to submit job to the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to submit job to the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(5) + return status, message, None + + def get_job_info(self, ns: str, name: str, sid: str) -> tuple[int, str, RayJobInfo]: + """ + get Ray job details + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + RayJobInfo object + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, RayJobInfo(response.json()) + else: + logger.warning( + f"Failed to get job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to get job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]]: + """ + list Ray job details + :param ns: namespace of the cluster + :param name: name of the cluster + :return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + list of RayJobInfo object + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + job_info_array = response.json().get("submissions", None) + if job_info_array is None: + return response.status_code, None, [] + else: + return response.status_code, None, [RayJobInfo(i) for i in job_info_array] + else: + logger.warning( + f"Failed to list jobs from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to list jobs from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(5) + return status, message, [] + + def get_job_log(self, ns: str, name: str, sid: str) -> tuple[int, str, str]: + """ + get Ray job log + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code 
is not equal to 200 + log + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/log/{sid}" + for i in range(self.http_retries): + try: + response = requests.get(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None, response.json().get("log", "") + else: + logger.warning( + f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning( + f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, exception : {e}" + ) + status = 500 + message = str(e) + time.sleep(1) + return status, message, None + + def stop_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: + """ + stop Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.post(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning( + f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message + + def delete_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: + """ + delete Ray job + :param ns: namespace of the cluster + :param name: name of the cluster + :param sid: job submission id + return: a tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + status = 200 + message = None + # Execute HTTP request + url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" + for i in range(self.http_retries): + try: + response = requests.delete(url, headers=_headers, timeout=TIMEOUT) + if response.status_code // 100 == 2: + return response.status_code, None + else: + logger.warning( + f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " + f"status : {response.status_code}" + ) + status = response.status_code + message = response.json()["message"] + except Exception as e: + logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") + status = 500 + message = str(e) + time.sleep(1) + return status, message diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py new file mode 100644 index 000000000..e5a7d70fa --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py @@ -0,0 +1,53 @@ +from kfp_support.api_server_client.params.templates import ( + TolerationOperation, + TolerationEffect, + Toleration, + Template, + toleration_decoder, + template_decoder, + 
templates_decoder, +) +from kfp_support.api_server_client.params.volumes import ( + HostPath, + MountPropagationMode, + AccessMode, + BaseVolume, + HostPathVolume, + PVCVolume, + EphemeralVolume, + EmptyDirVolume, + ConfigMapVolume, + SecretVolume, + volume_decoder, +) +from kfp_support.api_server_client.params.environmentvariables import ( + EnvVarSource, + EnvVarFrom, + EnvironmentVariables, + env_var_from_decoder, + environment_variables_decoder, +) +from kfp_support.api_server_client.params.headnode import ( + ServiceType, + HeadNodeSpec, + DEFAULT_HEAD_START_PARAMS, + head_node_spec_decoder, +) +from kfp_support.api_server_client.params.workernode import ( + WorkerNodeSpec, + DEFAULT_WORKER_START_PARAMS, + worker_node_spec_decoder, +) +from kfp_support.api_server_client.params.cluster import ( + Environment, + AutoscalerOptions, + ClusterSpec, + ClusterEvent, + Cluster, + UpscalingMode, + autoscaling_decoder, + cluster_spec_decoder, + cluster_decoder, + clusters_decoder, +) +from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py new file mode 100644 index 000000000..922a14bef --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py @@ -0,0 +1,475 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + HeadNodeSpec, + WorkerNodeSpec, + environment_variables_decoder, + head_node_spec_decoder, + volume_decoder, + worker_node_spec_decoder, +) + + +class Environment(enum.Enum): + """ + Environment definitions + """ + + DEV = 0 # development + TESTING = 1 # testing + STAGING = 2 # staging + PRODUCTION = 3 # production + + +class UpscalingMode(enum.Enum): + """ + Enumeration of autoscaling mode + """ + + Conservative = ( + "Conservative" # Rate-limited; the number of pending worker pods is at most the size of the Ray cluster + ) + Default = "Default" # no rate limitations + Aggressive = "Aggressive" # same as default + + +class AutoscalerOptions: + """ + AutoscalerOptions is used to define Ray cluster autoscaling. + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create autoscaling options specification: gets the following parameters: + idle_timeout - optional, number of seconds to wait before scaling down a worker pod which is not using Ray + resources. Default 60sec (one minute). 
+ upscaling_mode - required autoscaler upscaling mode + image - optional, allows to override the autoscaler's container image + image_pull_policy - optional, allows to override the autoscaler's container image pull policy + cpus - optional, CPUs requirements for autoscaler - default "500m" + memory - optional, memory requirements for autoscaler - default "512Mi" + environment - optional, environment variables for autoscaler container + volumes - optional, a list of volumes to attach to autoscaler container. + This is needed for enabling TLS for the autoscaler container. + """ + + def __init__( + self, + upscaling_mode: UpscalingMode = UpscalingMode.Default, + idle_tmout: int = None, + image: str = None, + image_pull_policy: str = None, + cpus: str = None, + memory: str = None, + environment: EnvironmentVariables = None, + volumes: list[BaseVolume] = None, + ): + """ + Initialization + :param upscaling_mode: upscale mode + :param idle_tmout: idle timeout + :param image: image + :param image_pull_policy: image pull policy + :param cpus: cpu requirement for autoscaling + :param memory: memory requirement for autoscaling + :param environment: autoscaler environment + :param volumes: volumes for autoscaler + """ + self.upscaling_mode = upscaling_mode + self.idle_tmout = idle_tmout + self.image = image + self.image_pull_policy = image_pull_policy + self.cpus = cpus + self.memory = memory + self.environment = environment + self.volumes = volumes + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of the head node + """ + val = f"upscaling_mode = {self.upscaling_mode}" + if self.idle_tmout is not None: + val += f", idle_timeout = {self.idle_tmout}" + if self.image is not None: + val += f", image = {self.image}" + if self.image_pull_policy is not None: + val += f", image_pull_policy = {self.image_pull_policy}" + if self.cpus is not None: + val += f", cpus = {self.cpus}" + if self.memory is not None: + val += f", memory = {self.memory}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of the head node + """ + dct = {"upscalingMode": self.upscaling_mode.value} + if self.idle_tmout is not None: + dct["idleTimeoutSeconds"] = self.idle_tmout + if self.image is not None: + dct["image"] = self.image + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.cpus is not None: + dct["cpu"] = self.cpus + if self.memory is not None: + dct["memory"] = self.memory + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["envs"] = self.environment.to_dict() + return dct + + +class ClusterSpec: + """ + ClusterSpec is used to define Ray cluster. + It provides APIs to create, stringify, convert to dict and json. 
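+
+    Example (an illustrative sketch only; the compute template name and image tag below
+    are placeholder values, not anything required by this library):
+
+        from kfp_support.api_server_client.params import ClusterSpec, HeadNodeSpec
+
+        # a head-only spec; worker groups and autoscaler options are optional
+        spec = ClusterSpec(
+            head_node=HeadNodeSpec(compute_template="default-template", image="rayproject/ray:2.9.3"),
+        )
+        spec.to_dict()  # -> {"headGroupSpec": {...}}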
+ + Methods: + - Create cluster spec from: gets the following parameters: + head_group_spec - required, specification of the head node + worker_group_spec - optional, list of worker group specs + autoscaler_options - optional, autoscaling options + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__( + self, + head_node: HeadNodeSpec, + worker_groups: list[WorkerNodeSpec] = None, + autoscaling_options: AutoscalerOptions = None, + ): + """ + Initialization + :param head_node - head node definition + :param worker_groups - worker group definition + :param autoscaling_options - autoscaler options + """ + self.head_node = head_node + self.worker_groups = worker_groups + self.autoscaling_options = autoscaling_options + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of cluster spec + """ + val = f"head_group_spec: {self.head_node.to_string()}" + if self.worker_groups is not None: + val += "\nworker groups: " + for w in self.worker_groups: + val += f"\nworker_group_spec = {w.to_string()}]" + if self.autoscaling_options is not None: + val += f"\nautoscaling options = {self.autoscaling_options.to_string()}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: Dictionary representation of cluster spec + """ + dst = {"headGroupSpec": self.head_node.to_dict()} + if self.worker_groups is not None: + dst["workerGroupSpec"] = [w.to_dict() for w in self.worker_groups] + if self.autoscaling_options is not None: + dst["enableInTreeAutoscaling"] = True + dst["autoscalerOptions"] = self.autoscaling_options.to_dict() + return dst + + +class ClusterEvent: + """ + Cluster event is used to define events emitted during cluster creation. + It provides APIs to create and stringify. Its output only data, so we do not need to implement to_dict + + Methods: + - Create event: gets the dictionary with the following parameters: + id - unique Event Id + name - human readable event name + created_at - event creation time + first_timestamp - first time the event occur + last_timestamp - last time the event occur + reason - reason for the transition into the object's current status + message - human-readable description of the status of this operation + type - type of this event (Normal, Warning), new types could be added in the future + count - number of times this event has occurred + """ + + def __init__(self, dst: dict[str, Any]): + """ + Initialization from dictionary + :param dst: dictionary representation of cluster event + """ + self.id = dst.get("id", "") + self.name = dst.get("name", "") + self.created_at = dst.get("created_at", "") + self.first_timestamp = dst.get("first_timestamp", "") + self.last_timestamp = dst.get("last_timestamp", "") + self.reason = dst.get("reason", "") + self.message = dst.get("message", "") + self.type = dst.get("type", "") + self.count = dst.get("count", "0") + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of cluster event + """ + return ( + f"id = {self.id}, name = {self.name}, created_at = {self.created_at}, " + f"first_timestamp = {self.first_timestamp}, last_timestamp = {self.last_timestamp}," + f"reason = {self.reason}, message = {self.message}, type = {self.type}, count = {self.count}" + ) + + +class Cluster: + """ + Cluster is used to define Ray cluster. + It provides APIs to create, stringify, convert to dict and json. 
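+
+    Example (an illustrative sketch only; the name, namespace, user, version and image
+    below are placeholder values, not anything required by this library):
+
+        from kfp_support.api_server_client.params import Cluster, ClusterSpec, HeadNodeSpec
+
+        cluster = Cluster(
+            name="test-cluster",
+            namespace="ray",
+            user="dev",
+            version="2.9.3",
+            cluster_spec=ClusterSpec(
+                head_node=HeadNodeSpec(compute_template="default-template", image="rayproject/ray:2.9.3"),
+            ),
+        )
+        cluster.to_dict()  # dictionary payload posted by the API server client's create_cluster()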
+ + Methods: + - Create env variable from: gets the following parameters: + name - required, unique (per namespace) cluster name + namespace - required, cluster's namespace (should exist) + user - required, user who owns the cluster + version - required, Ray cluster version - typically Ray version + deployment_environment - optional (see Environment) + cluster_spec - required, ray cluster configuration + annotations - optional, annotations, for example, "kubernetes.io/ingress.class" to define Ingress class + cluster_environment - optional, cluster environment variables + created_at - output, cluster creation ts + deleted_at - output, cluster deletion ts + cluster_status - output, cluster status + events - output, cluster events + service_endpoint - output, cluster service endpoints + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__( + self, + name: str, + namespace: str, + user: str, + version: str, + cluster_spec: ClusterSpec, + deployment_environment: Environment = None, + annotations: dict[str, str] = None, + cluster_environment: EnvironmentVariables = None, + created_at: str = None, + deleted_at: str = None, + cluster_status: str = None, + events: list[ClusterEvent] = None, + service_endpoint: dict[str, str] = None, + ): + """ + Initialization + :param name: cluster name + :param namespace: cluster namespace + :param user: user name + :param version: version + :param cluster_spec: cluster spec + :param deployment_environment: cluster deployment environment + :param annotations: cluster annotations + :param cluster_environment: cluster environment + :param created_at: created at + :param deleted_at: deleted at + :param cluster_status: status + :param events: cluster events + :param service_endpoint: service endpoint + """ + self.name = name + self.namespace = namespace + self.user = user + self.version = version + self.cluster_spec = cluster_spec + self.environment = deployment_environment + self.annotations = annotations + self.envs = cluster_environment + self.created_at = created_at + self.deleted_at = deleted_at + self.cluster_status = cluster_status + self.events = events + self.service_endpoint = service_endpoint + + def to_string(self) -> str: + """ + convert to string representation + :return: string representation of cluster + """ + val = ( + f"name: {self.name}, namespace = {self.namespace}, user = {self.user}, version = {self.version} " + f"cluster_spec = {self.cluster_spec.to_string()}" + ) + if self.environment is not None: + val += f"deployment environment = {self.environment.name}" + if self.annotations is not None: + val += f" ,annotations = {str(self.annotations)}" + if self.envs is not None: + val = val + f",cluster environment = {self.envs.to_string()}" + val += "\ncluster output\n" + if self.created_at is not None: + val += f" ,created_at = {self.created_at}" + if self.deleted_at is not None: + val += f" ,deleted_at = {self.deleted_at}" + if self.cluster_status is not None: + val += f" ,cluster status = {self.cluster_status}" + if self.events is not None: + val = val + ",\n cluster events = [" + first = True + for e in self.events: + if first: + first = False + else: + val += ", " + val = val + "{" + e.to_string() + "}" + val = val + "]" + if self.service_endpoint is not None: + val += f" ,service endpoints = {str(self.service_endpoint)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + convert to dictionary + :return: dictionary representation of cluster + """ + # only 
convert input variables
+        dst = {
+            "name": self.name,
+            "namespace": self.namespace,
+            "user": self.user,
+            "version": self.version,
+            "clusterSpec": self.cluster_spec.to_dict(),
+        }
+        if self.environment is not None:
+            dst["environment"] = self.environment.value
+        if self.annotations is not None:
+            dst["annotations"] = self.annotations
+        if self.envs is not None:
+            dst["envs"] = self.envs.to_dict()
+        return dst
+
+
+"""
+    Creates new cluster from dictionary, used for unmarshalling json. Python does not
+    support multiple constructors, so do it this way
+"""
+
+
+def autoscaling_decoder(dct: dict[str, Any]) -> AutoscalerOptions:
+    """
+    Create autoscaling options from its dictionary representation
+    :param dct: dictionary representation of cluster spec
+    :return: autoscaling options
+    """
+    upscaling_mode = UpscalingMode.Default
+    if "upscalingMode" in dct:
+        upscaling_mode = UpscalingMode(dct.get("upscalingMode"))
+    volumes = None
+    if "volumes" in dct:
+        volumes = [volume_decoder(v) for v in dct["volumes"]]
+    environments = None
+    if "envs" in dct and len(dct.get("envs")) > 0:
+        environments = environment_variables_decoder(dct.get("envs"))
+    return AutoscalerOptions(
+        upscaling_mode=upscaling_mode,
+        idle_tmout=dct.get("idleTimeoutSeconds", None),
+        image=dct.get("image", None),
+        image_pull_policy=dct.get("imagePullPolicy", None),
+        cpus=dct.get("cpu", None),
+        memory=dct.get("memory", None),
+        environment=environments,
+        volumes=volumes,
+    )
+
+
+def cluster_spec_decoder(dct: dict[str, Any]) -> ClusterSpec:
+    """
+    Create cluster spec from its dictionary representation
+    :param dct: dictionary representation of cluster spec
+    :return: cluster spec
+    """
+    workers = None
+    autoscaling_options = None
+    if "workerGroupSpec" in dct:
+        workers = [worker_node_spec_decoder(w) for w in dct["workerGroupSpec"]]
+    if "enableInTreeAutoscaling" in dct and dct.get("enableInTreeAutoscaling"):
+        autoscaling_options = autoscaling_decoder(dct.get("autoscalerOptions", {}))
+    return ClusterSpec(
+        head_node=head_node_spec_decoder(dct.get("headGroupSpec")),
+        worker_groups=workers,
+        autoscaling_options=autoscaling_options,
+    )
+
+
+def cluster_decoder(dct: dict[str, Any]) -> Cluster:
+    """
+    Create cluster from its dictionary representation
+    :param dct: dictionary representation of cluster
+    :return: cluster
+    """
+    environment = None
+    if "environment" in dct:
+        environment = Environment(int(dct.get("environment", "0")))
+    events = None
+    if "events" in dct:
+        events = [ClusterEvent(c) for c in dct["events"]]
+    envs = None
+    if "envs" in dct:
+        envs = environment_variables_decoder(dct.get("envs"))
+    return Cluster(
+        name=dct.get("name", ""),
+        namespace=dct.get("namespace", ""),
+        user=dct.get("user", ""),
+        version=dct.get("version", ""),
+        cluster_spec=cluster_spec_decoder(dct.get("clusterSpec")),
+        deployment_environment=environment,
+        annotations=dct.get("annotations"),
+        cluster_environment=envs,
+        created_at=dct.get("createdAt"),
+        deleted_at=dct.get("deletedAt"),
+        cluster_status=dct.get("clusterState"),
+        events=events,
+        service_endpoint=dct.get("serviceEndpoint"),
+    )
+
+
+def clusters_decoder(dct: dict[str, Any]) -> list[Cluster]:
+    """
+    Create list of clusters from its dictionary representation
+    :param dct: dictionary representation of a list of clusters
+    :return: list of clusters
+    """
+    return [cluster_decoder(cluster) for cluster in dct["clusters"]]
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py new file mode 100644 index 000000000..d1056f6f6 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py @@ -0,0 +1,158 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class EnvVarSource(enum.Enum): + """ + Enumeration of environment sources + """ + + CONFIGMAP = 0 # config map + SECRET = 1 # secret + RESOURCE_FIELD = 2 # resource field + FIELD = 3 # field + + +class EnvVarFrom: + """ + EnvVarFrom is used to define an environment variable from one of the sources (EnvarSource). + It provides APIs to create, stringify, convert to dict and json. + + Methods: + - Create env variable from: gets the following parameters: + Source required - source of environment variable + name required name for config map or secret, container name for resource, path for field + key required Key for config map or secret, resource name for resource + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, source: EnvVarSource, name: str, key: str): + """ + Initialize + :param source - source + :param name source name + :param key source key + """ + self.source = source + self.name = name + self.key = key + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of environment from + """ + return f"source = {self.source.name}, name = {self.name}, key = {self.key}" + + def to_dict(self) -> dict[str, Any]: + """ + convert to dictionary + :return: dictionary representation of environment from + """ + return {"source": self.source.value, "name": self.name, "key": self.key} + + +class EnvironmentVariables: + """ + EnvironmentVariables is used to define environment variables. + It provides APIs to create, stringify, convert to dict and json. 
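+
+    Example (an illustrative sketch only; the variable names and the secret name below
+    are placeholder values, not anything required by this library):
+
+        from kfp_support.api_server_client.params import (
+            EnvironmentVariables,
+            EnvVarFrom,
+            EnvVarSource,
+        )
+
+        envs = EnvironmentVariables(
+            key_value={"RAY_LOG_LEVEL": "INFO"},
+            from_ref={"S3_SECRET_KEY": EnvVarFrom(source=EnvVarSource.SECRET, name="s3-secret", key="secret_key")},
+        )
+        envs.to_dict()  # -> {"values": {...}, "valuesFrom": {...}}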
+ + Methods: + - Create env variable from: gets the following parameters: + key_value - optional, dictionary of key/value environment variables + from_ref - optional, dictionary of reference environment variables + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + """ + + def __init__(self, key_value: dict[str, str] = None, from_ref: dict[str, EnvVarFrom] = None): + """ + Initialization + :param key_value: dictionary of key/value pairs for environment variables + :param from_ref: dictionary of key/value pairs for environment from variables + """ + self.key_val = key_value + self.from_ref = from_ref + + def to_string(self) -> str: + """ + convert to string + :return: string representation of environment variables + """ + val = "" + if self.key_val is not None: + val = f"values = {str(self.key_val)}" + if self.from_ref is not None: + if val != "": + val += " , " + val += "valuesFrom = {" + first = True + for k, v in self.from_ref.items(): + if not first: + val += ", " + else: + first = False + val += f"{k} = [{v.to_string()}]" + val += "}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of environment variables + """ + dst = {} + if self.key_val is not None: + dst["values"] = self.key_val + if self.from_ref is not None: + fr = {} + for k, v in self.from_ref.items(): + fr[k] = v.to_dict() + dst["valuesFrom"] = fr + return dst + + +""" + Creates new environment variable from from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: + """ + Create environment from from dictionary + :param dct: dictionary representations of environment from + :return: environment from + """ + return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) + + +def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: + """ + Create environment variables from from dictionary + :param dct: dictionary representations of environment variables + :return: environment variables + """ + keyvalues = None + fr = None + if "values" in dct: + keyvalues = dct.get("values") + if "valuesFrom" in dct: + from_ref = dct.get("valuesFrom") + fr = {} + for k, v in from_ref.items(): + fr[k] = env_var_from_decoder(v) + return EnvironmentVariables(key_value=keyvalues, from_ref=fr) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py new file mode 100644 index 000000000..7a9d4120f --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py @@ -0,0 +1,202 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import enum +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + environment_variables_decoder, + volume_decoder, +) + + +DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} + + +class ServiceType(enum.Enum): + """ + Enumeration of head node service types + """ + + ClusterIP = "ClusterIP" # cluster IP + NodePort = "NodePort" # node port + LoadBalancer = "LoadBalancer" # load balancer + + +class HeadNodeSpec: + """ + HeadNodeSpec is used to define Ray cluster head node configuration. + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create head node specification: gets the following parameters: + compute_template - required, the computeTemplate of head node group + ray_start_params - required, Ray start parameters + image - optional, image used for head node + service_type - optional (ServiceType), service type foe headnode + enable_ingress - optional, allow to enable ingress for dashboard + volumes - optional, a list of volumes to attach to head node + service_account - optional, a service account (has to exist) to run head node + image_pull_secret - optional, secret to pull head node image from registry + environment - optional, environment variables for head pod + annotations - optional, annotations for head node + labels - optional, labels for head node + image_pull_policy - optional, head node pull image policy. Default IfNotPresent + """ + + def __init__( + self, + compute_template: str, + image: str, + ray_start_params: dict[str, str] = DEFAULT_HEAD_START_PARAMS, + service_type: ServiceType = ServiceType.ClusterIP, + enable_ingress: bool = False, + volumes: list[BaseVolume] = None, + service_account: str = None, + image_pull_secret: str = None, + environment: EnvironmentVariables = None, + annotations: dict[str, str] = None, + labels: dict[str, str] = None, + image_pull_policy: str = None, + ): + """ + Initialization + :param compute_template: compute template + :param ray_start_params: ray start parameters + :param image: node image + :param service_type: service type + :param enable_ingress: enable ingress flag + :param volumes: volumes for head node + :param service_account: service account + :param image_pull_secret: image pull secret + :param environment: head node environment + :param annotations: head node annotation + :param labels: labels + :param image_pull_policy: image pull policy + """ + + self.compute_template = compute_template + self.ray_start_params = ray_start_params + self.ray_start_params.update(DEFAULT_HEAD_START_PARAMS) + self.image = image + self.service_type = service_type + self.enable_ingress = enable_ingress + self.volumes = volumes + self.service_account = service_account + self.image_pull_secret = image_pull_secret + self.environment = environment + self.annotations = annotations + self.labels = labels + self.image_pull_policy = image_pull_policy + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of the head node + """ + val = f"compute template = {self.compute_template}, ray start params = {str(self.ray_start_params)}" + if self.image is not None: + val += f", image = {self.image}" + if self.service_type is not None: + val += f", service_type = {self.service_type.name}" + if self.enable_ingress: + val += ", enable_ingress = True" + if self.service_account is not None: + val += f", 
service_account = {self.service_account}" + if self.image_pull_secret is not None: + val += f", image_pull_secret = {self.image_pull_secret}" + if self.image_pull_policy is not None: + val += f", image_pull_policy = {self.image_pull_policy}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + if self.annotations is not None: + val = val + f",\n annotations = {str(self.annotations)}" + if self.labels is not None: + val = val + f",\n labels = {str(self.labels)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of the head node + """ + dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} + if self.image is not None: + dct["image"] = self.image + if self.service_type is not None: + dct["serviceType"] = self.service_type.value + if self.enable_ingress: + dct["enableIngress"] = True + if self.service_account is not None: + dct["service_account"] = self.service_account + if self.image_pull_secret is not None: + dct["image_pull_secret"] = self.image_pull_secret + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["environment"] = self.environment.to_dict() + if self.annotations is not None: + dct["annotations"] = self.annotations + if self.labels is not None: + dct["labels"] = self.labels + return dct + + +""" + Creates new head node from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def head_node_spec_decoder(dct: dict[str, Any]) -> HeadNodeSpec: + """ + Create head node spec from dictionary + :param dct: dictionary representation of head node spec + :return: Head node spec + """ + service_type = None + if "serviceType" in dct: + service_type = ServiceType(dct.get("serviceType", "ClusterIP")) + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("environment")) > 0: + environments = environment_variables_decoder(dct.get("environment")) + return HeadNodeSpec( + compute_template=dct.get("computeTemplate"), + ray_start_params=dct.get("rayStartParams"), + image=dct.get("image"), + service_type=service_type, + enable_ingress=dct.get("enableIngress", False), + volumes=volumes, + service_account=dct.get("service_account", None), + image_pull_secret=dct.get("imagePullSecret", None), + image_pull_policy=dct.get("imagePullPolicy", None), + environment=environments, + annotations=dct.get("annotations", None), + labels=dct.get("labels", None), + ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py new file mode 100644 index 000000000..a0b2bfcb0 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py @@ -0,0 +1,163 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +from typing import Any + + +class RayJobRequest: + """ + RayJobRequest used to define job to be submitted to a Ray cluster + It provides APIs to create, stringify and convert to dict. + + Methods: + - Create RayJobRequest: gets the following parameters: + entrypoint - required, the command to start a job on the cluster + submission_id - optional, submission id for the job submission + runtime_env - optional, yaml string specifying job runtime environment + metadata - optional, dictionary of the submission metadata + num_cpus - optional, number of cpus for job execution + num_gpus - optional, number of gpus for job execution + resources - optional, dictionary of the resources for job execution + """ + + def __init__( + self, + entrypoint: str, + submission_id: str = None, + runtime_env: str = None, + metadata: dict[str, str] = None, + num_cpu: float = -1.0, + num_gpu: float = -1.0, + resources: dict[str, str] = None, + ): + """ + Initialization see https://docs.ray.io/en/latest/cluster/running-applications/job-submission/api.html + :param entrypoint: entrypoint + :param submission_id: submission id + :param runtime_env: runtime environment + :param metadata: submission metadata + :param num_cpu: job number cpus + :param num_gpu: job number gpus + :param resources: job custom resources + """ + self.entrypoint = entrypoint + self.submission_id = submission_id + self.runtime_env = runtime_env + self.metadata = metadata + self.num_cpu = num_cpu + self.num_gpu = num_gpu + self.resources = resources + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of job submission + """ + val = f"entrypoint = {self.entrypoint}" + if self.submission_id is not None: + val += f", submission_id = {self.submission_id}" + if self.num_cpu > 0: + val += f", num_cpu = {self.num_cpu}" + if self.num_gpu > 0: + val += f", num_gpu = {self.num_gpu}" + if self.runtime_env is not None: + val += f", runtime_env = {self.runtime_env}" + if self.metadata is not None: + val += f", metadata = {self.metadata}" + if self.resources is not None: + val += f", resources = {self.resources}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of job submission + """ + dct = {"entrypoint": self.entrypoint} + if self.submission_id is not None: + dct["submissionId"] = self.submission_id + if self.runtime_env is not None: + dct["runtimeEnv"] = self.runtime_env + if self.metadata is not None: + dct["metadata"] = self.metadata + if self.num_cpu > 0: + dct["numCpus"] = self.num_cpu + if self.num_gpu > 0: + dct["numGpus"] = self.num_gpu + if self.resources is not None: + dct["resources"] = self.resources + return dct + + +class RayJobInfo: + """ + RayJobInfo used to define information about the job in a Ray cluster + It provides APIs to create and stringify. 
Its output only data, so we do not need to implement to_dict + + Methods: + - Create RayJobRequest: gets the following parameters: + entrypoint - the command to start a job on the cluster + job_id - job execution id + submission_id - submission id for the job submission + runtime_env - job runtime environment + status - job execution status + message - status message + start_time - job start time + end-time - job ind time + error_type - type of error + metadata - optional, dictionary of the submission metadata + """ + + def __init__(self, dct: dict[str, Any]): + """ + Initialize from dictionary + :param dct: dictionary representation of Ray job info + """ + self.entrypoint = dct.get("entrypoint", "") + self.job_id = dct.get("jobId", "") + self.submission_id = dct.get("submissionId", "") + self.status = dct.get("status", "") + self.message = dct.get("message", None) + self.start_time = int(dct.get("startTime", "0")) + self.end_time = int(dct.get("endTime", "0")) + self.error_type = dct.get("ErrorType", None) + self.metadata = dct.get("Metadata", None) + self.runtime_env = dct.get("runtimeEnv", None) + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of Ray job info + """ + val = ( + f"entrypoint = {self.entrypoint}, job id {self.job_id}, submission id = {self.submission_id}," + f" status = {self.status}" + ) + if self.message is not None: + val += f" message = {self.message}" + if self.start_time > 0: + val += ( + f" start time = " + f"{datetime.datetime.fromtimestamp(self.start_time /1.e3).strftime('%Y-%m-%d %H:%M:%S')}" + ) + if self.end_time > 0: + val += ( + f" end time = " f"{datetime.datetime.fromtimestamp(self.end_time / 1e3).strftime('%Y-%m-%d %H:%M:%S')}" + ) + if self.error_type is not None: + val += f" error type = {self.error_type}" + if self.runtime_env is not None: + val += f" runtime env = {str(self.runtime_env)}" + if self.metadata is not None: + val += f" metadata = {str(self.metadata)}" + return val diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py new file mode 100644 index 000000000..0ef4c1583 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py @@ -0,0 +1,224 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class TolerationOperation(enum.Enum): + """ + Toleration operation types + """ + + Exists = "Exists" # exists + Equal = "Equal" # equal + + +class TolerationEffect(enum.Enum): + """ + Toleration effect + """ + + NoSchedule = "NoSchedule" # not schedule + PreferNoSchedule = "PreferNoSchedule" # prefer not schedule + NoExecute = "NoExecute" # not execute + + +class Toleration: + """ + Toleration is used by compute template to pick specific nodes for placing pods. 
+    It provides APIs to create, stringify and convert to dict.
+
+    Methods:
+    - Create toleration: gets the following parameters:
+        key - required, key created by the node's taint
+        operator - required, operator to apply; supported operators are "Exists" and "Equal"
+        effect - required, toleration effect; supported effects are "NoSchedule", "PreferNoSchedule", "NoExecute"
+        value - optional, value
+    - to_string() -> str: convert toleration to string for printing
+    - to_dict() -> dict[str, Any] convert to dict
+    """
+
+    def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None):
+        """
+        Initialization
+        :param key: key
+        :param operator: operator
+        :param effect: effect
+        :param value: value
+        """
+        self.key = key
+        self.operator = operator
+        self.value = value
+        self.effect = effect
+
+    def to_string(self) -> str:
+        """
+        Convert to string
+        :return: string representation of toleration
+        """
+        val = f"key = {self.key}, operator = {self.operator.name}, effect = {self.effect.name}"
+        if self.value is None:
+            return val
+        else:
+            return val + f", value = {self.value}"
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Convert to dictionary
+        :return: dictionary representation of toleration
+        """
+        dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value}
+        if self.value is not None:
+            dct["value"] = self.value
+        return dct
+
+
+# Here the default gpu-accelerator is "nvidia.com/gpu", that is used for generating limits.
+# If it is specified, it has to be in a format that is understood by kubernetes as a valid GPU resource name.
+# The following devices are currently supported by kubernetes:
+# AMD - gpu accelerator amd.com/gpu
+# Intel - gpu accelerator gpu.intel.com/i915
+# NVIDIA - gpu accelerator nvidia.com/gpu
+
+
+class Template:
+    """
+    Template is used to define specific nodes configuration.
+    It provides APIs to create, stringify and convert to dict.
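+
+    Example (an illustrative sketch only; the template name, namespace and toleration
+    values below are placeholders, not anything required by this library):
+
+        from kfp_support.api_server_client.params import (
+            Template,
+            Toleration,
+            TolerationEffect,
+            TolerationOperation,
+        )
+
+        template = Template(
+            name="default-template",
+            namespace="ray",
+            cpu=2,
+            memory=8,
+            tolerations=[
+                Toleration(
+                    key="ray.io/node-type",
+                    operator=TolerationOperation.Equal,
+                    effect=TolerationEffect.NoSchedule,
+                    value="worker",
+                )
+            ],
+        )
+        template.to_dict()  # dictionary payload posted by the API server client's create_compute_template()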
+ + Methods: + - Create templates: gets the following parameters: + name - required, template name + namespace - required, template namespace + cpus - required, template number of cpus + memory - required, template memory (GB) + gpus - optional, number of GPUs, default 0 + gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed + tolerations - optional, tolerations for pod placing, default none + - to_string() -> str: convert toleration to string for printing + - to_dict() -> dict[str, Any] convert to dict + - to_json() -> str convert to json string + """ + + def __init__( + self, + name: str, + namespace: str, + cpu: int, + memory: int, + gpu: int = 0, + gpu_accelerator: str = None, + tolerations: list[Toleration] = None, + ): + """ + Initialization + :param name: name + :param namespace: namespace + :param cpu: cpu + :param memory: memory + :param gpu: gpu + :param gpu_accelerator: accelerator type + :param tolerations: tolerations + """ + self.name = name + self.namespace = namespace + self.cpu = cpu + self.memory = memory + self.gpu = gpu + self.gpu_accelerator = gpu_accelerator + self.tolerations = tolerations + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of template + """ + val = f"name = {self.name}, namespace = {self.namespace}, cpu = {self.cpu}, memory = {self.memory}" + if self.gpu > 0: + val = val + f", gpu {self.gpu}" + if self.gpu_accelerator is not None: + val = val + f", gpu accelerator {self.gpu_accelerator}" + if self.tolerations is None: + return val + val = val + ", tolerations [" + first = True + for tol in self.tolerations: + if first: + first = False + val = val + "{" + tol.to_string() + "}" + else: + val = val + ", {" + tol.to_string() + "}" + return val + "]" + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of template + """ + dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} + if self.gpu > 0: + dct["gpu"] = self.gpu + if self.gpu_accelerator is not None: + dct["gpu accelerator"] = self.gpu_accelerator + if self.tolerations is not None: + dct["tolerations"] = [tl.to_dict() for tl in self.tolerations] + return dct + + +""" + Creates new toleration from dictionary, used for unmarshalling json. 
Python does not + support multiple constructors, so do it this way +""" + + +def toleration_decoder(dct: dict[str, Any]) -> Toleration: + """ + Create toleration from dictionary + :param dct: dictionary representation of toleration + :return: toleration + """ + return Toleration( + key=dct.get("key"), + operator=TolerationOperation(dct.get("operator", "Exists")), + effect=TolerationEffect(dct.get("effect", "NoSchedule")), + value=dct.get("value"), + ) + + +def template_decoder(dct: dict[str, Any]) -> Template: + """ + Create template from dictionary + :param dct: dictionary representation of template + :return: template + """ + tolerations = None + if "tolerations" in dct: + tolerations = [toleration_decoder(d) for d in dct["tolerations"]] + return Template( + name=dct.get("name"), + namespace=dct.get("namespace"), + cpu=int(dct.get("cpu", "0")), + memory=int(dct.get("memory", "0")), + gpu=int(dct.get("gpu", "0")), + gpu_accelerator=dct.get("gpu_accelerator"), + tolerations=tolerations, + ) + + +def templates_decoder(dct: dict[str, Any]) -> list[Template]: + """ + Create list of template from dictionary + :param dct: dictionary representation of list of template + :return: list of template + """ + return [template_decoder(tmp) for tmp in dct["computeTemplates"]] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py new file mode 100644 index 000000000..fee0e1ea4 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py @@ -0,0 +1,449 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +from typing import Any + + +class HostPath(enum.Enum): + """ + Host path enumeration + """ + + DIRECTORY = 0 # directory + FILE = 1 # files + + +class MountPropagationMode(enum.Enum): + """ + Mount propagation enumeration + """ + + NONE = 0 # None + HOSTTOCONTAINER = 1 # host to container + BIDIRECTIONAL = 2 # bi directional + + +class AccessMode(enum.Enum): + """ + Access mode enumeration + """ + + RWO = 0 # read write once + ROX = 1 # read only many + RWX = 2 # read write many + + +class BaseVolume: + """ + KubeRay currently support several types of volumes, including hostPat, PVC, + ephemeral volumes, config maps, secrets and empty dir. All of them use slightly + different parameters. Base Volume is a base class for all different volume types. + """ + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of base volume + """ + raise Exception(f"Base volume cannot be used directly. Pls use one of the derived classes") + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of base volume + """ + raise Exception(f"Base volume cannot be used directly. 
Pls use one of the derived classes") + + +class HostPathVolume(BaseVolume): + """ + This class implements HostPath volume. In addition to name and mount path it requires host + path volume specific parameters: + source - data location on host + hostPathType - host path type: directory (0) or file (1) + mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) + + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + host_path_type: HostPath = None, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param host_path_type: host path type + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.host_path_type = host_path_type + self.volume_type = 1 + self.mount_propagation = mount_propagation + + def to_string(self) -> str: + """ + Convert to string + :return: HostPathVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = hostPath" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + if self.host_path_type is not None: + val += f", host path type = {self.host_path_type.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: HostPathVolume dictionary representation + """ + dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.mount_propagation is not None: + dst["mountPropagationMode"] = self.mount_propagation.value + if self.host_path_type is not None: + dst["hostPathType"] = self.host_path_type.value + return dst + + +class PVCVolume(BaseVolume): + """ + This class implements PVC volume. In addition to name and mount path it requires + PVC volume specific parameters: + source - PVC claim name + read_only - read only flag + mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + read_only: bool = False, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param read_only: read only + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.volume_type = 0 + self.mount_propagation = mount_propagation + self.readonly = read_only + + def to_string(self) -> str: + """ + Convert to string + :return: PVCVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = PVC" + if self.readonly: + val += ", read only = True" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: PVCVolume dictionary representation + """ + dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.readonly: + dst["readOnly"] = True + if self.mount_propagation is not None: + dst["mountPropagationMode"] = self.mount_propagation.value + return dst + + +class EphemeralVolume(BaseVolume): + """ + This class implements Ephemeral volume. 
In addition to name and mount path it requires + Ephemeral volume specific parameters: + storage - disk size (valid k8 value, for example 5Gi) + storageClass - storage class - optional, if not specified, use default + accessMode - access mode RWO - optional ReadWriteOnce (0), ReadOnlyMAny (1), ReadWriteMany (2) + mountPropagationMode - optional mount propagation: None (0), host to container (1) or bidirectional (2) + """ + + def __init__( + self, + name: str, + mount_path: str, + storage: str, + storage_class: str = None, + access_mode: AccessMode = None, + mount_propagation: MountPropagationMode = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param storage: storage + :param storage_class: storage class + :param access_mode: access mode + :param mount_propagation: mount propagation + """ + self.name = name + self.mount_path = mount_path + self.storage = storage + self.volume_type = 2 + self.mount_propagation = mount_propagation + self.storage_class = storage_class + self.access_mode = access_mode + + def to_string(self) -> str: + """ + Convert to string + :return: EphemeralVolume string representation + """ + val = ( + f"name = {self.name}, mount_path = {self.mount_path}, storage = {self.storage} " f"volume type = ephemeral" + ) + if self.storage_class is not None: + val += f", storage class = {self.storage_class}" + if self.access_mode is not None: + val += f", access mode = {self.access_mode.name}" + if self.mount_propagation is not None: + val += f", mount propagation = {self.mount_propagation.name}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: EphemeralVolume dictionary representation + """ + dct = { + "name": self.name, + "mountPath": self.mount_path, + "storage": self.storage, + "volumeType": self.volume_type, + } + if self.storage_class is not None: + dct["storageClassName"] = self.storage_class + if self.access_mode is not None: + dct["accessMode"] = self.access_mode.value + if self.mount_propagation is not None: + dct["mountPropagationMode"] = self.mount_propagation.value + return dct + + +class EmptyDirVolume(BaseVolume): + """ + This class implements EmptyDir volume. In addition to name and mount path it requires + Empty Dir specific parameters: + storage - optional max storage size (valid k8 value, for example 5Gi) + """ + + def __init__(self, name: str, mount_path: str, storage: str = None): + """ + Initialization + :param name: name + :param mount_path: mount_path + :param storage: storage + """ + self.name = name + self.mount_path = mount_path + self.storage = storage + self.volume_type = 5 + + def to_string(self) -> str: + """ + Convert to string + :return: EmptyDirVolume string representation + """ + val = f"name = {self.name}, mount_path = {self.mount_path}, volume type = emptyDir" + if self.storage is not None: + val += f", storage = {self.storage}" + return val + + def to_dict(self) -> dict[str, Any]: + dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} + if self.storage is not None: + dct["storage"] = self.storage + return dct + + +class ConfigMapVolume(BaseVolume): + """ + This class implements ConfigMap volume. 
In addition to name and mount path it requires + configMap volume specific parameters: + source - required, config map name + items - optional, key/path items (optional) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + items: dict[str, str] = None, + ): + """ + Initialization + :param name: name + :param mount_path: mount path + :param source: source + :param items: items + """ + self.name = name + self.mount_path = mount_path + self.source = source + self.items = items + self.volume_type = 3 + + def to_string(self) -> str: + """ + Convert to string + :return: ConfigMapVolume string representation + """ + val = ( + f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = configmap" + ) + if self.items is not None: + val = val + f", items = {str(self.items)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: ConfigMapVolume dictionary representation + """ + dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.items is not None: + dct["items"] = self.items + return dct + + +class SecretVolume(BaseVolume): + """ + This class implements Secret volume. In addition to name and mount path it requires + Secret volume specific parameters: + source - required, secret name + items - optional, key/path items (optional) + """ + + def __init__( + self, + name: str, + mount_path: str, + source: str, + items: dict[str, str] = None, + ): + self.name = name + self.mount_path = mount_path + self.source = source + self.items = items + self.volume_type = 4 + + def to_string(self) -> str: + val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = secret" + if self.items is not None: + val = val + f", items = {str(self.items)}" + return val + + def to_dict(self) -> dict[str, Any]: + dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} + if self.items is not None: + dct["items"] = self.items + return dct + + +""" + Creates new Volume from dictionary, used for unmarshalling json. 
Python does not + support multiple constructors, so do it this way +""" + + +def volume_decoder(dst: dict[str, Any]) -> BaseVolume: + def _get_mount_propagation() -> MountPropagationMode: + if "mountPropagationMode" in dst: + return MountPropagationMode(int(dst.get("mountPropagationMode", "0"))) + return None + + def _get_host_path() -> HostPath: + if "hostPathType" in dst: + return HostPath(int(dst.get("hostPathType", "0"))) + return None + + def _get_access_mode() -> AccessMode: + if "accessMode" in dst: + return AccessMode(int(dst.get("accessMode", "0"))) + return None + + match dst["volumeType"]: + case 0: + # PVC + return PVCVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + read_only=dst.get("readOnly", False), + mount_propagation=_get_mount_propagation(), + ) + case 1: + # host path + return HostPathVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + host_path_type=_get_host_path(), + mount_propagation=_get_mount_propagation(), + ) + case 2: + # Ephemeral volume + return EphemeralVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + storage=dst.get("storage", ""), + storage_class=dst.get("storageClassName"), + access_mode=_get_access_mode(), + mount_propagation=_get_mount_propagation(), + ) + case 3: + # ConfigMap Volume + return ConfigMapVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + items=dst.get("items"), + ) + case 4: + # Secret Volume + return SecretVolume( + name=dst.get("name", ""), + mount_path=dst.get("mountPath", ""), + source=dst.get("source", ""), + items=dst.get("items"), + ) + case 5: + # Empty dir volume + return EmptyDirVolume( + name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), storage=dst.get("storage") + ) + case _: + raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py new file mode 100644 index 000000000..ddcf193cc --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py @@ -0,0 +1,206 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +from kfp_support.api_server_client.params import ( + BaseVolume, + EnvironmentVariables, + environment_variables_decoder, + volume_decoder, +) + + +DEFAULT_WORKER_START_PARAMS = {"node-ip-address": "$MY_POD_IP"} + + +class WorkerNodeSpec: + """ + WorkerNodeSpec is used to define Ray cluster worker node pool configuration. + It provides APIs to create, stringify and convert to dict. 
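+    As an illustration only (the group name, template name and sizes below are assumed
+    example values, not taken from a real deployment), a minimal worker pool spec could
+    look like:
+        WorkerNodeSpec(group_name="workers", compute_template="workers-template",
+                       image="rayproject/ray:2.9.3-py310", max_replicas=3)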
+ + Methods: + - Create worker node pool specification: gets the following parameters: + group_name - required, group name of the worker group + compute_template - required, the computeTemplate of worker node group + replicas - required, desired replicas of the worker group + min_replicas - required Min replicas of the worker group, can't be greater than max_replicas + max_replicas - required, max replicas of the worker group + ray_start_params - required, Ray start parameters + image - optional, image used for worker node + volumes - optional, a list of volumes to attach to worker node + service_account - optional, a service account (has to exist) to run worker node + image_pull_secret - optional, secret to pull worker node image from registry + environment - optional, environment variables for worker pod + annotations - optional, annotations for worker node + labels - optional, labels for worker node + image_pull_policy - optional, worker node pull image policy. Default IfNotPresent + """ + + def __init__( + self, + group_name: str, + compute_template: str, + image: str, + max_replicas: int, + replicas: int = 1, + min_replicas: int = 0, + ray_start_params: dict[str, str] = DEFAULT_WORKER_START_PARAMS, + volumes: list[BaseVolume] = None, + service_account: str = None, + image_pull_secret: str = None, + environment: EnvironmentVariables = None, + annotations: dict[str, str] = None, + labels: dict[str, str] = None, + image_pull_policy: str = None, + ): + """ + Initialization + :param group_name: name + :param compute_template: compute template + :param replicas: number of replicas + :param min_replicas: min number of replicas + :param max_replicas: max number of replicas + :param ray_start_params: ray start parameters + :param image: image name + :param volumes: volumes + :param service_account: service account + :param image_pull_secret: image pull secret + :param environment: environment + :param annotations: annotations + :param labels: labels + :param image_pull_policy: image pull policy + """ + # Validate replicas + if min_replicas > replicas: + raise RuntimeError(f"min_replicas {min_replicas} is can't be greater then replicas {replicas} ") + if replicas > max_replicas: + raise RuntimeError(f"replicas {replicas} is can't be greater then max_replicas {max_replicas} ") + + self.group_name = group_name + self.compute_template = compute_template + self.replicas = replicas + self.min_replicas = min_replicas + self.max_replicas = max_replicas + self.ray_start_params = ray_start_params + self.ray_start_params.update(DEFAULT_WORKER_START_PARAMS) + self.image = image + self.volumes = volumes + self.service_account = service_account + self.image_pull_secret = image_pull_secret + self.environment = environment + self.annotations = annotations + self.labels = labels + self.image_pull_policy = image_pull_policy + + def to_string(self) -> str: + """ + Convert to string + :return: string representation of worker node spec + """ + val = ( + f"group_name = {self.group_name}, compute template = {self.compute_template}, " + f"replicas = {self.replicas}, min_replicas = {self.min_replicas}, " + f"max_replicas = {self.max_replicas}, ray start params = {str(self.ray_start_params)}" + ) + if self.image is not None: + val += f", image = {self.image}" + if self.service_account is not None: + val += f", service_account = {self.service_account}" + if self.image_pull_secret is not None: + val += f", image_pull_secret = {self.image_pull_secret}" + if self.image_pull_policy is not None: + val += f", 
image_pull_policy = {self.image_pull_policy}" + if self.volumes is not None: + val = val + ",\n volumes = [" + first = True + for v in self.volumes: + if first: + first = False + else: + val += ", " + val = val + "{" + v.to_string() + "}" + val = val + "]" + if self.environment is not None: + val = val + f",\n environment = {self.environment.to_string()}" + if self.annotations is not None: + val = val + f",\n annotations = {str(self.annotations)}" + if self.labels is not None: + val = val + f",\n labels = {str(self.labels)}" + return val + + def to_dict(self) -> dict[str, Any]: + """ + Convert to dictionary + :return: dictionary representation of worker node spec + """ + dct = { + "groupName": self.group_name, + "computeTemplate": self.compute_template, + "replicas": self.replicas, + "minReplicas": self.min_replicas, + "maxReplicas": self.max_replicas, + "rayStartParams": self.ray_start_params, + } + if self.image is not None: + dct["image"] = self.image + if self.service_account is not None: + dct["service_account"] = self.service_account + if self.image_pull_secret is not None: + dct["imagePullSecret"] = self.image_pull_secret + if self.image_pull_policy is not None: + dct["imagePullPolicy"] = self.image_pull_policy + if self.volumes is not None: + dct["volumes"] = [v.to_dict() for v in self.volumes] + if self.environment is not None: + dct["environment"] = self.environment.to_dict() + if self.annotations is not None: + dct["annotations"] = self.annotations + if self.labels is not None: + dct["labels"] = self.labels + return dct + + +""" + Creates new worker node from dictionary, used for unmarshalling json. Python does not + support multiple constructors, so do it this way +""" + + +def worker_node_spec_decoder(dct: dict[str, Any]) -> WorkerNodeSpec: + """ + Create worker node spec from dictionary + :param dct: dictionary definition of worker node spec + :return: worker node spec + """ + volumes = None + if "volumes" in dct: + volumes = [volume_decoder(v) for v in dct["volumes"]] + environments = None + if "environment" in dct and len(dct.get("environment")) > 0: + environments = environment_variables_decoder(dct.get("environment")) + return WorkerNodeSpec( + group_name=dct.get("groupName"), + compute_template=dct.get("computeTemplate"), + replicas=dct.get("replicas", 0), + min_replicas=dct.get("minReplicas", 0), + max_replicas=dct.get("maxReplicas", 0), + ray_start_params=dct.get("rayStartParams"), + image=dct.get("image"), + volumes=volumes, + service_account=dct.get("service_account", None), + image_pull_secret=dct.get("imagePullSecret", None), + image_pull_policy=dct.get("imagePullPolicy", None), + environment=environments, + annotations=dct.get("annotations", None), + labels=dct.get("labels", None), + ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md new file mode 100644 index 000000000..4943a0b06 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md @@ -0,0 +1,45 @@ +# Workflow Utils for KFP v1 + +This library provides 3 main classes: +* KFPUtils - helper utilities for KFP implementations +* PipelinesUtils - helper class for pipeline management based on KFP client +* RayRemoteJobs - class supporting Ray remote jobs + +## KFPUtils + +This class contains a collection of functions useful for KFP pipelines implementation, which include: +* credentials - get S3 credentials from the environment +* 
get_namespace - get the name of the kubernetes namespace we are running in
+* runtime_name - generates a unique runtime name
+* dict_to_req - convert a dictionary of request parameters into a command-line string for the transform launcher
+* load_from_json - convert a json string to a dictionary and exit with an error if the conversion fails
+
+## PipelinesUtils
+
+This class provides some higher-level functionality based on the capabilities of the Python KFP client, including:
+* get_experiment_by_name - obtains a KFP experiment object based on its name
+* get_pipeline_by_name - obtains a KFP pipeline object based on its name
+* start_pipeline - starts a pipeline represented by a pipeline object in an experiment represented by an experiment object and a
+dictionary of parameters. It returns the KFP run ID
+* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID
+
+## RayRemoteJobs
+
+At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available,
+including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk),
+[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and
+[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few.
+We use the KubeRay API server APIs here, but in order to simplify a possible transition to other APIs, this class
+implements 4 high-level methods that hide the specifics of the particular API. These methods are:
+* create_ray_cluster - creates a Ray cluster
+* delete_ray_cluster - deletes a Ray cluster
+* submit_job - submits a Ray job to the cluster
+* follow_execution - watches job execution to completion, periodically printing out the job log
+These basic methods can be used as a foundation of any KFP pipeline implementation.
+
+## ComponentUtils
+
+This class provides some methods to simplify building pipelines:
+* add_settings_to_component - adds settings to a component, including timeout, image_pull_policy and cache strategy
+* set_cos_env_vars_to_component - sets environment variables to support S3
+* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements)
\ No newline at end of file
diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py
new file mode 100644
index 000000000..bbe1476fb
--- /dev/null
+++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py
@@ -0,0 +1,3 @@
+from kfp_support.workflow_support.compile_utils.component import (
+    ComponentUtils
+)
diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py
similarity index 78%
rename from kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py
rename to kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py
index 71583b8f2..1f66bf59f 100644
--- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/components_utils.py
+++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py
@@ -1,37 +1,15 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import os import kfp.dsl as dsl from kfp import kubernetes from typing import Dict RUN_NAME = "KFP_RUN_NAME" -from data_processing.utils import get_logger -from kubernetes import client as k8s_client - - -logger = get_logger(__name__) - -ONE_HOUR_SEC = 60 * 60 -ONE_DAY_SEC = ONE_HOUR_SEC * 24 -ONE_WEEK_SEC = ONE_DAY_SEC * 7 - class ComponentUtils: """ Class containing methods supporting building pipelines """ + @staticmethod def add_settings_to_component( task: dsl.PipelineTask, timeout: int, @@ -77,8 +55,8 @@ def set_s3_env_vars_to_component( @staticmethod def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor ) -> str: """ This is the most simplistic transform execution parameters computation @@ -89,7 +67,7 @@ def default_compute_execution_params( import sys from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.utils import KFPUtils + from kfp_support.workflow_support.runtime_utils import KFPUtils logger = get_logger(__name__) @@ -120,4 +98,4 @@ def default_compute_execution_params( ) sys.exit(1) - return str(n_actors) + return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py new file mode 100644 index 000000000..d2301bd0a --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py @@ -0,0 +1,2 @@ +from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils +from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/kfp_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py similarity index 100% rename from kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/kfp_utils.py rename to kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/remote_jobs_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py similarity index 99% rename from kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/remote_jobs_utils.py rename to kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py index 40b26c7a1..39d4d9e64 100644 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/remote_jobs_utils.py +++ 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py @@ -30,7 +30,7 @@ environment_variables_decoder, volume_decoder, ) -from kfp_support.workflow_support.utils import KFPUtils +from kfp_support.workflow_support.runtime_utils import KFPUtils from ray.job_submission import JobStatus diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md new file mode 100644 index 000000000..472c39136 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md @@ -0,0 +1,36 @@ +# Workflow Utils for KFPv2 + +This library provides 3 main classes: +* KFPUtils - helper utilities for KFP implementations +* PipelinesUtils - helper class for pipeline management based on KFP client +* RayRemoteJobs - class supporting Ray remote jobs + +## KFPUtils + +This class contains a collection of functions useful for KFP pipelines implementation, which include: +* credentials - get S3 credentials from the environment +* get_namespace - get the name of the kubernetes namespace we are running in +* runtime_name - generates unique runtime name +* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string +* load_from_json - convert json string to dictionary and exit with error if conversion fails + +## RayRemoteJobs + +At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, +including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) +[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and +[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. +We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class +implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: +* create_ray_cluster - creates Ray cluster. +* delete_ray_cluster - deletes Ray cluster. 
+* submit_job - submits Ray job to the cluster +* follow_execution - watching job execution to completion, periodically printing out the job log +These basic methods can be used as a foundation of any KFP pipeline implementation + +## ComponentUtils + +This class provides some methods to simplify building pipelines: +* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy +* set_cos_env_vars_to_component - sets environment variables to support S3 +* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py new file mode 100644 index 000000000..9297ede66 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py @@ -0,0 +1,3 @@ +from kfp_support.workflow_support.components_utils.component import ( + CompileComponentUtils +) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py new file mode 100644 index 000000000..adaa971c1 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py @@ -0,0 +1,54 @@ +import kfp.dsl as dsl +from kfp import kubernetes +from typing import Dict + +RUN_NAME = "KFP_RUN_NAME" + +class CompileComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + @staticmethod + def add_settings_to_component( + task: dsl.PipelineTask, + timeout: int, + image_pull_policy: str = "IfNotPresent", + cache_strategy: bool = False, + ) -> None: + """ + Add settings to kfp task + :param task: kfp task + :param timeout: timeout to set to the component in seconds + :param image_pull_policy: pull policy to set to the component + :param cache_strategy: cache strategy + """ + + kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # Set cashing + task.set_caching_options(enable_caching=cache_strategy) + # image pull policy + kubernetes.set_image_pull_policy(task, image_pull_policy) + # Set the timeout for the task to one day (in seconds) + kubernetes.set_timeout(task, seconds=timeout) + + @staticmethod + def set_s3_env_vars_to_component( + task: dsl.PipelineTask, + secret: str = '', + env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, + prefix: str = None, + ) -> None: + """ + Set S3 env variables to KFP component + :param task: kfp task + :param secret: secret name with the S3 credentials + :param env2key: dict with mapping each env variable to a key in the secret + :param prefix: prefix to add to env name + """ + + if prefix is not None: + for env_name, _ in env2key.items(): + env2key[prefix + "_" + env_name] = env2key.pop(env_name) + kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git 
a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py new file mode 100644 index 000000000..3a6ab1263 --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py @@ -0,0 +1,8 @@ +from kfp_support.workflow_support.runtime_utils.workflow_utils import ( + KFPUtils, + RayRemoteJobs, + ComponentUtils, + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, +) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py new file mode 100644 index 000000000..7328c740d --- /dev/null +++ b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py @@ -0,0 +1,557 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +import json +import os +import re +import sys +import time +from typing import Any, Optional + +from data_processing.data_access import DataAccess +from data_processing.utils import get_logger +import kfp_server_api +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from ray.job_submission import JobStatus + +logger = get_logger(__name__) + +ONE_HOUR_SEC = 60 * 60 +ONE_DAY_SEC = ONE_HOUR_SEC * 24 +ONE_WEEK_SEC = ONE_DAY_SEC * 7 + +class KFPUtils: + """ + Helper utilities for KFP implementations + """ + + @staticmethod + def credentials( + access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + ) -> tuple[str, str, str]: + """ + Get credentials from the environment + :param access_key: environment variable for access key + :param secret_key: environment variable for secret key + :param endpoint: environment variable for S3 endpoint + :return: + """ + s3_key = os.getenv(access_key, None) + s3_secret = os.getenv(secret_key, None) + s3_endpoint = os.getenv(endpoint, None) + if s3_key is None or s3_secret is None or s3_endpoint is None: + logger.warning("Failed to load s3 credentials") + return s3_key, s3_secret, s3_endpoint + + @staticmethod + def get_namespace() -> str: + """ + Get k8 namespace that we are running it + :return: + """ + ns = "" + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + except Exception as e: + logger.warning( + f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" + ) + else: + with file: + ns = file.read() + return ns + + @staticmethod + def runtime_name(ray_name: str = "", run_id: str = "") -> str: + 
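+        # Illustrative example (assumed values): runtime_name("my_transform", "12345678")
+        # lower-cases the name, replaces "_" with "-", drops any other special characters
+        # and truncates both parts, returning "my-transf-12345".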
""" + Get unique runtime name + :param ray_name: + :param run_id: + :return: runtime name + """ + # K8s objects cannot contain special characters, except '_', All characters should be in lower case. + if ray_name != "": + ray_name = ray_name.replace("_", "-").lower() + pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. + ray_name = re.sub(pattern, "", ray_name) + else: + ray_name = "a" + # the return value plus namespace name will be the name of the Ray Route, + # which length is restricted to 64 characters, + # therefore we restrict the return name by 15 character. + if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if isinstance(value, str): + res += f'--{key}="{value}" ' + else: + res += f"--{key}={value} " + return res + + # Load a string that represents a json to python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be 
created) + environment - dictionary of node of this group environment + annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = worker_node.get("service_account", None) + environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if 
volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta 
between current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access - data access class + :param job_ready_timeout: timeout to wait for fob to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for job become ready, exiting") + sys.exit(1) + # While job is running print log + previous_log_len = 0 + # At this point job could succeeded, failed, stop or running. So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {status}") + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.output_folder + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +class ComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + # @staticmethod + # def add_settings_to_component( + # task: dsl.PipelineTask, + # timeout: int, + # image_pull_policy: str = "IfNotPresent", + # cache_strategy: bool = False, + # ) -> None: + # """ + # Add 
settings to kfp task + # :param task: kfp task + # :param timeout: timeout to set to the component in seconds + # :param image_pull_policy: pull policy to set to the component + # :param cache_strategy: cache strategy + # """ + # + # kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # # Set cashing + # task.set_caching_options(enable_caching=cache_strategy) + # # image pull policy + # kubernetes.set_image_pull_policy(task, image_pull_policy) + # # Set the timeout for the task to one day (in seconds) + # kubernetes.set_timeout(task, seconds=timeout) + + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import get_logger + from kfp_support.workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / a_options.get("memory", 1)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index 05a850efb..dec823e4b 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -12,7 +12,8 @@ import sys -from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs +from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs + def start_ray_cluster( name: str, # name of Ray cluster diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index 9bf9dcbad..85fbf8dde 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -12,7 +12,8 @@ import sys -from kfp_support.workflow_support.utils import KFPUtils, RayRemoteJobs +from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs + # Cleans and shutdowns the Ray cluster def cleanup_ray_cluster( diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py 
b/kfp/kfp_ray_components/src/execute_ray_job.py index b7c3b5bc0..8fe53667f 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -10,7 +10,8 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.utils import KFPUtils, execute_ray_jobs +from kfp_support.workflow_support.runtime_utils import KFPUtils, execute_ray_jobs + if __name__ == "__main__": import argparse diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py new file mode 100644 index 000000000..bbe1476fb --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py @@ -0,0 +1,3 @@ +from kfp_support.workflow_support.compile_utils.component import ( + ComponentUtils +) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py new file mode 100644 index 000000000..1f66bf59f --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py @@ -0,0 +1,101 @@ +import kfp.dsl as dsl +from kfp import kubernetes +from typing import Dict + +RUN_NAME = "KFP_RUN_NAME" + +class ComponentUtils: + """ + Class containing methods supporting building pipelines + """ + + @staticmethod + def add_settings_to_component( + task: dsl.PipelineTask, + timeout: int, + image_pull_policy: str = "IfNotPresent", + cache_strategy: bool = False, + ) -> None: + """ + Add settings to kfp task + :param task: kfp task + :param timeout: timeout to set to the component in seconds + :param image_pull_policy: pull policy to set to the component + :param cache_strategy: cache strategy + """ + + kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, + field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") + # Set cashing + task.set_caching_options(enable_caching=cache_strategy) + # image pull policy + kubernetes.set_image_pull_policy(task, image_pull_policy) + # Set the timeout for the task to one day (in seconds) + kubernetes.set_timeout(task, seconds=timeout) + + @staticmethod + def set_s3_env_vars_to_component( + task: dsl.PipelineTask, + secret: str = '', + env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, + prefix: str = None, + ) -> None: + """ + Set S3 env variables to KFP component + :param task: kfp task + :param secret: secret name with the S3 credentials + :param env2key: dict with mapping each env variable to a key in the secret + :param prefix: prefix to add to env name + """ + + if prefix is not None: + for env_name, _ in env2key.items(): + env2key[prefix + "_" + env_name] = env2key.pop(env_name) + kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import GB, get_logger + from kfp_support.workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) 
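+        # Illustrative walk-through (assumed example values, not taken from a real deployment):
+        # with worker_options = {"replicas": 5, "cpu": 16, "memory": 64} and
+        # actor_options = {"num_cpus": 1, "memory": 2 * GB}, the computation below yields
+        # cluster_cpu = 5 * 16 = 80 and cluster_mem = 5 * 64 = 320, so
+        # n_actors_cpu = int(80 * 0.85 / 1) = 68, n_actors_memory = int(320 * 0.85 / 2) = 136,
+        # and n_actors = min(68, 136) = 68 actors are requested from Ray.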
+ + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py new file mode 100644 index 000000000..d2301bd0a --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py @@ -0,0 +1,2 @@ +from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils +from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py new file mode 100644 index 000000000..ef00b0e92 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py @@ -0,0 +1,113 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import json +import os +import re +import sys +from typing import Any + +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class KFPUtils: + """ + Helper utilities for KFP implementations + """ + + @staticmethod + def credentials( + access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" + ) -> tuple[str, str, str]: + """ + Get credentials from the environment + :param access_key: environment variable for access key + :param secret_key: environment variable for secret key + :param endpoint: environment variable for S3 endpoint + :return: + """ + s3_key = os.getenv(access_key, None) + s3_secret = os.getenv(secret_key, None) + s3_endpoint = os.getenv(endpoint, None) + if s3_key is None or s3_secret is None or s3_endpoint is None: + logger.warning("Failed to load s3 credentials") + return s3_key, s3_secret, s3_endpoint + + @staticmethod + def get_namespace() -> str: + """ + Get k8 namespace that we are running it + :return: + """ + ns = "" + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + except Exception as e: + logger.warning( + f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" + ) + else: + with file: + ns = file.read() + return ns + + @staticmethod + def runtime_name(ray_name: str = "", run_id: str = "") -> str: + """ + Get unique runtime name + :param ray_name: + :param run_id: + :return: runtime name + """ + # K8s objects cannot contain special characters, except '_', All characters should be in lower case. + if ray_name != "": + ray_name = ray_name.replace("_", "-").lower() + pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. + ray_name = re.sub(pattern, "", ray_name) + else: + ray_name = "a" + # the return value plus namespace name will be the name of the Ray Route, + # which length is restricted to 64 characters, + # therefore we restrict the return name by 15 character. + if run_id != "": + return f"{ray_name[:9]}-{run_id[:5]}" + return ray_name[:15] + + @staticmethod + def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: + res = f"python {executor} " + for key, value in d.items(): + if str(value) != "": + if isinstance(value, str): + if '"' in value: + logger.warning(f"can't parse inputs with double quotation marks, please use single quotation marks instead") + res += f'--{key}="{value}" ' + elif isinstance(value, bool): + if value: + res += f"--{key} " + else: + res += f"--{key}={value} " + + logger.info(f"request to execute: {res}") + return res + + # Load a string that represents a json to python dictionary + @staticmethod + def load_from_json(js: str) -> dict[str, Any]: + try: + return json.loads(js) + except Exception as e: + logger.warning(f"Failed to load parameters {js} with error {e}") + sys.exit(1) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py new file mode 100644 index 000000000..39d4d9e64 --- /dev/null +++ b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py @@ -0,0 +1,527 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import re +import sys +import time +from typing import Any + +from data_processing.data_access import DataAccess, DataAccessFactory +from data_processing.utils import ParamsUtils, get_logger +from kfp_support.api_server_client import KubeRayAPIs +from kfp_support.api_server_client.params import ( + DEFAULT_HEAD_START_PARAMS, + DEFAULT_WORKER_START_PARAMS, + Cluster, + ClusterSpec, + HeadNodeSpec, + RayJobRequest, + Template, + WorkerNodeSpec, + environment_variables_decoder, + volume_decoder, +) +from kfp_support.workflow_support.runtime_utils import KFPUtils +from ray.job_submission import JobStatus + + +logger = get_logger(__name__) + + +class RayRemoteJobs: + """ + class supporting Ray remote jobs + """ + + ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + def __init__( + self, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + default_image: str = "rayproject/ray:2.9.3-py310", + http_retries: int = 5, + wait_interval: int = 2, + ): + """ + Initialization + :param server_url: API server URL. Default value is assuming running inside the cluster + :param default_image - default Ray image + :param wait_interval: wait interval + :param http_retries: http retries + """ + self.api_server_client = KubeRayAPIs( + server_url=server_url, http_retries=http_retries, wait_interval=wait_interval + ) + self.default_image = default_image + + def create_ray_cluster( + self, + name: str, + namespace: str, + head_node: dict[str, Any], + worker_nodes: list[dict[str, Any]], + wait_cluster_ready: int = -1, + ) -> tuple[int, str]: + """ + Create Ray cluster + :param name: name, _ are not allowed in the name + :param namespace: namespace + :param head_node: head node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for head node + service_account - service account to use (has to be created) + environment - dictionary of head node environment + annotations: dictionary of head node annotation + labels: dictionary of head node labels + image_pull_policy: image pull policy, default IfNotPresent + + :param worker_nodes: an array of worker node specification dictionary including the following: + mandatory fields: + cpu - number of cpus + memory memory size (GB) + image - image to use + max_replicas - max replicas for this worker group + optional fields: + gpu - number of gpus + gpu_accelerator - gpu accelerator to use + replicas - number of replicas to create for this group (default 1) + min_replicas - min number of replicas for this group (default 0) + image_pull_secret - image pull secret + ray_start_params - dictionary of ray start parameters + volumes - list of volumes for this group + service_account - service account to use (has to be created) + environment - dictionary of node of 
this group environment + annotations: dictionary of node of this group annotation + labels: dictionary of node of this group labels + image_pull_policy: image pull policy, default IfNotPresent + + :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # start with templates + # head_node + cpus = head_node.get("cpu", 1) + memory = head_node.get("memory", 1) + gpus = head_node.get("gpu", 0) + accelerator = head_node.get("gpu_accelerator", None) + head_node_template_name = f"{name}-head-template" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) + head_template = Template( + name=head_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(head_template) + if status != 200: + return status, error + worker_template_names = [""] * len(worker_nodes) + index = 0 + # For every worker group + for worker_node in worker_nodes: + cpus = worker_node.get("cpu", 1) + memory = worker_node.get("memory", 1) + gpus = worker_node.get("gpu", 0) + accelerator = worker_node.get("gpu_accelerator", None) + worker_node_template_name = f"{name}-worker-template-{index}" + _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) + worker_template = Template( + name=worker_node_template_name, + namespace=namespace, + cpu=cpus, + memory=memory, + gpu=gpus, + gpu_accelerator=accelerator, + ) + status, error = self.api_server_client.create_compute_template(worker_template) + if status != 200: + return status, error + worker_template_names[index] = worker_node_template_name + index += 1 + # Build head node spec + image = head_node.get("image", self.default_image) + image_pull_secret = head_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) + volumes_dict = head_node.get("volumes", None) + service_account = head_node.get("service_account", None) + environment_dict = head_node.get("environment", None) + annotations = head_node.get("annotations", None) + labels = head_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + head_node_spec = HeadNodeSpec( + compute_template=head_node_template_name, + image=image, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + # build worker nodes + worker_groups = [] + index = 0 + for worker_node in worker_nodes: + max_replicas = worker_node.get("max_replicas", 1) + replicas = worker_node.get("replicas", 1) + min_replicas = worker_node.get("min_replicas", 0) + image = worker_node.get("image", self.default_image) + image_pull_secret = worker_node.get("image_pull_secret", None) + image_pull_policy = head_node.get("image_pull_policy", None) + ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) + volumes_dict = worker_node.get("volumes", None) + service_account = 
worker_node.get("service_account", None) + environment_dict = worker_node.get("environment", None) + annotations = worker_node.get("annotations", None) + labels = worker_node.get("labels", None) + if volumes_dict is None: + volumes = None + else: + volumes = [volume_decoder(v) for v in volumes_dict] + if environment_dict is None: + environment = None + else: + environment = environment_variables_decoder(environment_dict) + worker_groups.append( + WorkerNodeSpec( + group_name=f"worker-group-{index}", + compute_template=worker_template_names[index], + image=image, + max_replicas=max_replicas, + replicas=replicas, + min_replicas=min_replicas, + ray_start_params=ray_start_params, + volumes=volumes, + service_account=service_account, + image_pull_secret=image_pull_secret, + environment=environment, + annotations=annotations, + labels=labels, + image_pull_policy=image_pull_policy, + ) + ) + index += 1 + # Build cluster spec + cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) + # Build cluster + cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) + status, error = self.api_server_client.create_cluster(cluster) + if status != 200: + return status, error + # Wait for cluster ready + return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) + + def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: + """ + Clean up Ray cluster and supporting template + :param name: cluster name + :param namespace: cluster namespace + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + """ + # delete cluster + status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) + if status != 200: + return status, error + # clean up templates + status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) + if status != 200: + return status, error + for template in template_array: + if template.name.startswith(name): + status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) + if status != 200: + return status, error + return status, error + + def submit_job( + self, + name: str, + namespace: str, + request: dict[str, Any], + runtime_env: str = None, + executor: str = "transformer_launcher.py", + ) -> tuple[int, str, str]: + """ + Submit job for execution + :param name: cluster name + :param namespace: cluster namespace + :param request: dictionary of the remote job request + :param runtime_env: runtime environment string + :param executor: python file to execute + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + submission id - submission id + """ + # Although the cluster is ready, the service web server might not be ready yet at this point. + # To ensure that it is ready, trying to get jobs info from the cluster. 
Even if it fails + # couple of times, its harmless + _, _, _ = self.api_server_client.list_job_info(ns=namespace, name=name) + time.sleep(5) + # Build job request + job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) + if runtime_env is not None: + job_request.runtime_env = runtime_env + return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) + + def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: + """ + Get job status + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :return:tuple containing + http return code + message - only returned if http return code is not equal to 200 + status - job status + """ + # get job info + status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + return status, error, "" + return status, error, info.status + + @staticmethod + def _print_log(log: str, previous_log_len: int) -> None: + """ + Prints the delta between current and previous logs + :param log: current log + :param previous_log_len: previous log length + :return: None + """ + l_to_print = log[previous_log_len:] + if len(l_to_print) > 0: + l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) + print(l_to_print) + + def follow_execution( + self, + name: str, + namespace: str, + submission_id: str, + data_access: DataAccess = None, + job_ready_timeout: int = 600, + print_timeout: int = 120, + ) -> None: + """ + Follow remote job execution + :param name: cluster name + :param namespace: cluster namespace + :param submission_id: job submission ID + :param data_access - data access class + :param job_ready_timeout: timeout to wait for fob to become ready + :param print_timeout: print interval + :return: None + """ + # Wait for job to start running + job_status = JobStatus.PENDING + while job_status != JobStatus.RUNNING and job_ready_timeout > 0: + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: + break + time.sleep(self.api_server_client.wait_interval) + job_ready_timeout -= self.api_server_client.wait_interval + logger.info(f"job status is {job_status}") + if job_ready_timeout <= 0: + logger.warning("timed out waiting for job become ready, exiting") + sys.exit(1) + # While job is running print log + previous_log_len = 0 + # At this point job could succeeded, failed, stop or running. 
So print log regardless + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + # continue printing log, while job is running + while job_status == JobStatus.RUNNING: + time.sleep(print_timeout) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + previous_log_len = len(log) + status, error, job_status = self._get_job_status( + name=name, namespace=namespace, submission_id=submission_id + ) + if status // 100 != 2: + sys.exit(1) + # Print the final log and execution status + # Sleep here to avoid racing conditions + time.sleep(2) + status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) + if status // 100 != 2: + sys.exit(1) + self._print_log(log=log, previous_log_len=previous_log_len) + logger.info(f"Job completed with execution status {job_status}") + if job_status != JobStatus.SUCCEEDED: + sys.exit(1) + if data_access is None: + return + # Here data access is either S3 or lakehouse both of which contain self.output_folder + try: + output_folder = data_access.get_output_folder() + except Exception as e: + logger.warning(f"failed to get output folder {e}") + return + output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" + execution_log_path = f"{output_folder}execution.log" + logger.info(f"saving execution log to {execution_log_path}") + data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) + + +def _execute_remote_job( + name: str, + ns: str, + script: str, + params: dict[str, Any], + data_access_params: dict[str, Any], + additional_params: dict[str, Any], + remote_jobs: RayRemoteJobs, +) -> None: + """ + Execute remote job on Ray cluster + :param name: cluster name + :param ns: execution/cluster namespace + :param additional_params: additional parameters for the job + :param data_access_params: data access parameters + :param params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param script: script to run (has to be present in the image) + :param remote_jobs: remote jobs execution support class + :return: + """ + + status, error, submission = remote_jobs.submit_job(name=name, namespace=ns, request=params, executor=script) + if status != 200: + logger.error(f"Failed to submit job - status: {status}, error: {error}") + exit(1) + + logger.info(f"submitted job successfully, submission id {submission}") + # create data access + data_factory = DataAccessFactory() + data_factory.apply_input_params(args=data_access_params) + data_access = data_factory.create_data_access() + # print execution log + remote_jobs.follow_execution( + name=name, + namespace=ns, + submission_id=submission, + data_access=data_access, + print_timeout=additional_params.get("wait_print_tmout", 120), + job_ready_timeout=additional_params.get("wait_job_ready_tmout", 600), + ) + + +def execute_ray_jobs( + name: str, # name of Ray cluster + additional_params: dict[str, Any], + e_params: dict[str, Any], + exec_script_name: str, + server_url: str, +) -> None: + """ + Execute Ray jobs on a cluster periodically printing execution log. Completes when all Ray job complete. + All of the jobs will be executed, although some of the jobs may fail. 
+ :param name: cluster name + :param additional_params: additional parameters for the job + :param e_params: job execution parameters (specific for a specific transform, + generated by the transform workflow) + :param exec_script_name: script to run (has to be present in the image) + :param server_url: API server url + :return: None + """ + # prepare for execution + ns = KFPUtils.get_namespace() + if ns == "": + logger.warning(f"Failed to get namespace") + sys.exit(1) + # create remote jobs class + remote_jobs = RayRemoteJobs( + server_url=server_url, + http_retries=additional_params.get("http_retries", 5), + wait_interval=additional_params.get("wait_interval", 2), + ) + # find config parameter + config = ParamsUtils.get_config_parameter(e_params) + if config is None: + exit(1) + # get config value + config_value = KFPUtils.load_from_json(e_params[config].replace("'", '"')) + s3_creds = KFPUtils.load_from_json(e_params["data_s3_cred"].replace("'", '"')) + if type(config_value) is not list: + # single request + return _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: config_value, "data_s3_cred": s3_creds}, + params=e_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + # remove config key from the dictionary + launch_params = dict(e_params) + del launch_params[config] + # Loop through all configuration + n_launches = 0 + for conf in config_value: + # populate individual config and launch + launch_params[config] = ParamsUtils.convert_to_ast(d=conf) + try: + _execute_remote_job( + name=name, + ns=ns, + script=exec_script_name, + data_access_params={config: conf, "data_s3_cred": s3_creds}, + params=launch_params, + additional_params=additional_params, + remote_jobs=remote_jobs, + ) + n_launches += 1 + except SystemExit: + logger.warning(f"Failed to execute job for configuration {conf}") + continue + + if n_launches == 0: + logger.warning("All executions failed") + sys.exit(1) + else: + logger.info(f"{n_launches} ot of {len(config_value)} succeeded") diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py deleted file mode 100644 index 166032380..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from kfp_support.workflow_support.utils.kfp_utils import KFPUtils -from kfp_support.workflow_support.utils.pipeline_utils import PipelinesUtils -from kfp_support.workflow_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC -from kfp_support.workflow_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py deleted file mode 100644 index 714205129..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipeline_utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
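To clarify how the pieces above are meant to be driven together, here is a minimal end-to-end sketch, not part of the patch: the cluster name, namespace and node specifications are hypothetical, and it relies only on the method signatures defined above (the default KubeRay API server URL is the in-cluster address used by the class).

```python
# Hypothetical end-to-end use of RayRemoteJobs (illustrative values only).
from kfp_support.workflow_support.runtime_utils import RayRemoteJobs

remote_jobs = RayRemoteJobs()  # defaults to the in-cluster KubeRay API server URL

head = {"cpu": 2, "memory": 8, "image": "rayproject/ray:2.9.3-py310"}
workers = [{"cpu": 4, "memory": 16, "image": "rayproject/ray:2.9.3-py310",
            "replicas": 2, "min_replicas": 2, "max_replicas": 2}]

status, error = remote_jobs.create_ray_cluster(
    name="noop-demo", namespace="kubeflow",
    head_node=head, worker_nodes=workers, wait_cluster_ready=300)
assert status == 200, error

status, error, submission_id = remote_jobs.submit_job(
    name="noop-demo", namespace="kubeflow",
    request={"data_max_files": 10}, executor="noop_transform.py")
assert status == 200, error

# Stream the job log until it completes, then tear the cluster down.
remote_jobs.follow_execution(name="noop-demo", namespace="kubeflow",
                             submission_id=submission_id, print_timeout=60)
remote_jobs.delete_ray_cluster(name="noop-demo", namespace="kubeflow")
```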
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import datetime -import time -from typing import Any, Optional - -from data_processing.utils import get_logger -import kfp_server_api - -from kfp import Client - - -logger = get_logger(__name__) - - -class PipelinesUtils: - """ - Helper class for pipeline management - """ - - def __init__(self, host: str = "http://localhost:8080"): - """ - Initialization - :param host: host to connect to - """ - self.kfp_client = Client(host=host) - - def upload_pipeline( - self, - pipeline_package_path: str = None, - pipeline_name: str = None, - overwrite: bool = False, - description: str = None, - ) -> kfp_server_api.V2beta1Pipeline: - """ - Uploads the pipeline - :param pipeline_package_path: Local path to the pipeline package. - :param pipeline_name: Optional. Name of the pipeline to be shown in the UI - :param overwrite: Optional. If pipeline exists, delete it before creating a new one. - :param description: Optional. Description of the pipeline to be shown in the UI. - :return: Server response object containing pipeline id and other information. - """ - if overwrite: - pipeline = self.get_pipeline_by_name(name=pipeline_name) - if pipeline is not None: - try: - logger.info(f"pipeline {pipeline_name} already exists. Trying to delete it.") - self.kfp_client.delete_pipeline(pipeline_id=pipeline.id) - except Exception as e: - logger.warning(f"Exception deleting pipeline {e} before uploading") - return None - try: - pipeline = self.kfp_client.upload_pipeline( - pipeline_package_path=pipeline_package_path, pipeline_name=pipeline_name, description=description - ) - except Exception as e: - logger.warning(f"Exception uploading pipeline {e}") - return None - if pipeline is None: - logger.warning(f"Failed to upload pipeline {pipeline_name}.") - return None - logger.info("Pipeline uploaded") - return pipeline - - def delete_pipeline(self, pipeline_id): - """ - Delete pipeline. - :param pipeline_id: id of the pipeline. - :return - Returns: - Object. If the method is called asynchronously, returns the request thread. - Raises: - kfp_server_api.ApiException: If pipeline is not found. - """ - return self.kfp_client.delete_pipeline(pipeline_id) - - def start_pipeline( - self, - pipeline: kfp_server_api.V2beta1Pipeline, - experiment: kfp_server_api.V2beta1Experiment, - params: Optional[dict[str, Any]], - ) -> str: - """ - Start a specified pipeline. 
- :param pipeline: pipeline definition - :param experiment: experiment to use - :param params: pipeline parameters - :return: the id of the run object - """ - job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") - try: - run_id = self.kfp_client.run_pipeline( - experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params - ) - logger.info(f"Pipeline run {job_name} submitted") - return run_id.id - except Exception as e: - logger.warning(f"Exception starting pipeline {e}") - return None - - def get_experiment_by_name(self, name: str = "Default") -> kfp_server_api.V2beta1Experiment: - """ - Get experiment by name - :param name: name - :return: experiment - """ - try: - return self.kfp_client.get_experiment(experiment_name=name) - except Exception as e: - logger.warning(f"Exception getting experiment {e}") - return None - - def get_pipeline_by_name(self, name: str, np: int = 100) -> kfp_server_api.V2beta1Pipeline: - """ - Given pipeline name, return the pipeline - :param name: pipeline name - :param np: page size for pipeline query. For large clusters with many pipelines, you might need to - increase this number - :return: pipeline - """ - try: - # Get all pipelines - pipelines = self.kfp_client.list_pipelines(page_size=np).pipelines - required = list(filter(lambda p: name in p.name, pipelines)) - if len(required) != 1: - logger.warning(f"Failure to get pipeline. Number of pipelines with name {name} is {len(required)}") - return None - return required[0] - - except Exception as e: - logger.warning(f"Exception getting pipeline {e}") - return None - - def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 600) -> tuple[str, str]: - """ - Waits for a pipeline run to complete - :param run_id: run id - :param timeout: timeout (sec) (-1 wait forever) - :param wait: internal wait (sec) - :return: Completion status and an error message if such exists - """ - try: - if timeout > 0: - end = time.time() + timeout - else: - end = 2**63 - 1 - run_details = self.kfp_client.get_run(run_id=run_id) - status = run_details.run.status - while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: - time.sleep(wait) - if (end - time.time()) < 0: - return "failed", f"Execution is taking too long" - run_details = self.kfp_client.get_run(run_id=run_id) - status = run_details.run.status - logger.info(f"Got pipeline execution status {status}") - - if status.lower() in ["succeeded", "completed"]: - return status, "" - return status, run_details.run.error - - except Exception as e: - logger.warning(f"Failed waiting pipeline completion {e}") - return "failed", str(e) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py deleted file mode 100644 index 1e7ff9cf7..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py +++ /dev/null @@ -1,75 +0,0 @@ -import os -import sys - -from data_processing.utils import get_logger, str2bool - -from . import PipelinesUtils - - -logger = get_logger(__name__) - - -def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/", overwrite: bool = True): - """ - Upload and run a single pipeline - - :param pipeline_package_path: Local path to the pipeline package. - :param endpoint: endpoint to kfp service. - :return the pipeline name as it appears in the kfp GUI. 
- """ - tmout: int = 800 - wait: int = 60 - file_name = os.path.basename(pipeline_package_path) - pipeline_name = os.path.splitext(file_name)[0] - utils = PipelinesUtils(host=endpoint) - pipeline = utils.upload_pipeline( - pipeline_package_path=pipeline_package_path, - pipeline_name=pipeline_name, - overwrite=overwrite, - ) - if pipeline is None: - return None - experiment = utils.get_experiment_by_name() - run_id = utils.start_pipeline(pipeline, experiment, params=[]) - status, error = utils.wait_pipeline_completion(run_id=run_id, timeout=tmout, wait=wait) - if status.lower() not in ["succeeded", "completed"]: - # Execution failed - logger.warning(f"Pipeline {pipeline_name} failed with error {error} and status {status}") - return None - logger.info(f"Pipeline {pipeline_name} successfully completed") - return pipeline_name - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Run sanity test") - parser.add_argument("-c", "--command", type=str, choices=["upload", "sanity-test"]) - parser.add_argument("-e", "--endpoint", type=str, default="http://localhost:8080/") - parser.add_argument("-p", "--pipeline_package_path", type=str, default="") - parser.add_argument("-o", "--overwrite", type=str, default="True") - - args = parser.parse_args() - match args.command: - case "upload": - file_name = os.path.basename(args.pipeline_package_path) - pipeline_name = os.path.splitext(file_name)[0] - utils = PipelinesUtils(host=args.endpoint) - pipeline = utils.upload_pipeline( - pipeline_package_path=args.pipeline_package_path, - pipeline_name=pipeline_name, - overwrite=str2bool(args.overwrite), - ) - if pipeline is None: - sys.exit(1) - case "sanity-test": - run = run_test( - endpoint=args.endpoint, - pipeline_package_path=args.pipeline_package_path, - overwrite=str2bool(args.overwrite), - ) - if run is None: - sys.exit(1) - case _: - logger.warning("Unsupported command") - exit(1) diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index ddcb02ffd..e5b97d2b5 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -6,7 +6,6 @@ include ${REPOROOT}/kfp/requirements.env include ${REPOROOT}/.make.defaults USE_DEV_IMAGES ?= 1 -KFPv2 ?= 1 define set_env_var $(eval export $(1)=$(2)) @@ -65,6 +64,9 @@ ifeq ($(KFPv2), 1) else . ${WORKFLOW_VENV_ACTIVATE} && pip install -e $(REPOROOT)/kfp/kfp_support_lib/ endif + pip install kfp==${KFP} --extra-index-url https://pypi.org/simple; \ + pip install -e $($DPK_RAY_LIB_DIR); \ + pip install -e $(REPOROOT)/kfp/kfp_support_lib/; #TODO KFPv2 ${VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 502eea306..02fd06dc2 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -1,6 +1,4 @@ REPOROOT=../../.. 
- -KFPv2 ?= 1 # Use make help, to see the available rules include $(REPOROOT)/.make.defaults @@ -45,41 +43,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: -ifeq ($(KFPv2), 1) - $(MAKE) -C kfp_ray/v2 workflow-venv -else $(MAKE) -C kfp_ray/v1 workflow-venv -endif .PHONY: workflow-build workflow-build: -ifeq ($(KFPv2), 1) - $(MAKE) -C kfp_ray/v2 workflow-build -else - $(MAKE) -C kfp_ray/v1 workflow-build -endif - + $(MAKE) -C kfp_ray/v1 workflow-build .PHONY: workflow-test workflow-test: -ifeq ($(KFPv2), 1) - $(MAKE) -C kfp_ray/v2 workflow-build -else - $(MAKE) -C kfp_ray/v1 workflow-test -endif + $(MAKE) -C $(PIPELINE_PATH) workflow-test .PHONY: workflow-upload workflow-upload: -ifeq ($(KFPv2), 1) - $(MAKE) -C kfp_ray/v2 workflow-upload -else - $(MAKE) -C kfp_ray/v1 workflow-upload -endif + $(MAKE) -C $(PIPELINE_PATH) workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: -ifeq ($(KFPv2), 1) - $(MAKE) -C kfp_ray/v2 reconcile-requirements -else - $(MAKE) -C kfp_ray/v1 reconcile-requirements -endif + $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index db54e0087..b3aba7cdb 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.utils import ( +from kfp_support.workflow_support.runtime_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, @@ -29,7 +29,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.1.1-kfp-v21" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0-kfp-v21" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. @@ -40,16 +40,13 @@ def compute_exec_params_op(worker_options: str, actor_options: str) -> str: return ComponentUtils.default_compute_execution_params(worker_options, actor_options) -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - # create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") +create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") # execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") # clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") - +cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
TASK_NAME: str = "noop" @@ -154,7 +151,7 @@ def noop( exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC, image_pull_policy="Always") + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC,image_pull_policy="Always") ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) From 433dcae7665881ce319ffa82c76c7c3d5d827dac Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Fri, 31 May 2024 10:48:12 +0300 Subject: [PATCH 11/64] add KFPv2 installation setting Signed-off-by: Alexey Roytman --- kfp/doc/setup.md | 9 ++++++++- kind/Makefile | 11 ++++++++--- kind/README.md | 7 +++++-- kind/requirements.env | 4 +++- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/kfp/doc/setup.md b/kfp/doc/setup.md index e4803e16b..84385ef0f 100644 --- a/kfp/doc/setup.md +++ b/kfp/doc/setup.md @@ -66,7 +66,14 @@ choose your OS system, and process according to "(Optional) Install the MinIO Cl ## Installation steps -You can create a Kind cluster with all required software installed using the following command: +Before installation, you have to deside which KFP version do you want to use. +In order to use KFP v2, please set the following environment variable: + +```shell +export KFPv2=1 +``` + +Now, you can create a Kind cluster with all required software installed using the following command: ```shell make setup diff --git a/kind/Makefile b/kind/Makefile index 873510b15..2a8fea1da 100644 --- a/kind/Makefile +++ b/kind/Makefile @@ -1,12 +1,17 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. -export REPOROOT=${CURDIR}/../ +export ROOT_DIR=${CURDIR}/../ -IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kind/requirements.env | sed 's/=/:=/' | sed 's/^/export /' > makeenv") + +ifneq ($(KFPv2), 1) + GREP_V=KFP_V2 +else + GREP_V=KFP_V1 +endif +IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kind/requirements.env | sed 's/=/:=/;/$(GREP_V)/d;s/KFP_V._//;s/^/export /' > makeenv") include makeenv -export ROOT_DIR=${CURDIR} # Include the common rules. # Use "make help" to see them. include ../.make.defaults diff --git a/kind/README.md b/kind/README.md index e4a106f25..60fae6841 100644 --- a/kind/README.md +++ b/kind/README.md @@ -28,7 +28,10 @@ amount of node, modify [cluster configuration](hack/kind-cluster-config.yaml) Install [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/v1/installation/standalone-deployment/#deploying-kubeflow-pipelines) and wait for it to be ready: ```shell -cd $ROOT_DIR/hack/tools/ && PIPELINE_VERSION=1.8.5 ./install_kubeflow.sh deploy && cd - +# Set required KFP version. You can reference to the latest supported version in the [requirements.env](./requirements.env) file. 
+# Currently, we support 1.8.5 for KFPv1 and 2.2.0 for KFP v2 +export PIPELINE_VERSION=1.8.5 +cd $ROOT_DIR/hack/tools/ && ./install_kubeflow.sh deploy && cd - kubectl wait --for=condition=ready --all pod -n kubeflow --timeout=300s ``` @@ -56,7 +59,7 @@ kubectl wait --namespace ingress-nginx \ --timeout=90s ``` -To deploy the ingress for ray apiserver, kfp and Minio execute the following: +To deploy the ingress for Ray API Server, KFP and MinIO execute the following: ```shell kubectl apply -f $ROOT_DIR/hack/ray_api_server_ingress.yaml kubectl apply -f $ROOT_DIR/hack/kfp_ingress.yaml diff --git a/kind/requirements.env b/kind/requirements.env index 70eca5bd8..cedd6ba0e 100644 --- a/kind/requirements.env +++ b/kind/requirements.env @@ -1,4 +1,6 @@ -PIPELINE_VERSION=1.8.5 +KFP_V1_PIPELINE_VERSION=1.8.5 +KFP_V2_PIPELINE_VERSION=2.2.0 + KUBERAY_OPERATOR=1.0.0 KUBERAY_APISERVER=1.1.0 From 815f0ca57b89196937a9e743dc12fb17ab6a4dc2 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Sun, 2 Jun 2024 10:33:41 +0300 Subject: [PATCH 12/64] before rebase Signed-off-by: Alexey Roytman --- .make.defaults | 2 +- kfp/kfp_ray_components/Dockerfile | 4 +- kfp/kfp_ray_components/Makefile | 24 +- .../kfp_support_lib_v2/README.md | 68 -- .../kfp_support_lib_v2/pyproject.toml | 47 -- .../kfp_support/api_server_client/README.md | 4 - .../kfp_support/api_server_client/__init__.py | 1 - .../api_server_client/params/cluster.py | 475 ------------- .../api_server_client/params/workernode.py | 206 ------ .../kfp_support/workflow_support/README.md | 45 -- .../runtime_utils/remote_jobs_utils.py | 527 --------------- .../kfp_support/workflow_support_v2/README.md | 36 - .../workflow_support_v2/__init__.py | 0 .../comp_utils/__init__.py | 3 - .../comp_utils/component.py | 54 -- .../workflow_support_v2/utils/__init__.py | 8 - .../utils/workflow_utils.py | 557 --------------- .../src/create_ray_cluster.py | 11 +- .../src/delete_ray_cluster.py | 10 +- kfp/kfp_ray_components/src/execute_ray_job.py | 10 +- .../src/execute_ray_job_multi_s3.py | 10 +- kfp/kfp_ray_components/src/subworkflow.py | 11 +- .../{Makefile => Makefile_old} | 0 kfp/kfp_support_lib/README.md | 9 +- .../doc/kfp_support_library.md | 2 +- .../kfp_v1_workflow_support}/Makefile | 17 +- .../pyproject.toml | 4 +- .../src/utils/__init__.py | 4 + .../src}/utils/components_utils.py | 0 .../src/utils}/kfp_utils.py | 0 .../src}/utils/pipeline_utils.py | 0 .../src}/utils/pipelines_tests_utils.py | 0 .../src}/utils/remote_jobs_utils.py | 2 +- .../test/pipeline_utils_test.py | 2 +- .../kfp_v2_workflow_support/Makefile | 62 ++ .../kfp_v2_workflow_support}/pyproject.toml | 9 +- .../src}/compile_utils/__init__.py | 0 .../src}/compile_utils/component.py | 0 .../src}/runtime_utils/__init__.py | 0 .../src/runtime_utils}/kfp_utils.py | 0 .../src}/runtime_utils/remote_jobs_utils.py | 2 +- .../python_apiserver_client/.gitignore | 32 + .../python_apiserver_client/Makefile | 53 ++ .../python_apiserver_client/README.md | 4 + .../python_apiserver_client/pyproject.toml | 28 + .../src/python_apiserver_client/__init__.py | 1 + .../python_apiserver_client}/kuberay_apis.py | 2 +- .../params/__init__.py | 14 +- .../params/cluster.py | 2 +- .../params/environmentvariables.py | 0 .../params/headnode.py | 2 +- .../params/jobsubmission.py | 0 .../params/templates.py | 0 .../params/volumes.py | 0 .../params/workernode.py | 2 +- .../test/api_params_test.py | 2 +- .../test/configmaps.py | 0 .../test/kuberay_api_test.py | 6 +- .../test/ray_remote_jobs_test.py | 4 +- 
.../kfp_support/api_server_client/README.md | 4 - .../kfp_support/api_server_client/__init__.py | 1 - .../api_server_client/kuberay_apis.py | 636 ------------------ .../api_server_client/params/__init__.py | 53 -- .../params/environmentvariables.py | 158 ----- .../api_server_client/params/headnode.py | 202 ------ .../api_server_client/params/jobsubmission.py | 163 ----- .../api_server_client/params/templates.py | 224 ------ .../api_server_client/params/volumes.py | 449 ------------- .../api_server_client/params/workernode.py | 206 ------ .../kfp_support/workflow_support/README.md | 45 -- .../workflow_support/utils/__init__.py | 4 - kfp/kfp_support_lib/test/api_params_test.py | 433 ------------ kfp/kfp_support_lib_v2/README.md | 68 -- .../doc/kfp_support_library.md | 10 - .../kfp_support/api_server_client/README.md | 4 - .../kfp_support/api_server_client/__init__.py | 1 - .../api_server_client/kuberay_apis.py | 636 ------------------ .../api_server_client/params/__init__.py | 53 -- .../api_server_client/params/cluster.py | 475 ------------- .../params/environmentvariables.py | 158 ----- .../api_server_client/params/headnode.py | 202 ------ .../api_server_client/params/jobsubmission.py | 163 ----- .../api_server_client/params/templates.py | 224 ------ .../api_server_client/params/volumes.py | 449 ------------- .../kfp_support/workflow_support/README.md | 45 -- .../compile_utils/__init__.py | 3 - .../compile_utils/component.py | 101 --- .../runtime_utils/__init__.py | 2 - .../runtime_utils/kfp_utils.py | 113 ---- kfp/kfp_support_lib_v2/test/configmaps.py | 72 -- .../test/kuberay_api_test.py | 297 -------- .../test/ray_remote_jobs_test.py | 90 --- kfp/requirements.env | 3 +- .../universal/noop/kfp_ray/v2/noop_wf.py | 40 +- 94 files changed, 305 insertions(+), 7860 deletions(-) delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py delete mode 100644 kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py rename kfp/kfp_support_lib/{Makefile => Makefile_old} (100%) rename kfp/{kfp_support_lib_v2 => kfp_support_lib/kfp_v1_workflow_support}/Makefile (82%) rename kfp/kfp_support_lib/{ => kfp_v1_workflow_support}/pyproject.toml (94%) create 
mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py rename kfp/kfp_support_lib/{src/kfp_support/workflow_support => kfp_v1_workflow_support/src}/utils/components_utils.py (100%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils => kfp_support_lib/kfp_v1_workflow_support/src/utils}/kfp_utils.py (100%) rename kfp/kfp_support_lib/{src/kfp_support/workflow_support => kfp_v1_workflow_support/src}/utils/pipeline_utils.py (100%) rename kfp/kfp_support_lib/{src/kfp_support/workflow_support => kfp_v1_workflow_support/src}/utils/pipelines_tests_utils.py (100%) rename kfp/kfp_support_lib/{src/kfp_support/workflow_support => kfp_v1_workflow_support/src}/utils/remote_jobs_utils.py (99%) rename kfp/kfp_support_lib/{ => kfp_v1_workflow_support}/test/pipeline_utils_test.py (95%) create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile rename kfp/{kfp_support_lib_v2 => kfp_support_lib/kfp_v2_workflow_support}/pyproject.toml (87%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support => kfp_support_lib/kfp_v2_workflow_support/src}/compile_utils/__init__.py (100%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support => kfp_support_lib/kfp_v2_workflow_support/src}/compile_utils/component.py (100%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support => kfp_support_lib/kfp_v2_workflow_support/src}/runtime_utils/__init__.py (100%) rename kfp/kfp_support_lib/{src/kfp_support/workflow_support/utils => kfp_v2_workflow_support/src/runtime_utils}/kfp_utils.py (100%) rename kfp/{kfp_support_lib_v2/src/kfp_support/workflow_support => kfp_support_lib/kfp_v2_workflow_support/src}/runtime_utils/remote_jobs_utils.py (99%) create mode 100644 kfp/kfp_support_lib/python_apiserver_client/.gitignore create mode 100644 kfp/kfp_support_lib/python_apiserver_client/Makefile create mode 100644 kfp/kfp_support_lib/python_apiserver_client/README.md create mode 100644 kfp/kfp_support_lib/python_apiserver_client/pyproject.toml create mode 100644 kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/kuberay_apis.py (99%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/__init__.py (65%) rename kfp/kfp_support_lib/{src/kfp_support/api_server_client => python_apiserver_client/src/python_apiserver_client}/params/cluster.py (99%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/environmentvariables.py (100%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/headnode.py (99%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/jobsubmission.py (100%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/templates.py (100%) rename kfp/{kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client => 
kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/volumes.py (100%) rename kfp/{kfp_support_lib_v2/src/kfp_support/api_server_client => kfp_support_lib/python_apiserver_client/src/python_apiserver_client}/params/workernode.py (99%) rename kfp/{kfp_support_lib_v2 => kfp_support_lib/python_apiserver_client}/test/api_params_test.py (99%) rename kfp/kfp_support_lib/{ => python_apiserver_client}/test/configmaps.py (100%) rename kfp/kfp_support_lib/{ => python_apiserver_client}/test/kuberay_api_test.py (98%) rename kfp/kfp_support_lib/{ => python_apiserver_client}/test/ray_remote_jobs_test.py (94%) delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/environmentvariables.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/jobsubmission.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/templates.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/volumes.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py delete mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md delete mode 100644 kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py delete mode 100644 kfp/kfp_support_lib/test/api_params_test.py delete mode 100644 kfp/kfp_support_lib_v2/README.md delete mode 100644 kfp/kfp_support_lib_v2/doc/kfp_support_library.md delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py delete mode 100644 kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py delete mode 100644 kfp/kfp_support_lib_v2/test/configmaps.py delete mode 100644 kfp/kfp_support_lib_v2/test/kuberay_api_test.py delete mode 100644 kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py diff --git 
a/.make.defaults b/.make.defaults index 9efe778ad..46bf3dab2 100644 --- a/.make.defaults +++ b/.make.defaults @@ -54,7 +54,7 @@ DPK_PYTHON_LIB_DIR=$(REPOROOT)/data-processing-lib/python DPK_RAY_LIB_DIR=$(REPOROOT)/data-processing-lib/ray DPK_SPARK_LIB_DIR=$(REPOROOT)/data-processing-lib/spark -KFPv2?=1 +KFPv2?=0 ifeq ($(KFPv2), 0) PIPELINE_PATH="kfp_ray/v1" diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 69f9f0d67..c19799977 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -10,11 +10,11 @@ LABEL git-commit=$GIT_COMMIT COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt -# Copy and install data processing libraries +# Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . -COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ +COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . COPY --chown=ray:users kfp_support_lib/ kfp_support_lib/ diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 30ef36f5a..e489da4ac 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -10,16 +10,16 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed ' include makeenv -ifeq ($(KFPv2), 0) -DOCKER_FILE=Dockerfile -DOCKER_IMAGE_NAME=kfp-data-processing -DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION} -KFP_SUPPORT_LIB=kfp_support_lib -else +ifeq ($(KFPv2), 1) DOCKER_FILE=Dockerfile_v2 DOCKER_IMAGE_NAME=kfp-data-processing_v2 DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION_v2} KFP_SUPPORT_LIB=kfp_support_lib_v2 +else +DOCKER_FILE=Dockerfile +DOCKER_IMAGE_NAME=kfp-data-processing +DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION} +KFP_SUPPORT_LIB=kfp_support_lib endif @@ -31,11 +31,11 @@ DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) .lib-src-image:: $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python - $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib LIB_NAME=kfp_support_lib + # $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib LIB_NAME=kfp_support_lib $(MAKE) .defaults.image rm -rf data-processing-lib-ray rm -rf data-processing-lib-python - rm -rf kfp_support_lib + # rm -rf kfp_support_lib .PHONY: image image: Dockerfile Dockerfile_v2 requirements.txt @@ -45,10 +45,10 @@ image: Dockerfile Dockerfile_v2 requirements.txt .PHONY: reconcile-requirements reconcile-requirements: @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" createRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml - sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" 
createRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" deleteRayClusterComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml + sed -i.back "s/kfp-data-processing.*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml # TODO remove it for KFPv2 sed -i.back "s/kfp-data-processing*:[0-9].*/$(DOCKER_IMAGE_NAME):${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/README.md deleted file mode 100644 index 86f3f4360..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# KFP support library - -This provides support for implementing KFP pipelines automating transform's execution. -It comprises 2 main modules - -* [api server client](src/kfp_support/api_server_client/README.md) -* [workflow support](src/kfp_support/workflow_support/README.md) - -## Development - -### Requirements -1. python 3.10 or later -2. git command line tools -3. [pre-commit](https://pre-commit.com/) -4. twine (pip install twine) - * but on Mac you may have to include a dir in your PATH, such as `export PATH=$PATH:/Library/Frameworks/Python.framework/Versions/3.10/bin` - -### Git -Simple clone the repo and set up the pre-commit hooks. -```shell -git clone git@github.com:IBM/data-prep-kit.git -cd kfp/kfp_support_lib -pre-commit install -``` -If you don't have pre-commit, you can install from [here](https://pre-commit.com/) - -## Library Artifact Build and Publish - -The process of creating a release for `fm_data_processing_kfp` package involves the following steps: - -cd to the package directory. - -update the version in [requirements.env](../requirements.env) file. - -run `make build` and `make publish`. - -## Testing - -To run the package tests perform the following: - -To begin with, establish a Kind cluster and deploy all required components by executing the makfefile command in the main directory of this repository. As an alternative, you can manually execute the instructions provided in the [README.md](../../kind/README.md) file. - -```bash -make setup -``` - -The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. - -```bash -make build -``` - -lastly, execute the tests: - -```bash -make test -``` - -### Cleanup - -It is advisable to execute the following command prior to running `make test` once more. This will ensure that any -previous test runs resources are removed before starting new tests. - -```bash -kubectl delete workflows -n kubeflow --all -``` - - diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml b/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml deleted file mode 100644 index f995d60d7..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/pyproject.toml +++ /dev/null @@ -1,47 +0,0 @@ -[project] -name = "data_prep_toolkit_kfp_v2" -version = "0.1.1" -requires-python = ">=3.10" -description = "Data Preparation Kit Library. 
KFP v2 support" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, - { name = "Alexey Roytman", email = "roytman@il.ibm.com" }, - { name = "Mohammad Nassar", email = "Mohammad.Nassar@ibm.com" }, - { name = "Revital Eres", email = "eres@il.ibm.com" }, -] -dependencies = [ - "kfp==2.7.0", - "kfp-kubernetes==1.2.0", - "requests", - "data-prep-toolkit==0.1.1", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", -] - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/kfp_support"] - -[tool.pytest.ini_options] -addopts = "--cov --cov-report term-missing --cov-fail-under 10" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md deleted file mode 100644 index 423f743a1..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# KubeRay API server APIs - -This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) -Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py deleted file mode 100644 index 60cbbc2f2..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py deleted file mode 100644 index 922a14bef..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py +++ /dev/null @@ -1,475 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import enum -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - HeadNodeSpec, - WorkerNodeSpec, - environment_variables_decoder, - head_node_spec_decoder, - volume_decoder, - worker_node_spec_decoder, -) - - -class Environment(enum.Enum): - """ - Environment definitions - """ - - DEV = 0 # development - TESTING = 1 # testing - STAGING = 2 # staging - PRODUCTION = 3 # production - - -class UpscalingMode(enum.Enum): - """ - Enumeration of autoscaling mode - """ - - Conservative = ( - "Conservative" # Rate-limited; the number of pending worker pods is at most the size of the Ray cluster - ) - Default = "Default" # no rate limitations - Aggressive = "Aggressive" # same as default - - -class AutoscalerOptions: - """ - AutoscalerOptions is used to define Ray cluster autoscaling. - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create autoscaling options specification: gets the following parameters: - idle_timeout - optional, number of seconds to wait before scaling down a worker pod which is not using Ray - resources. Default 60sec (one minute). - upscaling_mode - required autoscaler upscaling mode - image - optional, allows to override the autoscaler's container image - image_pull_policy - optional, allows to override the autoscaler's container image pull policy - cpus - optional, CPUs requirements for autoscaler - default "500m" - memory - optional, memory requirements for autoscaler - default "512Mi" - environment - optional, environment variables for autoscaler container - volumes - optional, a list of volumes to attach to autoscaler container. - This is needed for enabling TLS for the autoscaler container. 
- """ - - def __init__( - self, - upscaling_mode: UpscalingMode = UpscalingMode.Default, - idle_tmout: int = None, - image: str = None, - image_pull_policy: str = None, - cpus: str = None, - memory: str = None, - environment: EnvironmentVariables = None, - volumes: list[BaseVolume] = None, - ): - """ - Initialization - :param upscaling_mode: upscale mode - :param idle_tmout: idle timeout - :param image: image - :param image_pull_policy: image pull policy - :param cpus: cpu requirement for autoscaling - :param memory: memory requirement for autoscaling - :param environment: autoscaler environment - :param volumes: volumes for autoscaler - """ - self.upscaling_mode = upscaling_mode - self.idle_tmout = idle_tmout - self.image = image - self.image_pull_policy = image_pull_policy - self.cpus = cpus - self.memory = memory - self.environment = environment - self.volumes = volumes - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of the head node - """ - val = f"upscaling_mode = {self.upscaling_mode}" - if self.idle_tmout is not None: - val += f", idle_timeout = {self.idle_tmout}" - if self.image is not None: - val += f", image = {self.image}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.cpus is not None: - val += f", cpus = {self.cpus}" - if self.memory is not None: - val += f", memory = {self.memory}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of the head node - """ - dct = {"upscalingMode": self.upscaling_mode.value} - if self.idle_tmout is not None: - dct["idleTimeoutSeconds"] = self.idle_tmout - if self.image is not None: - dct["image"] = self.image - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.cpus is not None: - dct["cpu"] = self.cpus - if self.memory is not None: - dct["memory"] = self.memory - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["envs"] = self.environment.to_dict() - return dct - - -class ClusterSpec: - """ - ClusterSpec is used to define Ray cluster. - It provides APIs to create, stringify, convert to dict and json. 
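For reference, a minimal usage sketch for the `AutoscalerOptions` helper shown above; the values are placeholders and the import assumes the class is re-exported by `kfp_support.api_server_client.params`.

```python
# Minimal usage sketch for AutoscalerOptions (values are placeholders).
# The import assumes the class is re-exported by kfp_support.api_server_client.params.
from kfp_support.api_server_client.params import AutoscalerOptions, UpscalingMode

options = AutoscalerOptions(
    upscaling_mode=UpscalingMode.Default,
    idle_tmout=120,      # scale an idle worker down after two minutes
    cpus="500m",
    memory="512Mi",
)
print(options.to_string())
# to_dict() yields the API-server payload, e.g.
# {'upscalingMode': 'Default', 'idleTimeoutSeconds': 120, 'cpu': '500m', 'memory': '512Mi'}
print(options.to_dict())
```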
- - Methods: - - Create cluster spec from: gets the following parameters: - head_group_spec - required, specification of the head node - worker_group_spec - optional, list of worker group specs - autoscaler_options - optional, autoscaling options - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__( - self, - head_node: HeadNodeSpec, - worker_groups: list[WorkerNodeSpec] = None, - autoscaling_options: AutoscalerOptions = None, - ): - """ - Initialization - :param head_node - head node definition - :param worker_groups - worker group definition - :param autoscaling_options - autoscaler options - """ - self.head_node = head_node - self.worker_groups = worker_groups - self.autoscaling_options = autoscaling_options - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of cluster spec - """ - val = f"head_group_spec: {self.head_node.to_string()}" - if self.worker_groups is not None: - val += "\nworker groups: " - for w in self.worker_groups: - val += f"\nworker_group_spec = {w.to_string()}]" - if self.autoscaling_options is not None: - val += f"\nautoscaling options = {self.autoscaling_options.to_string()}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: Dictionary representation of cluster spec - """ - dst = {"headGroupSpec": self.head_node.to_dict()} - if self.worker_groups is not None: - dst["workerGroupSpec"] = [w.to_dict() for w in self.worker_groups] - if self.autoscaling_options is not None: - dst["enableInTreeAutoscaling"] = True - dst["autoscalerOptions"] = self.autoscaling_options.to_dict() - return dst - - -class ClusterEvent: - """ - Cluster event is used to define events emitted during cluster creation. - It provides APIs to create and stringify. Its output only data, so we do not need to implement to_dict - - Methods: - - Create event: gets the dictionary with the following parameters: - id - unique Event Id - name - human readable event name - created_at - event creation time - first_timestamp - first time the event occur - last_timestamp - last time the event occur - reason - reason for the transition into the object's current status - message - human-readable description of the status of this operation - type - type of this event (Normal, Warning), new types could be added in the future - count - number of times this event has occurred - """ - - def __init__(self, dst: dict[str, Any]): - """ - Initialization from dictionary - :param dst: dictionary representation of cluster event - """ - self.id = dst.get("id", "") - self.name = dst.get("name", "") - self.created_at = dst.get("created_at", "") - self.first_timestamp = dst.get("first_timestamp", "") - self.last_timestamp = dst.get("last_timestamp", "") - self.reason = dst.get("reason", "") - self.message = dst.get("message", "") - self.type = dst.get("type", "") - self.count = dst.get("count", "0") - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of cluster event - """ - return ( - f"id = {self.id}, name = {self.name}, created_at = {self.created_at}, " - f"first_timestamp = {self.first_timestamp}, last_timestamp = {self.last_timestamp}," - f"reason = {self.reason}, message = {self.message}, type = {self.type}, count = {self.count}" - ) - - -class Cluster: - """ - Cluster is used to define Ray cluster. - It provides APIs to create, stringify, convert to dict and json. 
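Similarly, a rough sketch of assembling a `ClusterSpec` from head and worker node specs and inspecting the resulting request dictionary; the template names and image are placeholders, and the re-exported import path is assumed.

```python
# Rough sketch: assemble a ClusterSpec and inspect the request dictionary.
# Template names and the image are placeholders; imports are assumed to be
# re-exported by kfp_support.api_server_client.params.
from kfp_support.api_server_client.params import (
    DEFAULT_HEAD_START_PARAMS,
    ClusterSpec,
    HeadNodeSpec,
    WorkerNodeSpec,
)

head = HeadNodeSpec(
    compute_template="demo-head-template",
    image="rayproject/ray:2.9.3-py310",
    ray_start_params=DEFAULT_HEAD_START_PARAMS,
)
workers = [
    WorkerNodeSpec(
        group_name="worker-group-0",
        compute_template="demo-worker-template-0",
        image="rayproject/ray:2.9.3-py310",
        max_replicas=3,
        replicas=1,
        min_replicas=0,
    )
]
spec = ClusterSpec(head_node=head, worker_groups=workers)
print(spec.to_dict())   # {'headGroupSpec': {...}, 'workerGroupSpec': [{...}]}
```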
- - Methods: - - Create env variable from: gets the following parameters: - name - required, unique (per namespace) cluster name - namespace - required, cluster's namespace (should exist) - user - required, user who owns the cluster - version - required, Ray cluster version - typically Ray version - deployment_environment - optional (see Environment) - cluster_spec - required, ray cluster configuration - annotations - optional, annotations, for example, "kubernetes.io/ingress.class" to define Ingress class - cluster_environment - optional, cluster environment variables - created_at - output, cluster creation ts - deleted_at - output, cluster deletion ts - cluster_status - output, cluster status - events - output, cluster events - service_endpoint - output, cluster service endpoints - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__( - self, - name: str, - namespace: str, - user: str, - version: str, - cluster_spec: ClusterSpec, - deployment_environment: Environment = None, - annotations: dict[str, str] = None, - cluster_environment: EnvironmentVariables = None, - created_at: str = None, - deleted_at: str = None, - cluster_status: str = None, - events: list[ClusterEvent] = None, - service_endpoint: dict[str, str] = None, - ): - """ - Initialization - :param name: cluster name - :param namespace: cluster namespace - :param user: user name - :param version: version - :param cluster_spec: cluster spec - :param deployment_environment: cluster deployment environment - :param annotations: cluster annotations - :param cluster_environment: cluster environment - :param created_at: created at - :param deleted_at: deleted at - :param cluster_status: status - :param events: cluster events - :param service_endpoint: service endpoint - """ - self.name = name - self.namespace = namespace - self.user = user - self.version = version - self.cluster_spec = cluster_spec - self.environment = deployment_environment - self.annotations = annotations - self.envs = cluster_environment - self.created_at = created_at - self.deleted_at = deleted_at - self.cluster_status = cluster_status - self.events = events - self.service_endpoint = service_endpoint - - def to_string(self) -> str: - """ - convert to string representation - :return: string representation of cluster - """ - val = ( - f"name: {self.name}, namespace = {self.namespace}, user = {self.user}, version = {self.version} " - f"cluster_spec = {self.cluster_spec.to_string()}" - ) - if self.environment is not None: - val += f"deployment environment = {self.environment.name}" - if self.annotations is not None: - val += f" ,annotations = {str(self.annotations)}" - if self.envs is not None: - val = val + f",cluster environment = {self.envs.to_string()}" - val += "\ncluster output\n" - if self.created_at is not None: - val += f" ,created_at = {self.created_at}" - if self.deleted_at is not None: - val += f" ,deleted_at = {self.deleted_at}" - if self.cluster_status is not None: - val += f" ,cluster status = {self.cluster_status}" - if self.events is not None: - val = val + ",\n cluster events = [" - first = True - for e in self.events: - if first: - first = False - else: - val += ", " - val = val + "{" + e.to_string() + "}" - val = val + "]" - if self.service_endpoint is not None: - val += f" ,service endpoints = {str(self.service_endpoint)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - convert to dictionary - :return: dictionary representation of cluster - """ - # only 
convert input variables - dst = { - "name": self.name, - "namespace": self.namespace, - "user": self.user, - "version": self.version, - "clusterSpec": self.cluster_spec.to_dict(), - } - if self.environment is not None: - dst["environment"] = self.environment.value - if self.annotations is not None: - dst["annotations"] = self.annotations - if self.envs is not None: - dst["envs"] = self.envs.to_dict() - return dst - - -""" - Creates new cluster from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def autoscaling_decoder(dct: dict[str, Any]) -> AutoscalerOptions: - """ - Create autoscaling options from its dictionary representation - :param dct: dictionary representation of cluster spec - :return: autoscaling options - """ - upscaling_mode = UpscalingMode.Default - if "upscalingMode" in dct: - upscaling_mode = UpscalingMode(dct.get("upscalingMode")) - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("envs")) > 0: - environments = environment_variables_decoder(dct.get("envs")) - return AutoscalerOptions( - upscaling_mode=upscaling_mode, - idle_tmout=dct.get("idleTimeoutSeconds", None), - image=dct.get("image", None), - image_pull_policy=dct.get("imagePullPolicy", None), - cpus=dct.get("cpu", None), - memory=dct.get("memory", None), - environment=environments, - volumes=volumes, - ) - - -def cluster_spec_decoder(dct: dict[str, Any]) -> ClusterSpec: - """ - Create cluster spec from its dictionary representation - :param dct: dictionary representation of cluster spec - :return: cluster spec - """ - workers = None - autoscaling_options = None - if "workerGroupSpec" in dct: - workers = [worker_node_spec_decoder(w) for w in dct["workerGroupSpec"]] - if "enableInTreeAutoscaling" in dct and dct.get("enableInTreeAutoscaling"): - autoscaling_options = autoscaling_decoder(dct.get("autoscalerOptions", {})) - return ClusterSpec( - head_node=head_node_spec_decoder(dct.get("headGroupSpec")), - worker_groups=workers, - autoscaling_options=autoscaling_options, - ) - - -def cluster_decoder(dct: dict[str, Any]) -> Cluster: - """ - Create cluster from its dictionary representation - :param dct: dictionary representation of cluster - :return: cluster - """ - environment = None - if "environment" in dct: - environment = Environment(int(dct.get("environment", "0"))) - events = None - if "events" in dct: - events = [ClusterEvent(c) for c in dct["events"]] - envs = None - if "envs" in dct: - envs = environment_variables_decoder(dct.get("envs")) - return Cluster( - name=dct.get("name", ""), - namespace=dct.get("namespace", ""), - user=dct.get("user", ""), - version=dct.get("version", ""), - cluster_spec=cluster_spec_decoder(dct.get("clusterSpec")), - deployment_environment=environment, - annotations=dct.get("annotations"), - cluster_environment=envs, - created_at=dct.get("createdAt"), - deleted_at=dct.get("deletedAt"), - cluster_status=dct.get("clusterState"), - events=events, - service_endpoint=dct.get("serviceEndpoint"), - ) - - -def clusters_decoder(dct: dict[str, any]) -> list[Cluster]: - """ - Create list of clusters from its dictionary representation - :param dct: dictionary representation of a list of clusters - :return: list of clusters - """ - return [cluster_decoder(cluster) for cluster in dct["clusters"]] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py 
b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py deleted file mode 100644 index ddcf193cc..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py +++ /dev/null @@ -1,206 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - environment_variables_decoder, - volume_decoder, -) - - -DEFAULT_WORKER_START_PARAMS = {"node-ip-address": "$MY_POD_IP"} - - -class WorkerNodeSpec: - """ - WorkerNodeSpec is used to define Ray cluster worker node pool configuration. - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create worker node pool specification: gets the following parameters: - group_name - required, group name of the worker group - compute_template - required, the computeTemplate of worker node group - replicas - required, desired replicas of the worker group - min_replicas - required Min replicas of the worker group, can't be greater than max_replicas - max_replicas - required, max replicas of the worker group - ray_start_params - required, Ray start parameters - image - optional, image used for worker node - volumes - optional, a list of volumes to attach to worker node - service_account - optional, a service account (has to exist) to run worker node - image_pull_secret - optional, secret to pull worker node image from registry - environment - optional, environment variables for worker pod - annotations - optional, annotations for worker node - labels - optional, labels for worker node - image_pull_policy - optional, worker node pull image policy. 
Default IfNotPresent - """ - - def __init__( - self, - group_name: str, - compute_template: str, - image: str, - max_replicas: int, - replicas: int = 1, - min_replicas: int = 0, - ray_start_params: dict[str, str] = DEFAULT_WORKER_START_PARAMS, - volumes: list[BaseVolume] = None, - service_account: str = None, - image_pull_secret: str = None, - environment: EnvironmentVariables = None, - annotations: dict[str, str] = None, - labels: dict[str, str] = None, - image_pull_policy: str = None, - ): - """ - Initialization - :param group_name: name - :param compute_template: compute template - :param replicas: number of replicas - :param min_replicas: min number of replicas - :param max_replicas: max number of replicas - :param ray_start_params: ray start parameters - :param image: image name - :param volumes: volumes - :param service_account: service account - :param image_pull_secret: image pull secret - :param environment: environment - :param annotations: annotations - :param labels: labels - :param image_pull_policy: image pull policy - """ - # Validate replicas - if min_replicas > replicas: - raise RuntimeError(f"min_replicas {min_replicas} is can't be greater then replicas {replicas} ") - if replicas > max_replicas: - raise RuntimeError(f"replicas {replicas} is can't be greater then max_replicas {max_replicas} ") - - self.group_name = group_name - self.compute_template = compute_template - self.replicas = replicas - self.min_replicas = min_replicas - self.max_replicas = max_replicas - self.ray_start_params = ray_start_params - self.ray_start_params.update(DEFAULT_WORKER_START_PARAMS) - self.image = image - self.volumes = volumes - self.service_account = service_account - self.image_pull_secret = image_pull_secret - self.environment = environment - self.annotations = annotations - self.labels = labels - self.image_pull_policy = image_pull_policy - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of worker node spec - """ - val = ( - f"group_name = {self.group_name}, compute template = {self.compute_template}, " - f"replicas = {self.replicas}, min_replicas = {self.min_replicas}, " - f"max_replicas = {self.max_replicas}, ray start params = {str(self.ray_start_params)}" - ) - if self.image is not None: - val += f", image = {self.image}" - if self.service_account is not None: - val += f", service_account = {self.service_account}" - if self.image_pull_secret is not None: - val += f", image_pull_secret = {self.image_pull_secret}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - if self.annotations is not None: - val = val + f",\n annotations = {str(self.annotations)}" - if self.labels is not None: - val = val + f",\n labels = {str(self.labels)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of worker node spec - """ - dct = { - "groupName": self.group_name, - "computeTemplate": self.compute_template, - "replicas": self.replicas, - "minReplicas": self.min_replicas, - "maxReplicas": self.max_replicas, - "rayStartParams": self.ray_start_params, - } - if self.image is not None: - dct["image"] = self.image - if self.service_account 
is not None: - dct["service_account"] = self.service_account - if self.image_pull_secret is not None: - dct["imagePullSecret"] = self.image_pull_secret - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["environment"] = self.environment.to_dict() - if self.annotations is not None: - dct["annotations"] = self.annotations - if self.labels is not None: - dct["labels"] = self.labels - return dct - - -""" - Creates new worker node from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def worker_node_spec_decoder(dct: dict[str, Any]) -> WorkerNodeSpec: - """ - Create worker node spec from dictionary - :param dct: dictionary definition of worker node spec - :return: worker node spec - """ - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("environment")) > 0: - environments = environment_variables_decoder(dct.get("environment")) - return WorkerNodeSpec( - group_name=dct.get("groupName"), - compute_template=dct.get("computeTemplate"), - replicas=dct.get("replicas", 0), - min_replicas=dct.get("minReplicas", 0), - max_replicas=dct.get("maxReplicas", 0), - ray_start_params=dct.get("rayStartParams"), - image=dct.get("image"), - volumes=volumes, - service_account=dct.get("service_account", None), - image_pull_secret=dct.get("imagePullSecret", None), - image_pull_policy=dct.get("imagePullPolicy", None), - environment=environments, - annotations=dct.get("annotations", None), - labels=dct.get("labels", None), - ) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md deleted file mode 100644 index 4943a0b06..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Workflow Utils for KFP v1 - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## PipelinesUtils - -This class provides some higher level functionality based on the capabilities of the python KFP client, including" -* get_experiment_by_name obtains KFP experiment object based on its name -* get_pipeline_by_name obtains KFP pipeline object based on its name -* start_pipeline start a pipeline represented by pipeline object in experiment represented by experiment object and a -dictionary of parameters. It returns kfp run ID -* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. 
There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. -* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py deleted file mode 100644 index 39d4d9e64..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py +++ /dev/null @@ -1,527 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import re -import sys -import time -from typing import Any - -from data_processing.data_access import DataAccess, DataAccessFactory -from data_processing.utils import ParamsUtils, get_logger -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( - DEFAULT_HEAD_START_PARAMS, - DEFAULT_WORKER_START_PARAMS, - Cluster, - ClusterSpec, - HeadNodeSpec, - RayJobRequest, - Template, - WorkerNodeSpec, - environment_variables_decoder, - volume_decoder, -) -from kfp_support.workflow_support.runtime_utils import KFPUtils -from ray.job_submission import JobStatus - - -logger = get_logger(__name__) - - -class RayRemoteJobs: - """ - class supporting Ray remote jobs - """ - - ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - default_image: str = "rayproject/ray:2.9.3-py310", - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initialization - :param server_url: API server URL. 
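The four high-level methods called out above are typically chained as create, submit, follow, delete; a rough sketch follows, in which the cluster name, namespace, node sizing, job parameters and import path are all assumptions.

```python
# Rough sketch of the create -> submit -> follow -> delete flow described above.
# Cluster name, namespace, node sizing, the request dictionary and the import
# path are placeholders/assumptions.
from kfp_support.workflow_support.runtime_utils import RayRemoteJobs

remote_jobs = RayRemoteJobs()   # defaults to the in-cluster KubeRay API server URL

status, error = remote_jobs.create_ray_cluster(
    name="demo-cluster",          # underscores are not allowed in the name
    namespace="kubeflow",
    head_node={"cpu": 2, "memory": 8, "image": "rayproject/ray:2.9.3-py310"},
    worker_nodes=[{"cpu": 2, "memory": 8, "max_replicas": 2,
                   "image": "rayproject/ray:2.9.3-py310"}],
    wait_cluster_ready=600,
)
if status == 200:
    status, error, submission_id = remote_jobs.submit_job(
        name="demo-cluster",
        namespace="kubeflow",
        request={"data_max_files": -1},   # transform-specific execution parameters
    )
    if status == 200:
        remote_jobs.follow_execution(
            name="demo-cluster", namespace="kubeflow", submission_id=submission_id
        )
    remote_jobs.delete_ray_cluster(name="demo-cluster", namespace="kubeflow")
```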
Default value is assuming running inside the cluster - :param default_image - default Ray image - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.api_server_client = KubeRayAPIs( - server_url=server_url, http_retries=http_retries, wait_interval=wait_interval - ) - self.default_image = default_image - - def create_ray_cluster( - self, - name: str, - namespace: str, - head_node: dict[str, Any], - worker_nodes: list[dict[str, Any]], - wait_cluster_ready: int = -1, - ) -> tuple[int, str]: - """ - Create Ray cluster - :param name: name, _ are not allowed in the name - :param namespace: namespace - :param head_node: head node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for head node - service_account - service account to use (has to be created) - environment - dictionary of head node environment - annotations: dictionary of head node annotation - labels: dictionary of head node labels - image_pull_policy: image pull policy, default IfNotPresent - - :param worker_nodes: an array of worker node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - max_replicas - max replicas for this worker group - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - replicas - number of replicas to create for this group (default 1) - min_replicas - min number of replicas for this group (default 0) - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for this group - service_account - service account to use (has to be created) - environment - dictionary of node of this group environment - annotations: dictionary of node of this group annotation - labels: dictionary of node of this group labels - image_pull_policy: image pull policy, default IfNotPresent - - :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # start with templates - # head_node - cpus = head_node.get("cpu", 1) - memory = head_node.get("memory", 1) - gpus = head_node.get("gpu", 0) - accelerator = head_node.get("gpu_accelerator", None) - head_node_template_name = f"{name}-head-template" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) - head_template = Template( - name=head_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(head_template) - if status != 200: - return status, error - worker_template_names = [""] * len(worker_nodes) - index = 0 - # For every worker group - for worker_node in worker_nodes: - cpus = worker_node.get("cpu", 1) - memory = worker_node.get("memory", 1) - gpus = worker_node.get("gpu", 0) - accelerator = worker_node.get("gpu_accelerator", None) - worker_node_template_name = f"{name}-worker-template-{index}" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) - worker_template = Template( - name=worker_node_template_name, - 
namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(worker_template) - if status != 200: - return status, error - worker_template_names[index] = worker_node_template_name - index += 1 - # Build head node spec - image = head_node.get("image", self.default_image) - image_pull_secret = head_node.get("image_pull_secret", None) - image_pull_policy = head_node.get("image_pull_policy", None) - ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) - volumes_dict = head_node.get("volumes", None) - service_account = head_node.get("service_account", None) - environment_dict = head_node.get("environment", None) - annotations = head_node.get("annotations", None) - labels = head_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - head_node_spec = HeadNodeSpec( - compute_template=head_node_template_name, - image=image, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - image_pull_policy=image_pull_policy, - ) - # build worker nodes - worker_groups = [] - index = 0 - for worker_node in worker_nodes: - max_replicas = worker_node.get("max_replicas", 1) - replicas = worker_node.get("replicas", 1) - min_replicas = worker_node.get("min_replicas", 0) - image = worker_node.get("image", self.default_image) - image_pull_secret = worker_node.get("image_pull_secret", None) - image_pull_policy = head_node.get("image_pull_policy", None) - ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) - volumes_dict = worker_node.get("volumes", None) - service_account = worker_node.get("service_account", None) - environment_dict = worker_node.get("environment", None) - annotations = worker_node.get("annotations", None) - labels = worker_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - worker_groups.append( - WorkerNodeSpec( - group_name=f"worker-group-{index}", - compute_template=worker_template_names[index], - image=image, - max_replicas=max_replicas, - replicas=replicas, - min_replicas=min_replicas, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - image_pull_policy=image_pull_policy, - ) - ) - index += 1 - # Build cluster spec - cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) - # Build cluster - cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) - status, error = self.api_server_client.create_cluster(cluster) - if status != 200: - return status, error - # Wait for cluster ready - return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) - - def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: - """ - Clean up Ray cluster and supporting template - :param name: cluster name - :param namespace: cluster 
namespace - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # delete cluster - status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) - if status != 200: - return status, error - # clean up templates - status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) - if status != 200: - return status, error - for template in template_array: - if template.name.startswith(name): - status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) - if status != 200: - return status, error - return status, error - - def submit_job( - self, - name: str, - namespace: str, - request: dict[str, Any], - runtime_env: str = None, - executor: str = "transformer_launcher.py", - ) -> tuple[int, str, str]: - """ - Submit job for execution - :param name: cluster name - :param namespace: cluster namespace - :param request: dictionary of the remote job request - :param runtime_env: runtime environment string - :param executor: python file to execute - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - submission id - """ - # Although the cluster is ready, the service web server might not be ready yet at this point. - # To ensure that it is ready, trying to get jobs info from the cluster. Even if it fails - # couple of times, its harmless - _, _, _ = self.api_server_client.list_job_info(ns=namespace, name=name) - time.sleep(5) - # Build job request - job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) - if runtime_env is not None: - job_request.runtime_env = runtime_env - return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) - - def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: - """ - Get job status - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - status - job status - """ - # get job info - status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - return status, error, "" - return status, error, info.status - - @staticmethod - def _print_log(log: str, previous_log_len: int) -> None: - """ - Prints the delta between current and previous logs - :param log: current log - :param previous_log_len: previous log length - :return: None - """ - l_to_print = log[previous_log_len:] - if len(l_to_print) > 0: - l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) - print(l_to_print) - - def follow_execution( - self, - name: str, - namespace: str, - submission_id: str, - data_access: DataAccess = None, - job_ready_timeout: int = 600, - print_timeout: int = 120, - ) -> None: - """ - Follow remote job execution - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :param data_access - data access class - :param job_ready_timeout: timeout to wait for fob to become ready - :param print_timeout: print interval - :return: None - """ - # Wait for job to start running - job_status = JobStatus.PENDING - while job_status != JobStatus.RUNNING and job_ready_timeout > 0: - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, 
submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: - break - time.sleep(self.api_server_client.wait_interval) - job_ready_timeout -= self.api_server_client.wait_interval - logger.info(f"job status is {job_status}") - if job_ready_timeout <= 0: - logger.warning("timed out waiting for job become ready, exiting") - sys.exit(1) - # While job is running print log - previous_log_len = 0 - # At this point job could succeeded, failed, stop or running. So print log regardless - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - # continue printing log, while job is running - while job_status == JobStatus.RUNNING: - time.sleep(print_timeout) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - # Print the final log and execution status - # Sleep here to avoid racing conditions - time.sleep(2) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - logger.info(f"Job completed with execution status {job_status}") - if job_status != JobStatus.SUCCEEDED: - sys.exit(1) - if data_access is None: - return - # Here data access is either S3 or lakehouse both of which contain self.output_folder - try: - output_folder = data_access.get_output_folder() - except Exception as e: - logger.warning(f"failed to get output folder {e}") - return - output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" - execution_log_path = f"{output_folder}execution.log" - logger.info(f"saving execution log to {execution_log_path}") - data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) - - -def _execute_remote_job( - name: str, - ns: str, - script: str, - params: dict[str, Any], - data_access_params: dict[str, Any], - additional_params: dict[str, Any], - remote_jobs: RayRemoteJobs, -) -> None: - """ - Execute remote job on Ray cluster - :param name: cluster name - :param ns: execution/cluster namespace - :param additional_params: additional parameters for the job - :param data_access_params: data access parameters - :param params: job execution parameters (specific for a specific transform, - generated by the transform workflow) - :param script: script to run (has to be present in the image) - :param remote_jobs: remote jobs execution support class - :return: - """ - - status, error, submission = remote_jobs.submit_job(name=name, namespace=ns, request=params, executor=script) - if status != 200: - logger.error(f"Failed to submit job - status: {status}, error: {error}") - exit(1) - - logger.info(f"submitted job successfully, submission id {submission}") - # create data access - data_factory = DataAccessFactory() - data_factory.apply_input_params(args=data_access_params) - data_access = data_factory.create_data_access() - # print execution log - remote_jobs.follow_execution( - name=name, - namespace=ns, - 
submission_id=submission, - data_access=data_access, - print_timeout=additional_params.get("wait_print_tmout", 120), - job_ready_timeout=additional_params.get("wait_job_ready_tmout", 600), - ) - - -def execute_ray_jobs( - name: str, # name of Ray cluster - additional_params: dict[str, Any], - e_params: dict[str, Any], - exec_script_name: str, - server_url: str, -) -> None: - """ - Execute Ray jobs on a cluster periodically printing execution log. Completes when all Ray job complete. - All of the jobs will be executed, although some of the jobs may fail. - :param name: cluster name - :param additional_params: additional parameters for the job - :param e_params: job execution parameters (specific for a specific transform, - generated by the transform workflow) - :param exec_script_name: script to run (has to be present in the image) - :param server_url: API server url - :return: None - """ - # prepare for execution - ns = KFPUtils.get_namespace() - if ns == "": - logger.warning(f"Failed to get namespace") - sys.exit(1) - # create remote jobs class - remote_jobs = RayRemoteJobs( - server_url=server_url, - http_retries=additional_params.get("http_retries", 5), - wait_interval=additional_params.get("wait_interval", 2), - ) - # find config parameter - config = ParamsUtils.get_config_parameter(e_params) - if config is None: - exit(1) - # get config value - config_value = KFPUtils.load_from_json(e_params[config].replace("'", '"')) - s3_creds = KFPUtils.load_from_json(e_params["data_s3_cred"].replace("'", '"')) - if type(config_value) is not list: - # single request - return _execute_remote_job( - name=name, - ns=ns, - script=exec_script_name, - data_access_params={config: config_value, "data_s3_cred": s3_creds}, - params=e_params, - additional_params=additional_params, - remote_jobs=remote_jobs, - ) - # remove config key from the dictionary - launch_params = dict(e_params) - del launch_params[config] - # Loop through all configuration - n_launches = 0 - for conf in config_value: - # populate individual config and launch - launch_params[config] = ParamsUtils.convert_to_ast(d=conf) - try: - _execute_remote_job( - name=name, - ns=ns, - script=exec_script_name, - data_access_params={config: conf, "data_s3_cred": s3_creds}, - params=launch_params, - additional_params=additional_params, - remote_jobs=remote_jobs, - ) - n_launches += 1 - except SystemExit: - logger.warning(f"Failed to execute job for configuration {conf}") - continue - - if n_launches == 0: - logger.warning("All executions failed") - sys.exit(1) - else: - logger.info(f"{n_launches} ot of {len(config_value)} succeeded") diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md deleted file mode 100644 index 472c39136..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# Workflow Utils for KFPv2 - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* 
dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. -* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py deleted file mode 100644 index 9297ede66..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.components_utils.component import ( - CompileComponentUtils -) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py deleted file mode 100644 index adaa971c1..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/comp_utils/component.py +++ /dev/null @@ -1,54 +0,0 @@ -import kfp.dsl as dsl -from kfp import kubernetes -from typing import Dict - -RUN_NAME = "KFP_RUN_NAME" - -class CompileComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - @staticmethod - def add_settings_to_component( - task: dsl.PipelineTask, - timeout: int, - image_pull_policy: str = "IfNotPresent", - cache_strategy: bool = False, - ) -> None: - """ - Add settings to kfp task - :param task: kfp task - :param timeout: timeout to set to the component in seconds - :param image_pull_policy: pull policy to set to the component - :param cache_strategy: cache strategy - """ - - kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, - 
field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # Set cashing - task.set_caching_options(enable_caching=cache_strategy) - # image pull policy - kubernetes.set_image_pull_policy(task, image_pull_policy) - # Set the timeout for the task to one day (in seconds) - kubernetes.set_timeout(task, seconds=timeout) - - @staticmethod - def set_s3_env_vars_to_component( - task: dsl.PipelineTask, - secret: str = '', - env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, - prefix: str = None, - ) -> None: - """ - Set S3 env variables to KFP component - :param task: kfp task - :param secret: secret name with the S3 credentials - :param env2key: dict with mapping each env variable to a key in the secret - :param prefix: prefix to add to env name - """ - - if prefix is not None: - for env_name, _ in env2key.items(): - env2key[prefix + "_" + env_name] = env2key.pop(env_name) - kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py deleted file mode 100644 index 3a6ab1263..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from kfp_support.workflow_support.runtime_utils.workflow_utils import ( - KFPUtils, - RayRemoteJobs, - ComponentUtils, - ONE_HOUR_SEC, - ONE_DAY_SEC, - ONE_WEEK_SEC, -) diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py b/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py deleted file mode 100644 index 7328c740d..000000000 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support_v2/utils/workflow_utils.py +++ /dev/null @@ -1,557 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
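A minimal sketch of wiring the `CompileComponentUtils` helpers shown above into a KFP v2 pipeline at compile time; the no-op component, its image and the import path are assumptions.

```python
# Sketch: applying the CompileComponentUtils helpers to a KFP v2 task at compile time.
# The no-op component, its image and the import path are assumptions.
import kfp.dsl as dsl
from kfp_support.workflow_support.comp_utils import CompileComponentUtils

ONE_DAY_SEC = 24 * 60 * 60

@dsl.container_component
def noop_component():
    return dsl.ContainerSpec(image="busybox", command=["echo"], args=["hello"])

@dsl.pipeline(name="settings-demo")
def settings_demo():
    task = noop_component()
    # timeout, image pull policy, caching and the KFP_RUN_NAME env variable
    CompileComponentUtils.add_settings_to_component(task, timeout=ONE_DAY_SEC)
    # maps keys of the (assumed) "s3-secret" Kubernetes secret to S3_KEY/S3_SECRET/ENDPOINT
    CompileComponentUtils.set_s3_env_vars_to_component(task, secret="s3-secret")
```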
-################################################################################ - -import datetime -import json -import os -import re -import sys -import time -from typing import Any, Optional - -from data_processing.data_access import DataAccess -from data_processing.utils import get_logger -import kfp_server_api -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( - DEFAULT_HEAD_START_PARAMS, - DEFAULT_WORKER_START_PARAMS, - Cluster, - ClusterSpec, - HeadNodeSpec, - RayJobRequest, - Template, - WorkerNodeSpec, - environment_variables_decoder, - volume_decoder, -) -from ray.job_submission import JobStatus - -logger = get_logger(__name__) - -ONE_HOUR_SEC = 60 * 60 -ONE_DAY_SEC = ONE_HOUR_SEC * 24 -ONE_WEEK_SEC = ONE_DAY_SEC * 7 - -class KFPUtils: - """ - Helper utilities for KFP implementations - """ - - @staticmethod - def credentials( - access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" - ) -> tuple[str, str, str]: - """ - Get credentials from the environment - :param access_key: environment variable for access key - :param secret_key: environment variable for secret key - :param endpoint: environment variable for S3 endpoint - :return: - """ - s3_key = os.getenv(access_key, None) - s3_secret = os.getenv(secret_key, None) - s3_endpoint = os.getenv(endpoint, None) - if s3_key is None or s3_secret is None or s3_endpoint is None: - logger.warning("Failed to load s3 credentials") - return s3_key, s3_secret, s3_endpoint - - @staticmethod - def get_namespace() -> str: - """ - Get k8 namespace that we are running it - :return: - """ - ns = "" - try: - file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") - except Exception as e: - logger.warning( - f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" - ) - else: - with file: - ns = file.read() - return ns - - @staticmethod - def runtime_name(ray_name: str = "", run_id: str = "") -> str: - """ - Get unique runtime name - :param ray_name: - :param run_id: - :return: runtime name - """ - # K8s objects cannot contain special characters, except '_', All characters should be in lower case. - if ray_name != "": - ray_name = ray_name.replace("_", "-").lower() - pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. - ray_name = re.sub(pattern, "", ray_name) - else: - ray_name = "a" - # the return value plus namespace name will be the name of the Ray Route, - # which length is restricted to 64 characters, - # therefore we restrict the return name by 15 character. 
- if run_id != "": - return f"{ray_name[:9]}-{run_id[:5]}" - return ray_name[:15] - - @staticmethod - def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: - res = f"python {executor} " - for key, value in d.items(): - if isinstance(value, str): - res += f'--{key}="{value}" ' - else: - res += f"--{key}={value} " - return res - - # Load a string that represents a json to python dictionary - @staticmethod - def load_from_json(js: str) -> dict[str, Any]: - try: - return json.loads(js) - except Exception as e: - logger.warning(f"Failed to load parameters {js} with error {e}") - sys.exit(1) - -class RayRemoteJobs: - """ - class supporting Ray remote jobs - """ - - ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - default_image: str = "rayproject/ray:2.9.3-py310", - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initialization - :param server_url: API server URL. Default value is assuming running inside the cluster - :param default_image - default Ray image - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.api_server_client = KubeRayAPIs( - server_url=server_url, http_retries=http_retries, wait_interval=wait_interval - ) - self.default_image = default_image - - def create_ray_cluster( - self, - name: str, - namespace: str, - head_node: dict[str, Any], - worker_nodes: list[dict[str, Any]], - wait_cluster_ready: int = -1, - ) -> tuple[int, str]: - """ - Create Ray cluster - :param name: name, _ are not allowed in the name - :param namespace: namespace - :param head_node: head node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for head node - service_account - service account to use (has to be created) - environment - dictionary of head node environment - annotations: dictionary of head node annotation - labels: dictionary of head node labels - - :param worker_nodes: an array of worker node specification dictionary including the following: - mandatory fields: - cpu - number of cpus - memory memory size (GB) - image - image to use - max_replicas - max replicas for this worker group - optional fields: - gpu - number of gpus - gpu_accelerator - gpu accelerator to use - replicas - number of replicas to create for this group (default 1) - min_replicas - min number of replicas for this group (default 0) - image_pull_secret - image pull secret - ray_start_params - dictionary of ray start parameters - volumes - list of volumes for this group - service_account - service account to use (has to be created) - environment - dictionary of node of this group environment - annotations: dictionary of node of this group annotation - labels: dictionary of node of this group labels - :param wait_cluster_ready - time to wait for cluster ready sec (-1 forever) - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # start with templates - # head_node - cpus = head_node.get("cpu", 1) - memory = head_node.get("memory", 1) - gpus = head_node.get("gpu", 0) - accelerator = head_node.get("gpu_accelerator", None) - head_node_template_name = f"{name}-head-template" - 
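To make the generated Ray entrypoint concrete, a small worked example for `KFPUtils.dict_to_req` as defined above; the parameter names and the import path are illustrative assumptions.

```python
# Worked example for KFPUtils.dict_to_req as defined above; parameter names are
# illustrative and the import path is assumed.
from kfp_support.workflow_support.runtime_utils import KFPUtils

params = {"data_s3_config": "{'input_folder': 'in', 'output_folder': 'out'}", "num_workers": 5}
entrypoint = KFPUtils.dict_to_req(d=params, executor="transformer_launcher.py")
# entrypoint ==
#   python transformer_launcher.py --data_s3_config="{'input_folder': 'in', 'output_folder': 'out'}" --num_workers=5
```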
_, _ = self.api_server_client.delete_compute_template(ns=namespace, name=head_node_template_name) - head_template = Template( - name=head_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(head_template) - if status != 200: - return status, error - worker_template_names = [""] * len(worker_nodes) - index = 0 - # For every worker group - for worker_node in worker_nodes: - cpus = worker_node.get("cpu", 1) - memory = worker_node.get("memory", 1) - gpus = worker_node.get("gpu", 0) - accelerator = worker_node.get("gpu_accelerator", None) - worker_node_template_name = f"{name}-worker-template-{index}" - _, _ = self.api_server_client.delete_compute_template(ns=namespace, name=worker_node_template_name) - worker_template = Template( - name=worker_node_template_name, - namespace=namespace, - cpu=cpus, - memory=memory, - gpu=gpus, - gpu_accelerator=accelerator, - ) - status, error = self.api_server_client.create_compute_template(worker_template) - if status != 200: - return status, error - worker_template_names[index] = worker_node_template_name - index += 1 - # Build head node spec - image = head_node.get("image", self.default_image) - image_pull_secret = head_node.get("image_pull_secret", None) - ray_start_params = head_node.get("ray_start_params", DEFAULT_HEAD_START_PARAMS) - volumes_dict = head_node.get("volumes", None) - service_account = head_node.get("service_account", None) - environment_dict = head_node.get("environment", None) - annotations = head_node.get("annotations", None) - labels = head_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - head_node_spec = HeadNodeSpec( - compute_template=head_node_template_name, - image=image, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - labels=labels, - ) - # build worker nodes - worker_groups = [] - index = 0 - for worker_node in worker_nodes: - max_replicas = worker_node.get("max_replicas", 1) - replicas = worker_node.get("replicas", 1) - min_replicas = worker_node.get("min_replicas", 0) - image = worker_node.get("image", self.default_image) - image_pull_secret = worker_node.get("image_pull_secret", None) - ray_start_params = worker_node.get("ray_start_params", DEFAULT_WORKER_START_PARAMS) - volumes_dict = worker_node.get("volumes", None) - service_account = worker_node.get("service_account", None) - environment_dict = worker_node.get("environment", None) - annotations = worker_node.get("annotations", None) - labels = worker_node.get("labels", None) - if volumes_dict is None: - volumes = None - else: - volumes = [volume_decoder(v) for v in volumes_dict] - if environment_dict is None: - environment = None - else: - environment = environment_variables_decoder(environment_dict) - worker_groups.append( - WorkerNodeSpec( - group_name=f"worker-group-{index}", - compute_template=worker_template_names[index], - image=image, - max_replicas=max_replicas, - replicas=replicas, - min_replicas=min_replicas, - ray_start_params=ray_start_params, - volumes=volumes, - service_account=service_account, - image_pull_secret=image_pull_secret, - environment=environment, - annotations=annotations, - 
labels=labels, - ) - ) - index += 1 - # Build cluster spec - cluster_spec = ClusterSpec(head_node=head_node_spec, worker_groups=worker_groups) - # Build cluster - cluster = Cluster(name=name, namespace=namespace, user="dataprep", version="2.9.3", cluster_spec=cluster_spec) - status, error = self.api_server_client.create_cluster(cluster) - if status != 200: - return status, error - # Wait for cluster ready - return self.api_server_client.wait_cluster_ready(name=name, ns=namespace, wait=wait_cluster_ready) - - def delete_ray_cluster(self, name: str, namespace: str) -> tuple[int, str]: - """ - Clean up Ray cluster and supporting template - :param name: cluster name - :param namespace: cluster namespace - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - # delete cluster - status, error = self.api_server_client.delete_cluster(ns=namespace, name=name) - if status != 200: - return status, error - # clean up templates - status, error, template_array = self.api_server_client.list_compute_templates_namespace(ns=namespace) - if status != 200: - return status, error - for template in template_array: - if template.name.startswith(name): - status, error = self.api_server_client.delete_compute_template(ns=namespace, name=template.name) - if status != 200: - return status, error - return status, error - - def submit_job( - self, - name: str, - namespace: str, - request: dict[str, Any], - runtime_env: str = None, - executor: str = "transformer_launcher.py", - ) -> tuple[int, str, str]: - """ - Submit job for execution - :param name: cluster name - :param namespace: cluster namespace - :param request: dictionary of the remote job request - :param runtime_env: runtime environment string - :param executor: python file to execute - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - submission id - """ - # Build job request - job_request = RayJobRequest(entrypoint=KFPUtils.dict_to_req(d=request, executor=executor)) - if runtime_env is not None: - job_request.runtime_env = runtime_env - return self.api_server_client.submit_job(ns=namespace, name=name, job_request=job_request) - - def _get_job_status(self, name: str, namespace: str, submission_id: str) -> tuple[int, str, str]: - """ - Get job status - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :return:tuple containing - http return code - message - only returned if http return code is not equal to 200 - status - job status - """ - # get job info - status, error, info = self.api_server_client.get_job_info(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - return status, error, "" - return status, error, info.status - - @staticmethod - def _print_log(log: str, previous_log_len: int) -> None: - """ - Prints the delta between current and previous logs - :param log: current log - :param previous_log_len: previous log length - :return: None - """ - l_to_print = log[previous_log_len:] - if len(l_to_print) > 0: - l_to_print = RayRemoteJobs.ansi_escape.sub("", l_to_print) - print(l_to_print) - - def follow_execution( - self, - name: str, - namespace: str, - submission_id: str, - data_access: DataAccess = None, - job_ready_timeout: int = 600, - print_timeout: int = 120, - ) -> None: - """ - Follow remote job execution - :param name: cluster name - :param namespace: cluster namespace - :param submission_id: job submission ID - :param data_access - 
data access class - :param job_ready_timeout: timeout to wait for fob to become ready - :param print_timeout: print interval - :return: None - """ - # Wait for job to start running - job_status = JobStatus.PENDING - while job_status != JobStatus.RUNNING and job_ready_timeout > 0: - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - if job_status in {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.RUNNING}: - break - time.sleep(self.api_server_client.wait_interval) - job_ready_timeout -= self.api_server_client.wait_interval - logger.info(f"job status is {job_status}") - if job_ready_timeout <= 0: - logger.warning("timed out waiting for job become ready, exiting") - sys.exit(1) - # While job is running print log - previous_log_len = 0 - # At this point job could succeeded, failed, stop or running. So print log regardless - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - # continue printing log, while job is running - while job_status == JobStatus.RUNNING: - time.sleep(print_timeout) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - previous_log_len = len(log) - status, error, job_status = self._get_job_status( - name=name, namespace=namespace, submission_id=submission_id - ) - if status // 100 != 2: - sys.exit(1) - # Print the final log and execution status - # Sleep here to avoid racing conditions - time.sleep(2) - status, error, log = self.api_server_client.get_job_log(ns=namespace, name=name, sid=submission_id) - if status // 100 != 2: - sys.exit(1) - self._print_log(log=log, previous_log_len=previous_log_len) - logger.info(f"Job completed with execution status {status}") - if data_access is None: - return - # Here data access is either S3 or lakehouse both of which contain self.output_folder - try: - output_folder = data_access.output_folder - except Exception as e: - logger.warning(f"failed to get output folder {e}") - return - output_folder = output_folder if output_folder.endswith("/") else output_folder + "/" - execution_log_path = f"{output_folder}execution.log" - logger.info(f"saving execution log to {execution_log_path}") - data_access.save_file(path=execution_log_path, data=bytes(log, "UTF-8")) - - -class ComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - # @staticmethod - # def add_settings_to_component( - # task: dsl.PipelineTask, - # timeout: int, - # image_pull_policy: str = "IfNotPresent", - # cache_strategy: bool = False, - # ) -> None: - # """ - # Add settings to kfp task - # :param task: kfp task - # :param timeout: timeout to set to the component in seconds - # :param image_pull_policy: pull policy to set to the component - # :param cache_strategy: cache strategy - # """ - # - # kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # # Set cashing - # task.set_caching_options(enable_caching=cache_strategy) - # # image pull policy - # kubernetes.set_image_pull_policy(task, image_pull_policy) - # # Set the timeout for the task to one day (in seconds) - # kubernetes.set_timeout(task, seconds=timeout) - - 
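The commented-out helper above bundles four per-task settings for KFP v2; below is a minimal sketch of applying the same kfp-kubernetes calls directly inside a pipeline. The component body, pipeline name and one-day timeout value are illustrative assumptions.

import kfp.dsl as dsl
from kfp import kubernetes

ONE_DAY_SEC = 60 * 60 * 24


@dsl.component(base_image="python:3.10")
def echo(msg: str) -> str:
    print(msg)
    return msg


@dsl.pipeline(name="task-settings-demo")
def task_settings_demo(msg: str = "hello"):
    task = echo(msg=msg)
    # Expose the KFP run name to the container, as the helper above does.
    kubernetes.use_field_path_as_env(
        task,
        env_name="KFP_RUN_NAME",
        field_path="metadata.annotations['pipelines.kubeflow.org/run_name']",
    )
    # Disable caching, pin the pull policy and cap the task run time.
    task.set_caching_options(enable_caching=False)
    kubernetes.set_image_pull_policy(task, "IfNotPresent")
    kubernetes.set_timeout(task, seconds=ONE_DAY_SEC)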
- @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / a_options.get("memory", 1)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index dec823e4b..c9ba1c16e 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -9,11 +9,16 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - +import os import sys -from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs - +kfp_v2 = os.getenv("KFP_v2", 0) +if kfp_v2 == 1: + from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + print(f"Load KFPv2 libs") +else: + from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + print(f"Load KFPv1 libs") def start_ray_cluster( name: str, # name of Ray cluster diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index 85fbf8dde..724945fa3 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -10,10 +10,16 @@ # limitations under the License. 
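The actor-count heuristic in default_compute_execution_params() above is easiest to follow with numbers plugged in; the snippet below is a standalone rework of the same arithmetic with illustrative worker and actor options, not the packaged component.

# 3 workers of 8 CPU / 32 GB each; actors requesting 0.5 CPU / 1 GB each.
worker_options = {"replicas": 3, "cpu": 8, "memory": 32}
actor_options = {"num_cpus": 0.5, "memory": 1}

cluster_cpu = worker_options["replicas"] * worker_options["cpu"]             # 24
cluster_mem = worker_options["replicas"] * worker_options["memory"]          # 96
n_actors_cpu = int(cluster_cpu * 0.85 / actor_options.get("num_cpus", 0.5))  # 40
n_actors_memory = int(cluster_mem * 0.85 / actor_options.get("memory", 1))   # 81
n_actors = min(n_actors_cpu, n_actors_memory)                                # 40
print(str(n_actors))  # the component returns this value as a string, here "40"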
################################################################################ +import os import sys -from kfp_support.workflow_support.runtime_utils import KFPUtils, RayRemoteJobs - +kfp_v2 = os.getenv("KFP_v2", 0) +if kfp_v2 == 1: + from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + print(f"Load KFPv2 libs") +else: + from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + print(f"Load KFPv1 libs") # Cleans and shutdowns the Ray cluster def cleanup_ray_cluster( diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index 8fe53667f..62c252400 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -10,7 +10,15 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.runtime_utils import KFPUtils, execute_ray_jobs +import os + +kfp_v2 = os.getenv("KFP_v2", 0) +if kfp_v2 == 1: + from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + print(f"Load KFPv2 libs") +else: + from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + print(f"Load KFPv1 libs") if __name__ == "__main__": diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 7cb3cacb8..dac66f778 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -10,7 +10,15 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.runtime_utils import KFPUtils, execute_ray_jobs +import os + +kfp_v2 = os.getenv("KFP_v2", 0) +if kfp_v2 == 1: + from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + print(f"Load KFPv2 libs") +else: + from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + print(f"Load KFPv1 libs") if __name__ == "__main__": diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index 78d703a26..2e9616562 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -1,8 +1,15 @@ +import os import sys -from data_processing.utils.params_utils import ParamsUtils -from kfp_support.workflow_support.runtime_utils import KFPUtils, PipelinesUtils +kfp_v2 = os.getenv("KFP_v2", 0) +if kfp_v2 == 1: + from kfp_v1_workflow_support.utils import KFPUtils, PipelinesUtils + print(f"Load KFPv2 libs") +else: + from kfp_v1_workflow_support.utils import KFPUtils, PipelinesUtils + print(f"Load KFPv1 libs") +from data_processing.utils import ParamsUtils def invoke_sub_workflow( name: str, # workflow name diff --git a/kfp/kfp_support_lib/Makefile b/kfp/kfp_support_lib/Makefile_old similarity index 100% rename from kfp/kfp_support_lib/Makefile rename to kfp/kfp_support_lib/Makefile_old diff --git a/kfp/kfp_support_lib/README.md b/kfp/kfp_support_lib/README.md index 86f3f4360..440fc16c3 100644 --- a/kfp/kfp_support_lib/README.md +++ b/kfp/kfp_support_lib/README.md @@ -1,10 +1,13 @@ # KFP support library This provides support for implementing KFP pipelines automating transform's execution. 
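Note that the KFP_v2 guards added in the components above compare the string returned by os.getenv() with an integer and import kfp_v1_workflow_support on both branches. A sketch of the apparent intent, assuming the variable is set to "1" to select v2 and that the v2 package exposes the same utilities (the v2 module path below is an assumption, not taken from this patch):

import os

if os.getenv("KFP_v2", "0") == "1":
    # Hypothetical v2 path; the exact module layout may differ.
    from kfp_v2_workflow_support.utils import KFPUtils, RayRemoteJobs
    print("Loaded KFP v2 support libs")
else:
    from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs
    print("Loaded KFP v1 support libs")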
-It comprises 2 main modules +It comprises 3 main modules -* [api server client](src/kfp_support/api_server_client/README.md) -* [workflow support](src/kfp_support/workflow_support/README.md) +* [api server client](python_apiserver_client/README.md) +* [kfp_v1_workflow_support](kfp_v1_workflow_support//README.md) +* [kfp_v2_workflow_support](kfp_v2_workflow_support//README.md) + +Depends on the using KFV version either `kfp_v1_workflow_support` or `kfp_v2_workflow_support` should be used. ## Development diff --git a/kfp/kfp_support_lib/doc/kfp_support_library.md b/kfp/kfp_support_lib/doc/kfp_support_library.md index 0ae5e9d1c..fc571eb81 100644 --- a/kfp/kfp_support_lib/doc/kfp_support_library.md +++ b/kfp/kfp_support_lib/doc/kfp_support_library.md @@ -2,7 +2,7 @@ This library is aimed to simplify transform pipelines implementations and consists of 2 main parts: -* [API Server Client](../src/kfp_support/api_server_client/README.md) +* [API Server Client](../python_apiserver_client/README.md) * [workflow support](../src/kfp_support/workflow_support/README.md) See also how this library is used for [kfp components](../../kfp_ray_components/README.md) implementation diff --git a/kfp/kfp_support_lib_v2/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile similarity index 82% rename from kfp/kfp_support_lib_v2/Makefile rename to kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 60fd51f15..5516a1df4 100644 --- a/kfp/kfp_support_lib_v2/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -1,12 +1,12 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. -REPOROOT=../.. +REPOROOT=../../.. include ${REPOROOT}/.make.versions include ${REPOROOT}/kfp/requirements.env # Include the common rules. # Use "make help" to see them. -include ../../.make.defaults +include ${REPOROOT}/.make.defaults # Command to run pytest PYTHON_VERSION=$(shell $(PYTHON) --version) @@ -15,7 +15,7 @@ VENV_ACTIVATE=venv/bin/activate DEPLOY_KUBEFLOW ?= 0 clean:: - @# Help: Clean up the distribution build and the venv + @# Help: Clean up the distribution build and the venv rm -r dist venv || true rm -rf src/*egg-info || true rm -rf *.back || true @@ -28,17 +28,16 @@ update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml - sed -i.back 's/kfp==[0-9].*/kfp==${KFP}",/' pyproject.toml + sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v1}",/' pyproject.toml build:: update-toml venv - @# Help: Build the distribution for publishing to a pypi + @# Help: Build the distribution for publishing to a pypi rm -r dist || true rm -rf src/*egg-info || true ${PYTHON} -m pip install --upgrade build ${PYTHON} -m build publish:: .check-env -publish:: @# Help: Publish the wheel to testpypi if [ -d "dist"]; then rm -r dist; fi ${PYTHON} -m pip install --upgrade build @@ -46,18 +45,18 @@ publish:: ${PYTHON} -m twine upload --verbose --non-interactive dist/* venv:: pyproject.toml .check-env - @# Help: Create the virtual environment using pyproject.toml + @# Help: Create the virtual environment using pyproject.toml rm -rf venv $(PYTHON) -m venv venv . 
${VENV_ACTIVATE}; \ pip install -e .; \ pip install ray==${RAY} \ - pip install pytest pytest-cov + pip install pytest pytest-cov test:: venv @# Help: Use the already-built virtual environment to run pytest on the test directory. - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; ifeq ($(DEPLOY_KUBEFLOW),1) . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; endif diff --git a/kfp/kfp_support_lib/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml similarity index 94% rename from kfp/kfp_support_lib/pyproject.toml rename to kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index dcd6af36d..06a4aab9b 100644 --- a/kfp/kfp_support_lib/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,7 +1,7 @@ [project] -name = "data_prep_toolkit_kfp" +name = "data_prep_toolkit_kfp_v1" version = "0.2.0" -requires-python = ">=3.10" +requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py new file mode 100644 index 000000000..8536bacd6 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py @@ -0,0 +1,4 @@ +from kfp_support.utils import KFPUtils +from kfp_support.utils.pipeline_utils import PipelinesUtils +from kfp_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC +from kfp_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/components_utils.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/components_utils.py diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/kfp_utils.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/kfp_utils.py diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipeline_utils.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipeline_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipeline_utils.py diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipelines_tests_utils.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/pipelines_tests_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipelines_tests_utils.py diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py 
b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/remote_jobs_utils.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/remote_jobs_utils.py index 40b26c7a1..e3cef883d 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/remote_jobs_utils.py @@ -18,7 +18,7 @@ from data_processing.data_access import DataAccess, DataAccessFactory from data_processing.utils import ParamsUtils, get_logger from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( +from kfp.kfp_support_lib.python_apiserver_client.src.python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, Cluster, diff --git a/kfp/kfp_support_lib/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py similarity index 95% rename from kfp/kfp_support_lib/test/pipeline_utils_test.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py index f0bfd9189..449dbd79d 100644 --- a/kfp/kfp_support_lib/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py @@ -10,7 +10,7 @@ # limitations under the License. ################################################################################ -from kfp_support.workflow_support.utils import PipelinesUtils +from kfp_support.utils import PipelinesUtils def test_pipelines(): diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile new file mode 100644 index 000000000..135e29514 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -0,0 +1,62 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../.. +include ${REPOROOT}/.make.versions +include ${REPOROOT}/kfp/requirements.env + +# Include the common rules. +# Use "make help" to see them. +include ${REPOROOT}/.make.defaults + +# Command to run pytest +PYTHON_VERSION=$(shell $(PYTHON) --version) +VENV_ACTIVATE=venv/bin/activate + +DEPLOY_KUBEFLOW ?= 0 + +clean:: + @# Help: Clean up the distribution build and the venv + rm -r dist venv || true + rm -rf src/*egg-info || true + rm -rf *.back || true + + +.check-env:: .check_python_version + @echo "Checks passed" + +update-toml:: .check-env + @# Help: Copy the Makefile distribution version into the pyproject.toml + sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION_v2}'"/' pyproject.toml + sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v2}",/' pyproject.toml + +build:: update-toml venv + @# Help: Build the distribution for publishing to a pypi + rm -r dist || true + rm -rf src/*egg-info || true + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m build + +publish:: .check-env + @# Help: Publish the wheel to testpypi + if [ -d "dist"]; then rm -r dist; fi + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m twine check dist/* + ${PYTHON} -m twine upload --verbose --non-interactive dist/* + +venv:: pyproject.toml .check-env + @# Help: Create the virtual environment using pyproject.toml + rm -rf venv + $(PYTHON) -m venv venv + . 
${VENV_ACTIVATE}; \ + pip install -e .; \ + pip install ray==${RAY} \ + pip install pytest pytest-cov + +test:: venv + @# Help: Use the already-built virtual environment to run pytest on the test directory. +ifeq ($(DEPLOY_KUBEFLOW),1) + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; +endif diff --git a/kfp/kfp_support_lib_v2/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml similarity index 87% rename from kfp/kfp_support_lib_v2/pyproject.toml rename to kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index f995d60d7..4238e0417 100644 --- a/kfp/kfp_support_lib_v2/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,8 +1,8 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.1.1" -requires-python = ">=3.10" -description = "Data Preparation Kit Library. KFP v2 support" +version = "0.2.0" +requires-python = ">=3.10,<3.12" +description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ @@ -12,8 +12,7 @@ authors = [ { name = "Revital Eres", email = "eres@il.ibm.com" }, ] dependencies = [ - "kfp==2.7.0", - "kfp-kubernetes==1.2.0", + "kfp==2.2.0", "requests", "data-prep-toolkit==0.1.1", ] diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/__init__.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/__init__.py diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/component.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/component.py diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/__init__.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/__init__.py diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/kfp_utils.py similarity index 100% rename from kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/kfp_utils.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/kfp_utils.py diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/remote_jobs_utils.py similarity index 99% rename from kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/remote_jobs_utils.py index 39d4d9e64..c7e7cbe45 100644 --- 
a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/remote_jobs_utils.py @@ -18,7 +18,7 @@ from data_processing.data_access import DataAccess, DataAccessFactory from data_processing.utils import ParamsUtils, get_logger from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( +from kfp.kfp_support_lib.python_apiserver_client.src.python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, Cluster, diff --git a/kfp/kfp_support_lib/python_apiserver_client/.gitignore b/kfp/kfp_support_lib/python_apiserver_client/.gitignore new file mode 100644 index 000000000..3ff12a7a8 --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/.gitignore @@ -0,0 +1,32 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +htmlcov +.coverage +.cache +nosetests.xml +coverage.xml \ No newline at end of file diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile new file mode 100644 index 000000000..70c9365f6 --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -0,0 +1,53 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../.. +include ${REPOROOT}/.make.versions +include ${REPOROOT}/kfp/requirements.env + +# Include the common rules. +# Use "make help" to see them. +include ../../../.make.defaults + +# Command to run pytest +PYTHON_VERSION=$(shell $(PYTHON) --version) +VENV_ACTIVATE=venv/bin/activate + +DEPLOY_KUBEFLOW ?= 0 + +clean:: + @# Help: Clean up the distribution build and the venv + rm -r dist venv || true + rm -rf src/*egg-info || true + rm -rf *.back || true + + +.check-env:: .check_python_version + @echo "Checks passed" + +build:: .check-env venv + @# Help: Build the distribution for publishing to a pypi + rm -r dist || true + rm -rf src/*egg-info || true + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m build + +publish:: .check-env + @# Help: Publish the wheel to testpypi + if [ -d "dist"]; then rm -r dist; fi + ${PYTHON} -m pip install --upgrade build + ${PYTHON} -m twine check dist/* + ${PYTHON} -m twine upload --verbose --non-interactive dist/* + +venv::pyproject.toml .check-env + @# Help: Create the virtual environment using pyproject.toml + rm -rf venv + $(PYTHON) -m venv venv + . ${VENV_ACTIVATE}; \ + pip install --upgrade pip; + pip install ray==${RAY}; \ + pip install -e .; \ + pip install pytest pytest-cov + +test:: venv + @# Help: Use the already-built virtual environment to run pytest on the test directory. + . 
${VENV_ACTIVATE}; export PYTHONPATH=../src; pip list | grep python_apiserver_client ; cd test; $(PYTEST) api_params_test.py; diff --git a/kfp/kfp_support_lib/python_apiserver_client/README.md b/kfp/kfp_support_lib/python_apiserver_client/README.md new file mode 100644 index 000000000..de489adcd --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/README.md @@ -0,0 +1,4 @@ +# KubeRay API server APIs + +This is a copy of [Kuberay API server-client python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) +Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml new file mode 100644 index 000000000..ea992c823 --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" +[options] +package_dir = ["src"] +[project] +name = "python_apiserver_client" +version = "0.0.1" +dependencies = [ + "requests", + "kubernetes", + "data-prep-toolkit==0.1.1", +] +authors = [ + { name="KubeRay project"}, +] +description = "A Kuberay python client library to manage clusters based on the KubeRay API server" +readme = {file = "README.md", content-type = "text/markdown"} +license = {text = "Apache-2.0"} +requires-python = ">=3.10" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: Apache License 2.0", + "Operating System :: OS Independent", +] + +[project.urls] +"Homepage" = "https://github.com/ray-project/kuberay" \ No newline at end of file diff --git a/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py new file mode 100644 index 000000000..e6cdbec9a --- /dev/null +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/__init__.py @@ -0,0 +1 @@ +from python_apiserver_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/kuberay_apis.py similarity index 99% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/kuberay_apis.py index 270815e77..9051e7c73 100644 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/kuberay_apis.py @@ -14,7 +14,7 @@ import requests from data_processing.utils import get_logger -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( Cluster, RayJobInfo, RayJobRequest, diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/__init__.py similarity index 65% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/__init__.py index e5a7d70fa..207f961a9 100644 --- 
a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/__init__.py @@ -1,4 +1,4 @@ -from kfp_support.api_server_client.params.templates import ( +from python_apiserver_client.params.templates import ( TolerationOperation, TolerationEffect, Toleration, @@ -7,7 +7,7 @@ template_decoder, templates_decoder, ) -from kfp_support.api_server_client.params.volumes import ( +from python_apiserver_client.params.volumes import ( HostPath, MountPropagationMode, AccessMode, @@ -20,25 +20,25 @@ SecretVolume, volume_decoder, ) -from kfp_support.api_server_client.params.environmentvariables import ( +from python_apiserver_client.params.environmentvariables import ( EnvVarSource, EnvVarFrom, EnvironmentVariables, env_var_from_decoder, environment_variables_decoder, ) -from kfp_support.api_server_client.params.headnode import ( +from python_apiserver_client.params.headnode import ( ServiceType, HeadNodeSpec, DEFAULT_HEAD_START_PARAMS, head_node_spec_decoder, ) -from kfp_support.api_server_client.params.workernode import ( +from python_apiserver_client.params.workernode import ( WorkerNodeSpec, DEFAULT_WORKER_START_PARAMS, worker_node_spec_decoder, ) -from kfp_support.api_server_client.params.cluster import ( +from python_apiserver_client.params.cluster import ( Environment, AutoscalerOptions, ClusterSpec, @@ -50,4 +50,4 @@ cluster_decoder, clusters_decoder, ) -from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo +from python_apiserver_client.params.jobsubmission import RayJobRequest, RayJobInfo \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/cluster.py similarity index 99% rename from kfp/kfp_support_lib/src/kfp_support/api_server_client/params/cluster.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/cluster.py index 922a14bef..5e1ee4867 100644 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/cluster.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/cluster.py @@ -13,7 +13,7 @@ import enum from typing import Any -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( BaseVolume, EnvironmentVariables, HeadNodeSpec, diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/environmentvariables.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/environmentvariables.py diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/headnode.py similarity index 99% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/headnode.py index 7a9d4120f..37c2e2572 100644 --- a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py +++ 
b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/headnode.py @@ -13,7 +13,7 @@ import enum from typing import Any -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( BaseVolume, EnvironmentVariables, environment_variables_decoder, diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/jobsubmission.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/jobsubmission.py diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/templates.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/templates.py diff --git a/kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/volumes.py similarity index 100% rename from kfp/kfp_ray_components/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/volumes.py diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/workernode.py similarity index 99% rename from kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py rename to kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/workernode.py index ddcf193cc..3a9f8e439 100644 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/workernode.py +++ b/kfp/kfp_support_lib/python_apiserver_client/src/python_apiserver_client/params/workernode.py @@ -12,7 +12,7 @@ from typing import Any -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( BaseVolume, EnvironmentVariables, environment_variables_decoder, diff --git a/kfp/kfp_support_lib_v2/test/api_params_test.py b/kfp/kfp_support_lib/python_apiserver_client/test/api_params_test.py similarity index 99% rename from kfp/kfp_support_lib_v2/test/api_params_test.py rename to kfp/kfp_support_lib/python_apiserver_client/test/api_params_test.py index 804c84aad..53740c939 100644 --- a/kfp/kfp_support_lib_v2/test/api_params_test.py +++ b/kfp/kfp_support_lib/python_apiserver_client/test/api_params_test.py @@ -12,7 +12,7 @@ import json -from kfp_support.api_server_client.params import ( +from python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, AccessMode, diff --git a/kfp/kfp_support_lib/test/configmaps.py b/kfp/kfp_support_lib/python_apiserver_client/test/configmaps.py similarity index 100% rename from kfp/kfp_support_lib/test/configmaps.py rename to kfp/kfp_support_lib/python_apiserver_client/test/configmaps.py diff --git a/kfp/kfp_support_lib/test/kuberay_api_test.py b/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py similarity index 98% rename from 
kfp/kfp_support_lib/test/kuberay_api_test.py rename to kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py index b2a444ce3..ad0c2b766 100644 --- a/kfp/kfp_support_lib/test/kuberay_api_test.py +++ b/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py @@ -12,9 +12,9 @@ import time -from configmaps import ConfigmapsManager -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( +from python_apiserver_client.test.configmaps import ConfigmapsManager +from python_apiserver_client.src.api_server_client import KubeRayAPIs +from python_apiserver_client.src.api_server_client.params import ( DEFAULT_WORKER_START_PARAMS, Cluster, ClusterSpec, diff --git a/kfp/kfp_support_lib/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/python_apiserver_client/test/ray_remote_jobs_test.py similarity index 94% rename from kfp/kfp_support_lib/test/ray_remote_jobs_test.py rename to kfp/kfp_support_lib/python_apiserver_client/test/ray_remote_jobs_test.py index 5ae76a5f5..2e8588d7e 100644 --- a/kfp/kfp_support_lib/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/python_apiserver_client/test/ray_remote_jobs_test.py @@ -10,8 +10,8 @@ # limitations under the License. ################################################################################ -from configmaps import ConfigmapsManager -from kfp_support.api_server_client.params import ConfigMapVolume +from kfp.kfp_support_lib.python_apiserver_client.test.configmaps import ConfigmapsManager +from kfp.kfp_support_lib.python_apiserver_client.src.python_apiserver_client.params import ConfigMapVolume from kfp_support.workflow_support.utils import RayRemoteJobs diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md b/kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md deleted file mode 100644 index 423f743a1..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# KubeRay API server APIs - -This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) -Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py deleted file mode 100644 index 60cbbc2f2..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py deleted file mode 100644 index 270815e77..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/kuberay_apis.py +++ /dev/null @@ -1,636 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
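The renames above fold the old kfp_support.api_server_client package into the standalone python_apiserver_client library; a brief before/after of the import surface, with a couple of representative symbols:

# Old layout (removed by this patch):
#   from kfp_support.api_server_client import KubeRayAPIs
#   from kfp_support.api_server_client.params import Cluster, Template
# New standalone package:
from python_apiserver_client import KubeRayAPIs
from python_apiserver_client.params import Cluster, Template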
-################################################################################ - -import time - -import requests -from data_processing.utils import get_logger -from kfp_support.api_server_client.params import ( - Cluster, - RayJobInfo, - RayJobRequest, - Template, - cluster_decoder, - clusters_decoder, - template_decoder, - templates_decoder, -) - - -logger = get_logger(__name__) - - -_headers = {"Content-Type": "application/json", "accept": "application/json"} - -CONNECT_TIMEOUT = 50 -READ_TIMEOUT = 50 -TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) - - -class KubeRayAPIs: - """ - This class implements KubeRay APIs based on the API server. - To create a class, the following parameters are required: - base - the URL of the API server (default is set to the standalone API server) - wait interval - the amount of sec to wait between checking for cluster ready - """ - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - token: str = None, - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initializer - :param server_url: API server url - default assuming running it inside the cluster - :param token: token, only used for API server with security enabled - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.server_url = server_url - if token is not None: - _headers["Authorization"] = token - self.wait_interval = wait_interval - self.api_base = "/apis/v1/" - self.http_retries = http_retries - - def list_compute_templates(self) -> tuple[int, str, list[Template]]: - """ - List compute templates across all namespaces of the k8 cluster - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + "compute_templates" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) - else: - logger.warning(f"Failed to list compute templates, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list compute templates, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Template]]: - """ - List compute templates across for a given namespaces of the k8 cluster - :param ns: namespace to query - :return: return tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) - else: - logger.warning( - f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list compute templates for namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) 
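A minimal sketch of exercising the client above against a KubeRay API server, using the relocated package name introduced by this patch; the namespace value is an illustrative assumption.

from python_apiserver_client import KubeRayAPIs

apis = KubeRayAPIs(
    server_url="http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
    http_retries=5,
    wait_interval=2,
)
status, message, templates = apis.list_compute_templates_namespace(ns="default")
if status != 200:
    print(f"listing compute templates failed ({status}): {message}")
else:
    print(f"found {len(templates)} compute templates in namespace 'default'")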
- return status, message, None - - def get_compute_template(self, ns: str, name: str) -> tuple[int, str, Template]: - """ - get a compute template - :param ns: namespace - :param name: template name - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, template_decoder(response.json()) - else: - logger.warning( - f"Failed to get compute template {name} for namespace {ns}, status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get compute template {name} for namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def create_compute_template(self, template: Template) -> tuple[int, str]: - """ - Create a compute template - :param template - definition of a template - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{template.namespace}/compute_templates" - for i in range(self.http_retries): - try: - response = requests.post(url, json=template.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning(f"Failed to create compute template, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to create compute template, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def delete_compute_template(self, ns: str, name: str) -> tuple[int, str]: - """ - delete a compute template - :param ns: namespace - :param name: template name - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - elif response.status_code == 404: - # not found - no need to retry - return response.status_code, response.json()["message"] - else: - logger.warning(f"Failed to delete compute template, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to delete compute template, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def list_clusters(self) -> tuple[int, str, list[Cluster]]: - """ - List clusters across all namespaces of the k8 cluster - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of clusters - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + "clusters" - for i in 
range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, clusters_decoder(response.json()) - else: - logger.warning(f"Failed to list cluster, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list cluster, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_clusters_namespace(self, ns: str) -> tuple[int, str, list[Cluster]]: - """ - List clusters across for a given namespaces of the k8 cluster - :param ns: namespace to query - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of clusters - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, clusters_decoder(response.json()) - else: - logger.warning(f"Failed to list clusters in namespace {ns}, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list clusters in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def get_cluster(self, ns: str, name: str) -> tuple[int, str, Cluster]: - """ - get cluster - :param ns: namespace - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - clusters definition - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, cluster_decoder(response.json()) - else: - logger.warning(f"Failed to get cluster {name} in namespace {ns}, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def create_cluster(self, cluster: Cluster) -> tuple[int, str]: - """ - create cluster - :param cluster: cluster definition - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{cluster.namespace}/clusters" - for i in range(self.http_retries): - try: - response = requests.post(url, json=cluster.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning(f"Failed to create cluster , status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to create cluster , exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def get_cluster_status(self, ns: str, name: 
str) -> tuple[int, str, str]: - """ - get cluster status - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - cluster status - """ - # Execute HTTP request - status, error, cluster = self.get_cluster(ns=ns, name=name) - # Check execution status - if status // 100 != 2: - return status, error, None - cluster_status = "creating" - if cluster.cluster_status is not None: - cluster_status = cluster.cluster_status - return status, None, cluster_status - - def wait_cluster_ready(self, ns: str, name: str, wait: int = -1) -> tuple[int, str]: - """ - wait for cluster to be ready - :param ns: namespace of the cluster - :param name: name of the cluster - :param wait: wait time (-1 waits forever) - :returns: A tuple containing - http return code - message - only returned if http return code is not equal to 200 - cluster status - """ - current_wait = 0 - while True: - status, error, c_status = self.get_cluster_status(ns=ns, name=name) - # Check execution status - if status // 100 != 2: - return status, error - if c_status == "ready": - return status, None - if current_wait > wait > 0: - return 408, f"Timed out waiting for cluster ready in {current_wait} sec" - time.sleep(self.wait_interval) - current_wait += self.wait_interval - - def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int, str, str]: - """ - get cluster endpoint - :param ns: namespace of the cluster - :param name: name of the cluster - :param wait: wait time (-1 waits forever) for cluster to be ready - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - endpoint (service for dashboard endpoint) - """ - # Ensure that the cluster is ready - status, error = self.wait_cluster_ready(ns=ns, name=name, wait=wait) - if status // 100 != 2: - return status, error, None - # Get cluster - status, error, cluster = self.get_cluster(ns=ns, name=name) - if status // 100 != 2: - return status, error, None - return status, None, f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" - - def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: - """ - delete cluster - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers) - if response.status_code // 100 == 2: - return response.status_code, None - elif response.status_code == 404: - # not found - no need to retry - return response.status_code, response.json()["message"] - else: - logger.warning(f"Failed to delete cluster , status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to delete cluster , exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def submit_job(self, ns: str, name: str, job_request: RayJobRequest) -> tuple[int, str, str]: - """ - submit Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param job_request: job submission - :return: a tuple containing - http return code - message - only returned if 
http return code is not equal to 200 - submission id - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" - for i in range(self.http_retries): - try: - response = requests.post(url, json=job_request.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, response.json()["submissionId"] - else: - logger.warning( - f"Failed to submit job to the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to submit job to the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(5) - return status, message, None - - def get_job_info(self, ns: str, name: str, sid: str) -> tuple[int, str, RayJobInfo]: - """ - get Ray job details - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - RayJobInfo object - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, RayJobInfo(response.json()) - else: - logger.warning( - f"Failed to get job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]]: - """ - list Ray job details - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of RayJobInfo object - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - job_info_array = response.json().get("submissions", None) - if job_info_array is None: - return response.status_code, None, [] - else: - return response.status_code, None, [RayJobInfo(i) for i in job_info_array] - else: - logger.warning( - f"Failed to list jobs from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list jobs from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(5) - return status, message, [] - - def get_job_log(self, ns: str, name: str, sid: str) -> tuple[int, str, str]: - """ - get Ray job log - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code 
is not equal to 200 - log - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/log/{sid}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, response.json().get("log", "") - else: - logger.warning( - f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning( - f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, exception : {e}" - ) - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def stop_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: - """ - stop Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.post(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning( - f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def delete_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: - """ - delete Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning( - f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py deleted file mode 100644 index e5a7d70fa..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -from kfp_support.api_server_client.params.templates import ( - TolerationOperation, - TolerationEffect, - Toleration, - Template, - toleration_decoder, - template_decoder, - templates_decoder, -) -from 
kfp_support.api_server_client.params.volumes import ( - HostPath, - MountPropagationMode, - AccessMode, - BaseVolume, - HostPathVolume, - PVCVolume, - EphemeralVolume, - EmptyDirVolume, - ConfigMapVolume, - SecretVolume, - volume_decoder, -) -from kfp_support.api_server_client.params.environmentvariables import ( - EnvVarSource, - EnvVarFrom, - EnvironmentVariables, - env_var_from_decoder, - environment_variables_decoder, -) -from kfp_support.api_server_client.params.headnode import ( - ServiceType, - HeadNodeSpec, - DEFAULT_HEAD_START_PARAMS, - head_node_spec_decoder, -) -from kfp_support.api_server_client.params.workernode import ( - WorkerNodeSpec, - DEFAULT_WORKER_START_PARAMS, - worker_node_spec_decoder, -) -from kfp_support.api_server_client.params.cluster import ( - Environment, - AutoscalerOptions, - ClusterSpec, - ClusterEvent, - Cluster, - UpscalingMode, - autoscaling_decoder, - cluster_spec_decoder, - cluster_decoder, - clusters_decoder, -) -from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/environmentvariables.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/environmentvariables.py deleted file mode 100644 index d1056f6f6..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/environmentvariables.py +++ /dev/null @@ -1,158 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class EnvVarSource(enum.Enum): - """ - Enumeration of environment sources - """ - - CONFIGMAP = 0 # config map - SECRET = 1 # secret - RESOURCE_FIELD = 2 # resource field - FIELD = 3 # field - - -class EnvVarFrom: - """ - EnvVarFrom is used to define an environment variable from one of the sources (EnvarSource). - It provides APIs to create, stringify, convert to dict and json. 
- - Methods: - - Create env variable from: gets the following parameters: - Source required - source of environment variable - name required name for config map or secret, container name for resource, path for field - key required Key for config map or secret, resource name for resource - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, source: EnvVarSource, name: str, key: str): - """ - Initialize - :param source - source - :param name source name - :param key source key - """ - self.source = source - self.name = name - self.key = key - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of environment from - """ - return f"source = {self.source.name}, name = {self.name}, key = {self.key}" - - def to_dict(self) -> dict[str, Any]: - """ - convert to dictionary - :return: dictionary representation of environment from - """ - return {"source": self.source.value, "name": self.name, "key": self.key} - - -class EnvironmentVariables: - """ - EnvironmentVariables is used to define environment variables. - It provides APIs to create, stringify, convert to dict and json. - - Methods: - - Create env variable from: gets the following parameters: - key_value - optional, dictionary of key/value environment variables - from_ref - optional, dictionary of reference environment variables - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, key_value: dict[str, str] = None, from_ref: dict[str, EnvVarFrom] = None): - """ - Initialization - :param key_value: dictionary of key/value pairs for environment variables - :param from_ref: dictionary of key/value pairs for environment from variables - """ - self.key_val = key_value - self.from_ref = from_ref - - def to_string(self) -> str: - """ - convert to string - :return: string representation of environment variables - """ - val = "" - if self.key_val is not None: - val = f"values = {str(self.key_val)}" - if self.from_ref is not None: - if val != "": - val += " , " - val += "valuesFrom = {" - first = True - for k, v in self.from_ref.items(): - if not first: - val += ", " - else: - first = False - val += f"{k} = [{v.to_string()}]" - val += "}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of environment variables - """ - dst = {} - if self.key_val is not None: - dst["values"] = self.key_val - if self.from_ref is not None: - fr = {} - for k, v in self.from_ref.items(): - fr[k] = v.to_dict() - dst["valuesFrom"] = fr - return dst - - -""" - Creates new environment variable from from dictionary, used for unmarshalling json. 
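For reference, a minimal usage sketch of the environment-variable helpers being removed here, as they existed before this deletion (the secret name and key are hypothetical; the decoder it checks against is defined just below):

from kfp_support.api_server_client.params import (
    EnvironmentVariables,
    EnvVarFrom,
    EnvVarSource,
    environment_variables_decoder,
)

# Plain key/value variables plus one variable sourced from a (hypothetical) Kubernetes secret.
env = EnvironmentVariables(
    key_value={"RAY_LOG_LEVEL": "INFO"},
    from_ref={"AWS_SECRET_ACCESS_KEY": EnvVarFrom(source=EnvVarSource.SECRET, name="s3-secret", key="secretKey")},
)

# to_dict() produces the wire format; the decoder rebuilds an equivalent object from it.
assert environment_variables_decoder(env.to_dict()).to_string() == env.to_string()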
Python does not - support multiple constructors, so do it this way -""" - - -def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: - """ - Create environment from from dictionary - :param dct: dictionary representations of environment from - :return: environment from - """ - return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) - - -def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: - """ - Create environment variables from from dictionary - :param dct: dictionary representations of environment variables - :return: environment variables - """ - keyvalues = None - fr = None - if "values" in dct: - keyvalues = dct.get("values") - if "valuesFrom" in dct: - from_ref = dct.get("valuesFrom") - fr = {} - for k, v in from_ref.items(): - fr[k] = env_var_from_decoder(v) - return EnvironmentVariables(key_value=keyvalues, from_ref=fr) diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py deleted file mode 100644 index 7a9d4120f..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/headnode.py +++ /dev/null @@ -1,202 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - environment_variables_decoder, - volume_decoder, -) - - -DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} - - -class ServiceType(enum.Enum): - """ - Enumeration of head node service types - """ - - ClusterIP = "ClusterIP" # cluster IP - NodePort = "NodePort" # node port - LoadBalancer = "LoadBalancer" # load balancer - - -class HeadNodeSpec: - """ - HeadNodeSpec is used to define Ray cluster head node configuration. - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create head node specification: gets the following parameters: - compute_template - required, the computeTemplate of head node group - ray_start_params - required, Ray start parameters - image - optional, image used for head node - service_type - optional (ServiceType), service type foe headnode - enable_ingress - optional, allow to enable ingress for dashboard - volumes - optional, a list of volumes to attach to head node - service_account - optional, a service account (has to exist) to run head node - image_pull_secret - optional, secret to pull head node image from registry - environment - optional, environment variables for head pod - annotations - optional, annotations for head node - labels - optional, labels for head node - image_pull_policy - optional, head node pull image policy. 
Default IfNotPresent - """ - - def __init__( - self, - compute_template: str, - image: str, - ray_start_params: dict[str, str] = DEFAULT_HEAD_START_PARAMS, - service_type: ServiceType = ServiceType.ClusterIP, - enable_ingress: bool = False, - volumes: list[BaseVolume] = None, - service_account: str = None, - image_pull_secret: str = None, - environment: EnvironmentVariables = None, - annotations: dict[str, str] = None, - labels: dict[str, str] = None, - image_pull_policy: str = None, - ): - """ - Initialization - :param compute_template: compute template - :param ray_start_params: ray start parameters - :param image: node image - :param service_type: service type - :param enable_ingress: enable ingress flag - :param volumes: volumes for head node - :param service_account: service account - :param image_pull_secret: image pull secret - :param environment: head node environment - :param annotations: head node annotation - :param labels: labels - :param image_pull_policy: image pull policy - """ - - self.compute_template = compute_template - self.ray_start_params = ray_start_params - self.ray_start_params.update(DEFAULT_HEAD_START_PARAMS) - self.image = image - self.service_type = service_type - self.enable_ingress = enable_ingress - self.volumes = volumes - self.service_account = service_account - self.image_pull_secret = image_pull_secret - self.environment = environment - self.annotations = annotations - self.labels = labels - self.image_pull_policy = image_pull_policy - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of the head node - """ - val = f"compute template = {self.compute_template}, ray start params = {str(self.ray_start_params)}" - if self.image is not None: - val += f", image = {self.image}" - if self.service_type is not None: - val += f", service_type = {self.service_type.name}" - if self.enable_ingress: - val += ", enable_ingress = True" - if self.service_account is not None: - val += f", service_account = {self.service_account}" - if self.image_pull_secret is not None: - val += f", image_pull_secret = {self.image_pull_secret}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - if self.annotations is not None: - val = val + f",\n annotations = {str(self.annotations)}" - if self.labels is not None: - val = val + f",\n labels = {str(self.labels)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of the head node - """ - dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} - if self.image is not None: - dct["image"] = self.image - if self.service_type is not None: - dct["serviceType"] = self.service_type.value - if self.enable_ingress: - dct["enableIngress"] = True - if self.service_account is not None: - dct["service_account"] = self.service_account - if self.image_pull_secret is not None: - dct["image_pull_secret"] = self.image_pull_secret - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["environment"] = 
self.environment.to_dict() - if self.annotations is not None: - dct["annotations"] = self.annotations - if self.labels is not None: - dct["labels"] = self.labels - return dct - - -""" - Creates new head node from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def head_node_spec_decoder(dct: dict[str, Any]) -> HeadNodeSpec: - """ - Create head node spec from dictionary - :param dct: dictionary representation of head node spec - :return: Head node spec - """ - service_type = None - if "serviceType" in dct: - service_type = ServiceType(dct.get("serviceType", "ClusterIP")) - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("environment")) > 0: - environments = environment_variables_decoder(dct.get("environment")) - return HeadNodeSpec( - compute_template=dct.get("computeTemplate"), - ray_start_params=dct.get("rayStartParams"), - image=dct.get("image"), - service_type=service_type, - enable_ingress=dct.get("enableIngress", False), - volumes=volumes, - service_account=dct.get("service_account", None), - image_pull_secret=dct.get("imagePullSecret", None), - image_pull_policy=dct.get("imagePullPolicy", None), - environment=environments, - annotations=dct.get("annotations", None), - labels=dct.get("labels", None), - ) diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/jobsubmission.py deleted file mode 100644 index a0b2bfcb0..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/jobsubmission.py +++ /dev/null @@ -1,163 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import datetime -from typing import Any - - -class RayJobRequest: - """ - RayJobRequest used to define job to be submitted to a Ray cluster - It provides APIs to create, stringify and convert to dict. 
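Before the job-submission parameters that follow, a minimal sketch of the head-node spec removed just above (the compute template and image names are hypothetical):

from kfp_support.api_server_client.params import HeadNodeSpec, ServiceType

head = HeadNodeSpec(
    compute_template="default-template",   # hypothetical, must name an existing compute template
    image="rayproject/ray:2.9.3-py310",    # hypothetical head-node image
    service_type=ServiceType.ClusterIP,
    image_pull_policy="IfNotPresent",
)
# to_dict() yields the fragment that a cluster creation request embeds when posted to the API server.
print(head.to_dict())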
- - Methods: - - Create RayJobRequest: gets the following parameters: - entrypoint - required, the command to start a job on the cluster - submission_id - optional, submission id for the job submission - runtime_env - optional, yaml string specifying job runtime environment - metadata - optional, dictionary of the submission metadata - num_cpus - optional, number of cpus for job execution - num_gpus - optional, number of gpus for job execution - resources - optional, dictionary of the resources for job execution - """ - - def __init__( - self, - entrypoint: str, - submission_id: str = None, - runtime_env: str = None, - metadata: dict[str, str] = None, - num_cpu: float = -1.0, - num_gpu: float = -1.0, - resources: dict[str, str] = None, - ): - """ - Initialization see https://docs.ray.io/en/latest/cluster/running-applications/job-submission/api.html - :param entrypoint: entrypoint - :param submission_id: submission id - :param runtime_env: runtime environment - :param metadata: submission metadata - :param num_cpu: job number cpus - :param num_gpu: job number gpus - :param resources: job custom resources - """ - self.entrypoint = entrypoint - self.submission_id = submission_id - self.runtime_env = runtime_env - self.metadata = metadata - self.num_cpu = num_cpu - self.num_gpu = num_gpu - self.resources = resources - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of job submission - """ - val = f"entrypoint = {self.entrypoint}" - if self.submission_id is not None: - val += f", submission_id = {self.submission_id}" - if self.num_cpu > 0: - val += f", num_cpu = {self.num_cpu}" - if self.num_gpu > 0: - val += f", num_gpu = {self.num_gpu}" - if self.runtime_env is not None: - val += f", runtime_env = {self.runtime_env}" - if self.metadata is not None: - val += f", metadata = {self.metadata}" - if self.resources is not None: - val += f", resources = {self.resources}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of job submission - """ - dct = {"entrypoint": self.entrypoint} - if self.submission_id is not None: - dct["submissionId"] = self.submission_id - if self.runtime_env is not None: - dct["runtimeEnv"] = self.runtime_env - if self.metadata is not None: - dct["metadata"] = self.metadata - if self.num_cpu > 0: - dct["numCpus"] = self.num_cpu - if self.num_gpu > 0: - dct["numGpus"] = self.num_gpu - if self.resources is not None: - dct["resources"] = self.resources - return dct - - -class RayJobInfo: - """ - RayJobInfo used to define information about the job in a Ray cluster - It provides APIs to create and stringify. 
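A minimal sketch of a job request built with the RayJobRequest class above (the entrypoint and runtime environment are hypothetical); its dict form is what submit_job posts to the jobsubmissions endpoint shown earlier:

from kfp_support.api_server_client.params import RayJobRequest

job = RayJobRequest(
    entrypoint="python ray_job.py",        # hypothetical entrypoint script
    runtime_env="pip:\n  - requests\n",    # runtime environment as a yaml string, per the docstring above
    num_cpu=1.0,
)
print(job.to_dict())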
Its output only data, so we do not need to implement to_dict - - Methods: - - Create RayJobRequest: gets the following parameters: - entrypoint - the command to start a job on the cluster - job_id - job execution id - submission_id - submission id for the job submission - runtime_env - job runtime environment - status - job execution status - message - status message - start_time - job start time - end-time - job ind time - error_type - type of error - metadata - optional, dictionary of the submission metadata - """ - - def __init__(self, dct: dict[str, Any]): - """ - Initialize from dictionary - :param dct: dictionary representation of Ray job info - """ - self.entrypoint = dct.get("entrypoint", "") - self.job_id = dct.get("jobId", "") - self.submission_id = dct.get("submissionId", "") - self.status = dct.get("status", "") - self.message = dct.get("message", None) - self.start_time = int(dct.get("startTime", "0")) - self.end_time = int(dct.get("endTime", "0")) - self.error_type = dct.get("ErrorType", None) - self.metadata = dct.get("Metadata", None) - self.runtime_env = dct.get("runtimeEnv", None) - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of Ray job info - """ - val = ( - f"entrypoint = {self.entrypoint}, job id {self.job_id}, submission id = {self.submission_id}," - f" status = {self.status}" - ) - if self.message is not None: - val += f" message = {self.message}" - if self.start_time > 0: - val += ( - f" start time = " - f"{datetime.datetime.fromtimestamp(self.start_time /1.e3).strftime('%Y-%m-%d %H:%M:%S')}" - ) - if self.end_time > 0: - val += ( - f" end time = " f"{datetime.datetime.fromtimestamp(self.end_time / 1e3).strftime('%Y-%m-%d %H:%M:%S')}" - ) - if self.error_type is not None: - val += f" error type = {self.error_type}" - if self.runtime_env is not None: - val += f" runtime env = {str(self.runtime_env)}" - if self.metadata is not None: - val += f" metadata = {str(self.metadata)}" - return val diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/templates.py deleted file mode 100644 index 0ef4c1583..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/templates.py +++ /dev/null @@ -1,224 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class TolerationOperation(enum.Enum): - """ - Toleration operation types - """ - - Exists = "Exists" # exists - Equal = "Equal" # equal - - -class TolerationEffect(enum.Enum): - """ - Toleration effect - """ - - NoSchedule = "NoSchedule" # not schedule - PreferNoSchedule = "PreferNoSchedule" # prefer not schedule - NoExecute = "NoExecute" # not execute - - -class Toleration: - """ - Toleration is used by compute template to pick specific nodes for placing pods. - It provides APIs to create, stringify and convert to dict. 
- - Methods: - - Create toleration: gets the following parameters: - key - required, key created by the node's taint - operator - required, operator to apply, supported operators are "Exists" and "Equal" - effect - required, toleration effect supported effects are "NoSchedule", "PreferNoSchedule", "NoExecute" - value - optional, value - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None): - """ - Initialization - :param key: key - :param operator: operator - :param effect: effect - :param value: value - """ - self.key = key - self.operator = operator - self.value = value - self.effect = effect - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of toleration - """ - val = f"key = {self.key}, operator = {self.operator.name}, effect = {self.effect.name}" - if self.value is None: - return val - else: - return val + f", value = {self.value}" - - def to_dict(self) -> dict[str, Any]: - """ - Convert to string - :return: string representation of toleration - """ - dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value} - if self.value is not None: - dct["value"] = self.value - return dct - - -# Here the default gpu-accelerator is "nvidia.com/gpu", that is used for generating limits. -# If it is specified, it has to be in the format that is understood by kubernetes as a valid -# The following devices are currently supported by kubernetes: -# AMD - gpu accelerator amd.com/gpu -# Intel - gpu accelerator gpu.intel.com/i915 -# NVIDIA - gpu accelerator nvidia.com/gpu - - -class Template: - """ - Template is used to define specific nodes configuration. - It provides APIs to create, stringify and convert to dict. 
- - Methods: - - Create templates: gets the following parameters: - name - required, template name - namespace - required, template namespace - cpus - required, template number of cpus - memory - required, template memory (GB) - gpus - optional, number of GPUs, default 0 - gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed - tolerations - optional, tolerations for pod placing, default none - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - - to_json() -> str convert to json string - """ - - def __init__( - self, - name: str, - namespace: str, - cpu: int, - memory: int, - gpu: int = 0, - gpu_accelerator: str = None, - tolerations: list[Toleration] = None, - ): - """ - Initialization - :param name: name - :param namespace: namespace - :param cpu: cpu - :param memory: memory - :param gpu: gpu - :param gpu_accelerator: accelerator type - :param tolerations: tolerations - """ - self.name = name - self.namespace = namespace - self.cpu = cpu - self.memory = memory - self.gpu = gpu - self.gpu_accelerator = gpu_accelerator - self.tolerations = tolerations - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of template - """ - val = f"name = {self.name}, namespace = {self.namespace}, cpu = {self.cpu}, memory = {self.memory}" - if self.gpu > 0: - val = val + f", gpu {self.gpu}" - if self.gpu_accelerator is not None: - val = val + f", gpu accelerator {self.gpu_accelerator}" - if self.tolerations is None: - return val - val = val + ", tolerations [" - first = True - for tol in self.tolerations: - if first: - first = False - val = val + "{" + tol.to_string() + "}" - else: - val = val + ", {" + tol.to_string() + "}" - return val + "]" - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of template - """ - dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} - if self.gpu > 0: - dct["gpu"] = self.gpu - if self.gpu_accelerator is not None: - dct["gpu accelerator"] = self.gpu_accelerator - if self.tolerations is not None: - dct["tolerations"] = [tl.to_dict() for tl in self.tolerations] - return dct - - -""" - Creates new toleration from dictionary, used for unmarshalling json. 
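For illustration, the Template and Toleration classes above round-trip through the decoder defined just below, mirroring the checks in the api_params_test.py file deleted further down (the names and sizes here are hypothetical):

from kfp_support.api_server_client.params import (
    Template,
    Toleration,
    TolerationEffect,
    TolerationOperation,
    template_decoder,
)

tol = Toleration(key="gpu-node", operator=TolerationOperation.Exists, effect=TolerationEffect.NoSchedule)
tmpl = Template(name="gpu-small", namespace="ray", cpu=4, memory=16, gpu=1, tolerations=[tol])

# The decoder rebuilds an equivalent template from the dict (JSON) form.
assert template_decoder(tmpl.to_dict()).to_string() == tmpl.to_string()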
Python does not - support multiple constructors, so do it this way -""" - - -def toleration_decoder(dct: dict[str, Any]) -> Toleration: - """ - Create toleration from dictionary - :param dct: dictionary representation of toleration - :return: toleration - """ - return Toleration( - key=dct.get("key"), - operator=TolerationOperation(dct.get("operator", "Exists")), - effect=TolerationEffect(dct.get("effect", "NoSchedule")), - value=dct.get("value"), - ) - - -def template_decoder(dct: dict[str, Any]) -> Template: - """ - Create template from dictionary - :param dct: dictionary representation of template - :return: template - """ - tolerations = None - if "tolerations" in dct: - tolerations = [toleration_decoder(d) for d in dct["tolerations"]] - return Template( - name=dct.get("name"), - namespace=dct.get("namespace"), - cpu=int(dct.get("cpu", "0")), - memory=int(dct.get("memory", "0")), - gpu=int(dct.get("gpu", "0")), - gpu_accelerator=dct.get("gpu_accelerator"), - tolerations=tolerations, - ) - - -def templates_decoder(dct: dict[str, Any]) -> list[Template]: - """ - Create list of template from dictionary - :param dct: dictionary representation of list of template - :return: list of template - """ - return [template_decoder(tmp) for tmp in dct["computeTemplates"]] diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/volumes.py deleted file mode 100644 index fee0e1ea4..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/volumes.py +++ /dev/null @@ -1,449 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class HostPath(enum.Enum): - """ - Host path enumeration - """ - - DIRECTORY = 0 # directory - FILE = 1 # files - - -class MountPropagationMode(enum.Enum): - """ - Mount propagation enumeration - """ - - NONE = 0 # None - HOSTTOCONTAINER = 1 # host to container - BIDIRECTIONAL = 2 # bi directional - - -class AccessMode(enum.Enum): - """ - Access mode enumeration - """ - - RWO = 0 # read write once - ROX = 1 # read only many - RWX = 2 # read write many - - -class BaseVolume: - """ - KubeRay currently support several types of volumes, including hostPat, PVC, - ephemeral volumes, config maps, secrets and empty dir. All of them use slightly - different parameters. Base Volume is a base class for all different volume types. - """ - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of base volume - """ - raise Exception(f"Base volume cannot be used directly. Pls use one of the derived classes") - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of base volume - """ - raise Exception(f"Base volume cannot be used directly. 
Pls use one of the derived classes") - - -class HostPathVolume(BaseVolume): - """ - This class implements HostPath volume. In addition to name and mount path it requires host - path volume specific parameters: - source - data location on host - hostPathType - host path type: directory (0) or file (1) - mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) - - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - host_path_type: HostPath = None, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param host_path_type: host path type - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.host_path_type = host_path_type - self.volume_type = 1 - self.mount_propagation = mount_propagation - - def to_string(self) -> str: - """ - Convert to string - :return: HostPathVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = hostPath" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - if self.host_path_type is not None: - val += f", host path type = {self.host_path_type.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: HostPathVolume dictionary representation - """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.mount_propagation is not None: - dst["mountPropagationMode"] = self.mount_propagation.value - if self.host_path_type is not None: - dst["hostPathType"] = self.host_path_type.value - return dst - - -class PVCVolume(BaseVolume): - """ - This class implements PVC volume. In addition to name and mount path it requires - PVC volume specific parameters: - source - PVC claim name - read_only - read only flag - mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - read_only: bool = False, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param read_only: read only - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.volume_type = 0 - self.mount_propagation = mount_propagation - self.readonly = read_only - - def to_string(self) -> str: - """ - Convert to string - :return: PVCVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = PVC" - if self.readonly: - val += ", read only = True" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: PVCVolume dictionary representation - """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.readonly: - dst["readOnly"] = True - if self.mount_propagation is not None: - dst["mountPropagationMode"] = self.mount_propagation.value - return dst - - -class EphemeralVolume(BaseVolume): - """ - This class implements Ephemeral volume. 
In addition to name and mount path it requires - Ephemeral volume specific parameters: - storage - disk size (valid k8 value, for example 5Gi) - storageClass - storage class - optional, if not specified, use default - accessMode - access mode RWO - optional ReadWriteOnce (0), ReadOnlyMAny (1), ReadWriteMany (2) - mountPropagationMode - optional mount propagation: None (0), host to container (1) or bidirectional (2) - """ - - def __init__( - self, - name: str, - mount_path: str, - storage: str, - storage_class: str = None, - access_mode: AccessMode = None, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param storage: storage - :param storage_class: storage class - :param access_mode: access mode - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.storage = storage - self.volume_type = 2 - self.mount_propagation = mount_propagation - self.storage_class = storage_class - self.access_mode = access_mode - - def to_string(self) -> str: - """ - Convert to string - :return: EphemeralVolume string representation - """ - val = ( - f"name = {self.name}, mount_path = {self.mount_path}, storage = {self.storage} " f"volume type = ephemeral" - ) - if self.storage_class is not None: - val += f", storage class = {self.storage_class}" - if self.access_mode is not None: - val += f", access mode = {self.access_mode.name}" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: EphemeralVolume dictionary representation - """ - dct = { - "name": self.name, - "mountPath": self.mount_path, - "storage": self.storage, - "volumeType": self.volume_type, - } - if self.storage_class is not None: - dct["storageClassName"] = self.storage_class - if self.access_mode is not None: - dct["accessMode"] = self.access_mode.value - if self.mount_propagation is not None: - dct["mountPropagationMode"] = self.mount_propagation.value - return dct - - -class EmptyDirVolume(BaseVolume): - """ - This class implements EmptyDir volume. In addition to name and mount path it requires - Empty Dir specific parameters: - storage - optional max storage size (valid k8 value, for example 5Gi) - """ - - def __init__(self, name: str, mount_path: str, storage: str = None): - """ - Initialization - :param name: name - :param mount_path: mount_path - :param storage: storage - """ - self.name = name - self.mount_path = mount_path - self.storage = storage - self.volume_type = 5 - - def to_string(self) -> str: - """ - Convert to string - :return: EmptyDirVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, volume type = emptyDir" - if self.storage is not None: - val += f", storage = {self.storage}" - return val - - def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} - if self.storage is not None: - dct["storage"] = self.storage - return dct - - -class ConfigMapVolume(BaseVolume): - """ - This class implements ConfigMap volume. 
In addition to name and mount path it requires - configMap volume specific parameters: - source - required, config map name - items - optional, key/path items (optional) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - items: dict[str, str] = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param items: items - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.items = items - self.volume_type = 3 - - def to_string(self) -> str: - """ - Convert to string - :return: ConfigMapVolume string representation - """ - val = ( - f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = configmap" - ) - if self.items is not None: - val = val + f", items = {str(self.items)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: ConfigMapVolume dictionary representation - """ - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.items is not None: - dct["items"] = self.items - return dct - - -class SecretVolume(BaseVolume): - """ - This class implements Secret volume. In addition to name and mount path it requires - Secret volume specific parameters: - source - required, secret name - items - optional, key/path items (optional) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - items: dict[str, str] = None, - ): - self.name = name - self.mount_path = mount_path - self.source = source - self.items = items - self.volume_type = 4 - - def to_string(self) -> str: - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = secret" - if self.items is not None: - val = val + f", items = {str(self.items)}" - return val - - def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.items is not None: - dct["items"] = self.items - return dct - - -""" - Creates new Volume from dictionary, used for unmarshalling json. 
Python does not - support multiple constructors, so do it this way -""" - - -def volume_decoder(dst: dict[str, Any]) -> BaseVolume: - def _get_mount_propagation() -> MountPropagationMode: - if "mountPropagationMode" in dst: - return MountPropagationMode(int(dst.get("mountPropagationMode", "0"))) - return None - - def _get_host_path() -> HostPath: - if "hostPathType" in dst: - return HostPath(int(dst.get("hostPathType", "0"))) - return None - - def _get_access_mode() -> AccessMode: - if "accessMode" in dst: - return AccessMode(int(dst.get("accessMode", "0"))) - return None - - match dst["volumeType"]: - case 0: - # PVC - return PVCVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - read_only=dst.get("readOnly", False), - mount_propagation=_get_mount_propagation(), - ) - case 1: - # host path - return HostPathVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - host_path_type=_get_host_path(), - mount_propagation=_get_mount_propagation(), - ) - case 2: - # Ephemeral volume - return EphemeralVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - storage=dst.get("storage", ""), - storage_class=dst.get("storageClassName"), - access_mode=_get_access_mode(), - mount_propagation=_get_mount_propagation(), - ) - case 3: - # ConfigMap Volume - return ConfigMapVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - items=dst.get("items"), - ) - case 4: - # Secret Volume - return SecretVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - items=dst.get("items"), - ) - case 5: - # Empty dir volume - return EmptyDirVolume( - name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), storage=dst.get("storage") - ) - case _: - raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py b/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py deleted file mode 100644 index ddcf193cc..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/api_server_client/params/workernode.py +++ /dev/null @@ -1,206 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - environment_variables_decoder, - volume_decoder, -) - - -DEFAULT_WORKER_START_PARAMS = {"node-ip-address": "$MY_POD_IP"} - - -class WorkerNodeSpec: - """ - WorkerNodeSpec is used to define Ray cluster worker node pool configuration. - It provides APIs to create, stringify and convert to dict. 
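As an illustration of the volume helpers removed above, a minimal sketch of a PVC volume round-trip (the claim name and mount path are hypothetical):

from kfp_support.api_server_client.params import MountPropagationMode, PVCVolume, volume_decoder

vol = PVCVolume(
    name="data",
    mount_path="/data",          # hypothetical mount point
    source="ray-data-claim",     # hypothetical PVC claim name
    read_only=True,
    mount_propagation=MountPropagationMode.NONE,
)

# volume_decoder dispatches on "volumeType" (0 = PVC) and rebuilds an equivalent volume.
assert volume_decoder(vol.to_dict()).to_string() == vol.to_string()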
- - Methods: - - Create worker node pool specification: gets the following parameters: - group_name - required, group name of the worker group - compute_template - required, the computeTemplate of worker node group - replicas - required, desired replicas of the worker group - min_replicas - required Min replicas of the worker group, can't be greater than max_replicas - max_replicas - required, max replicas of the worker group - ray_start_params - required, Ray start parameters - image - optional, image used for worker node - volumes - optional, a list of volumes to attach to worker node - service_account - optional, a service account (has to exist) to run worker node - image_pull_secret - optional, secret to pull worker node image from registry - environment - optional, environment variables for worker pod - annotations - optional, annotations for worker node - labels - optional, labels for worker node - image_pull_policy - optional, worker node pull image policy. Default IfNotPresent - """ - - def __init__( - self, - group_name: str, - compute_template: str, - image: str, - max_replicas: int, - replicas: int = 1, - min_replicas: int = 0, - ray_start_params: dict[str, str] = DEFAULT_WORKER_START_PARAMS, - volumes: list[BaseVolume] = None, - service_account: str = None, - image_pull_secret: str = None, - environment: EnvironmentVariables = None, - annotations: dict[str, str] = None, - labels: dict[str, str] = None, - image_pull_policy: str = None, - ): - """ - Initialization - :param group_name: name - :param compute_template: compute template - :param replicas: number of replicas - :param min_replicas: min number of replicas - :param max_replicas: max number of replicas - :param ray_start_params: ray start parameters - :param image: image name - :param volumes: volumes - :param service_account: service account - :param image_pull_secret: image pull secret - :param environment: environment - :param annotations: annotations - :param labels: labels - :param image_pull_policy: image pull policy - """ - # Validate replicas - if min_replicas > replicas: - raise RuntimeError(f"min_replicas {min_replicas} is can't be greater then replicas {replicas} ") - if replicas > max_replicas: - raise RuntimeError(f"replicas {replicas} is can't be greater then max_replicas {max_replicas} ") - - self.group_name = group_name - self.compute_template = compute_template - self.replicas = replicas - self.min_replicas = min_replicas - self.max_replicas = max_replicas - self.ray_start_params = ray_start_params - self.ray_start_params.update(DEFAULT_WORKER_START_PARAMS) - self.image = image - self.volumes = volumes - self.service_account = service_account - self.image_pull_secret = image_pull_secret - self.environment = environment - self.annotations = annotations - self.labels = labels - self.image_pull_policy = image_pull_policy - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of worker node spec - """ - val = ( - f"group_name = {self.group_name}, compute template = {self.compute_template}, " - f"replicas = {self.replicas}, min_replicas = {self.min_replicas}, " - f"max_replicas = {self.max_replicas}, ray start params = {str(self.ray_start_params)}" - ) - if self.image is not None: - val += f", image = {self.image}" - if self.service_account is not None: - val += f", service_account = {self.service_account}" - if self.image_pull_secret is not None: - val += f", image_pull_secret = {self.image_pull_secret}" - if self.image_pull_policy is not None: - val += f", 
image_pull_policy = {self.image_pull_policy}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - if self.annotations is not None: - val = val + f",\n annotations = {str(self.annotations)}" - if self.labels is not None: - val = val + f",\n labels = {str(self.labels)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of worker node spec - """ - dct = { - "groupName": self.group_name, - "computeTemplate": self.compute_template, - "replicas": self.replicas, - "minReplicas": self.min_replicas, - "maxReplicas": self.max_replicas, - "rayStartParams": self.ray_start_params, - } - if self.image is not None: - dct["image"] = self.image - if self.service_account is not None: - dct["service_account"] = self.service_account - if self.image_pull_secret is not None: - dct["imagePullSecret"] = self.image_pull_secret - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["environment"] = self.environment.to_dict() - if self.annotations is not None: - dct["annotations"] = self.annotations - if self.labels is not None: - dct["labels"] = self.labels - return dct - - -""" - Creates new worker node from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def worker_node_spec_decoder(dct: dict[str, Any]) -> WorkerNodeSpec: - """ - Create worker node spec from dictionary - :param dct: dictionary definition of worker node spec - :return: worker node spec - """ - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("environment")) > 0: - environments = environment_variables_decoder(dct.get("environment")) - return WorkerNodeSpec( - group_name=dct.get("groupName"), - compute_template=dct.get("computeTemplate"), - replicas=dct.get("replicas", 0), - min_replicas=dct.get("minReplicas", 0), - max_replicas=dct.get("maxReplicas", 0), - ray_start_params=dct.get("rayStartParams"), - image=dct.get("image"), - volumes=volumes, - service_account=dct.get("service_account", None), - image_pull_secret=dct.get("imagePullSecret", None), - image_pull_policy=dct.get("imagePullPolicy", None), - environment=environments, - annotations=dct.get("annotations", None), - labels=dct.get("labels", None), - ) diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md b/kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md deleted file mode 100644 index b477e9a42..000000000 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Workflow Utils - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* 
runtime_name - generates unique runtime name
-* dict_to_req - convert dictionary of request parameters to a properly formatted JSON string
-* load_from_json - convert json string to dictionary and exit with error if conversion fails
-
-## PipelinesUtils
-
-This class provides some higher-level functionality based on the capabilities of the python KFP client, including:
-* get_experiment_by_name obtains KFP experiment object based on its name
-* get_pipeline_by_name obtains KFP pipeline object based on its name
-* start_pipeline - starts a pipeline represented by a pipeline object in the experiment represented by an experiment object and a
-dictionary of parameters. It returns the kfp run ID
-* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID
-
-## RayRemoteJobs
-
-At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available,
-including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk),
-[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and
-[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few.
-We are using the KubeRay API server APIs here, but in order to simplify a possible transition to other APIs, this class
-implements 4 high-level methods that hide the specifics of the particular API. These methods are:
-* create_ray_cluster - creates Ray cluster.
-* delete_ray_cluster - deletes Ray cluster.
-* submit_job - submits Ray job to the cluster
-* follow_execution - watches job execution to completion, periodically printing out the job log
-These basic methods can be used as a foundation of any KFP pipeline implementation.
-
-## ComponentUtils
-
-This class provides some methods to simplify building pipelines:
-* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy
-* set_cos_env_vars_to_component - sets environment variables to support S3
-* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements)
\ No newline at end of file
diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py
deleted file mode 100644
index 166032380..000000000
--- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from kfp_support.workflow_support.utils.kfp_utils import KFPUtils
-from kfp_support.workflow_support.utils.pipeline_utils import PipelinesUtils
-from kfp_support.workflow_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC
-from kfp_support.workflow_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs
diff --git a/kfp/kfp_support_lib/test/api_params_test.py b/kfp/kfp_support_lib/test/api_params_test.py
deleted file mode 100644
index 804c84aad..000000000
--- a/kfp/kfp_support_lib/test/api_params_test.py
+++ /dev/null
@@ -1,433 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
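For orientation, a minimal sketch of how the four PipelinesUtils calls described in the README above fit together. The method names and the import path come from the files being removed; the constructor, the argument order and the example names (experiment, pipeline, parameter dict) are assumptions for illustration, not the library's confirmed signatures:

```python
# Illustrative sketch only: method names are from the removed README, the import
# path from the removed __init__.py; signatures and example names are assumed.
from kfp_support.workflow_support.utils import PipelinesUtils

utils = PipelinesUtils()  # assumed default constructor
experiment = utils.get_experiment_by_name("my-experiment")   # hypothetical experiment name
pipeline = utils.get_pipeline_by_name("noop-ray-pipeline")   # hypothetical pipeline name
# start a run with a dictionary of parameters; per the README it returns the KFP run ID
run_id = utils.start_pipeline(pipeline, experiment, {"ray_name": "noop-kfp"})
# block until the run with that ID completes
utils.wait_pipeline_completion(run_id)
```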
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import json - -from kfp_support.api_server_client.params import ( - DEFAULT_HEAD_START_PARAMS, - DEFAULT_WORKER_START_PARAMS, - AccessMode, - AutoscalerOptions, - Cluster, - ClusterEvent, - ClusterSpec, - ConfigMapVolume, - EmptyDirVolume, - Environment, - EnvironmentVariables, - EnvVarFrom, - EnvVarSource, - EphemeralVolume, - HeadNodeSpec, - HostPath, - HostPathVolume, - MountPropagationMode, - PVCVolume, - RayJobInfo, - RayJobRequest, - SecretVolume, - ServiceType, - Template, - Toleration, - TolerationEffect, - TolerationOperation, - WorkerNodeSpec, - autoscaling_decoder, - cluster_decoder, - cluster_spec_decoder, - env_var_from_decoder, - environment_variables_decoder, - head_node_spec_decoder, - template_decoder, - toleration_decoder, - volume_decoder, - worker_node_spec_decoder, -) - - -def test_toleration(): - - tol1 = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) - print(f"\ntoleration 1: {tol1.to_string()}") - t1_json = json.dumps(tol1.to_dict()) - print(f"toleration 1 JSON: {t1_json}") - - tol2 = Toleration( - key="blah2", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute, value="value" - ) - print(f"toleration 2: {tol2.to_string()}") - t2_json = json.dumps(tol2.to_dict()) - print(f"toleration 2 JSON: {t2_json}") - - assert tol1.to_string() == toleration_decoder(json.loads(t1_json)).to_string() - assert tol2.to_string() == toleration_decoder(json.loads(t2_json)).to_string() - - -def test_templates(): - - tol1 = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) - tol2 = Toleration( - key="blah2", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute, value="value" - ) - - temp1 = Template(name="template1", namespace="namespace", cpu=1, memory=4, tolerations=[tol1, tol2]) - print(f"\ntemplate 1: {temp1.to_string()}") - tm1_json = json.dumps(temp1.to_dict()) - print(f"template 1 JSON: {tm1_json}") - - temp2 = Template(name="template2", namespace="namespace", cpu=2, memory=8, gpu=1) - print(f"template 2: {temp2.to_string()}") - tm2_json = json.dumps(temp2.to_dict()) - print(f"template 2 JSON: {tm2_json}") - - assert temp1.to_string() == template_decoder(json.loads(tm1_json)).to_string() - assert temp2.to_string() == template_decoder(json.loads(tm2_json)).to_string() - - -def test_volumes(): - - # hostPath - vol = HostPathVolume( - name="hostPath", - mount_path="tmp/hostPath", - source="source", - host_path_type=HostPath.FILE, - mount_propagation=MountPropagationMode.NONE, - ) - print(f"\nhostPath volume: {vol.to_string()}") - vol_json = json.dumps(vol.to_dict()) - print(f"host path volume json: {vol_json}") - assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() - - vol = PVCVolume( - name="pvc", - mount_path="tmp/pvc", - source="claim", - read_only=True, - mount_propagation=MountPropagationMode.BIDIRECTIONAL, - ) - print(f"PVC volume: {vol.to_string()}") - vol_json = json.dumps(vol.to_dict()) - print(f"PVC volume json: {vol_json}") - assert 
volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() - - vol = EphemeralVolume( - name="ephemeral", mount_path="tmp/ephemeral", storage="5Gi", storage_class="blah", access_mode=AccessMode.RWX - ) - print(f"Ephemeral volume: {vol.to_string()}") - vol_json = json.dumps(vol.to_dict()) - print(f"Ephemeral volume json: {vol_json}") - assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() - - vol = EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir") - print(f"Empty dir volume: {vol.to_string()}") - vol_json = json.dumps(vol.to_dict()) - print(f"Empty dir volume json: {vol_json}") - assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() - - vol = ConfigMapVolume( - name="confmap", mount_path="tmp/confmap", source="my-map", items={"sample_code.py": "sample_code.py"} - ) - print(f"config map volume: {vol.to_string()}") - vol_json = json.dumps(vol.to_dict()) - print(f"config map volume json: {vol_json}") - assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() - - vol = SecretVolume(name="secret", mount_path="tmp/secret", source="my-secret") - print(f"secret volume: {vol.to_string()}") - vol_json = json.dumps(vol.to_dict()) - print(f"secret volume json: {vol_json}") - assert volume_decoder(json.loads(vol_json)).to_string() == vol.to_string() - - -def test_environment(): - - env_v = EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key") - print(f"\nEnv variable from: {env_v.to_string()}") - env_v_json = json.dumps(env_v.to_dict()) - print(f"Env variable from JSON: {env_v_json}") - assert env_var_from_decoder(json.loads(env_v_json)).to_string() == env_v.to_string() - - envs = EnvironmentVariables(key_value={"key": "val"}, from_ref={"key_ref": env_v}) - print(f"Env variables: {envs.to_string()}") - envs_json = json.dumps(envs.to_dict()) - print(f"Env variables JSON: {envs_json}") - assert environment_variables_decoder(json.loads(envs_json)).to_string() == envs.to_string() - - envs = EnvironmentVariables(from_ref={"key_ref": env_v}) - print(f"Env variables: {envs.to_string()}") - envs_json = json.dumps(envs.to_dict()) - print(f"Env variables JSON: {envs_json}") - assert environment_variables_decoder(json.loads(envs_json)).to_string() == envs.to_string() - - envs = EnvironmentVariables(key_value={"key": "val"}) - print(f"Env variables: {envs.to_string()}") - envs_json = json.dumps(envs.to_dict()) - print(f"Env variables JSON: {envs_json}") - assert environment_variables_decoder(json.loads(envs_json)).to_string() == envs.to_string() - - -def test_head_node_spec(): - - env_v = EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key") - env_s = EnvironmentVariables(key_value={"key": "val"}, from_ref={"key_ref": env_v}) - volumes = [ - PVCVolume( - name="pvc", - mount_path="tmp/pvc", - source="claim", - read_only=True, - mount_propagation=MountPropagationMode.BIDIRECTIONAL, - ), - EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), - ] - - head = HeadNodeSpec( - compute_template="template", - image="rayproject/ray:2.9.0-py310", - ray_start_params=DEFAULT_HEAD_START_PARAMS, - enable_ingress=True, - service_type=ServiceType.ClusterIP, - volumes=volumes, - environment=env_s, - image_pull_policy="Always", - ) - print(f"\nhead node: {head.to_string()}") - head_json = json.dumps(head.to_dict()) - print(f"head node JSON: {head_json}") - assert head_node_spec_decoder(json.loads(head_json)).to_string() == head.to_string() - - -def test_worker_node_spec(): - - env_v = EnvVarFrom(source=EnvVarSource.SECRET, 
name="my-secret", key="key") - env_s = EnvironmentVariables(key_value={"key": "val"}, from_ref={"key_ref": env_v}) - volumes = [ - PVCVolume( - name="pvc", - mount_path="tmp/pvc", - source="claim", - read_only=True, - mount_propagation=MountPropagationMode.BIDIRECTIONAL, - ), - EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), - ] - - worker = WorkerNodeSpec( - group_name="group", - compute_template="template", - image="rayproject/ray:2.9.0-py310", - replicas=2, - min_replicas=2, - max_replicas=2, - volumes=volumes, - ray_start_params=DEFAULT_WORKER_START_PARAMS, - environment=env_s, - labels={"key": "value"}, - image_pull_policy="IfNotPresent", - ) - print(f"\nworker node: {worker.to_string()}") - worker_json = json.dumps(worker.to_dict()) - print(f"worker node JSON: {worker_json}") - assert worker_node_spec_decoder(json.loads(worker_json)).to_string() == worker.to_string() - - -def test_autoscaler_options(): - options = AutoscalerOptions() - print(f"\nautoscaler options: {options.to_string()}") - options_json = json.dumps(options.to_dict()) - print(f"autoscaler options JSON: {options_json}") - assert autoscaling_decoder(json.loads(options_json)).to_string() == options.to_string() - - options = AutoscalerOptions(cpus="1.0", memory="64GB") - print(f"\nautoscaler options: {options.to_string()}") - options_json = json.dumps(options.to_dict()) - print(f"autoscaler options JSON: {options_json}") - assert autoscaling_decoder(json.loads(options_json)).to_string() == options.to_string() - - -def test_cluster_spec(): - env_s = EnvironmentVariables( - key_value={"key": "val"}, - from_ref={"key_ref": EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key")}, - ) - volumes = [ - PVCVolume( - name="pvc", - mount_path="tmp/pvc", - source="claim", - read_only=True, - mount_propagation=MountPropagationMode.BIDIRECTIONAL, - ), - EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), - ] - spec = ClusterSpec( - head_node=HeadNodeSpec( - compute_template="template", - image="rayproject/ray:2.9.0-py310", - ray_start_params=DEFAULT_HEAD_START_PARAMS, - volumes=volumes, - enable_ingress=True, - service_type=ServiceType.ClusterIP, - environment=env_s, - ), - worker_groups=[ - WorkerNodeSpec( - group_name="group", - compute_template="template", - replicas=2, - min_replicas=2, - max_replicas=2, - image="rayproject/ray:2.9.0-py310", - ray_start_params=DEFAULT_WORKER_START_PARAMS, - volumes=volumes, - environment=env_s, - labels={"key": "value"}, - ), - WorkerNodeSpec( - group_name="group1", - compute_template="template1", - replicas=2, - min_replicas=2, - max_replicas=2, - image="rayproject/ray:2.9.0-py310", - ray_start_params=DEFAULT_WORKER_START_PARAMS, - volumes=volumes, - environment=env_s, - labels={"key": "value"}, - ), - ], - autoscaling_options=AutoscalerOptions(), - ) - print(f"\ncluster spec: {spec.to_string()}") - spec_json = json.dumps(spec.to_dict()) - print(f"cluster spec JSON: {spec_json}") - assert cluster_spec_decoder(json.loads(spec_json)).to_string() == spec.to_string() - - -def test_cluster(): - - event = { - "id": "id", - "name": "name", - "created_at": "ts", - "first_timestamp": "ts", - "last_timestamp": "ts", - "reason": "reason", - "message": "message", - "type": "warning", - "count": "1", - } - print(f"\ncluster event: {ClusterEvent(event).to_string()}") - env_s = EnvironmentVariables( - key_value={"key": "val"}, - from_ref={"key_ref": EnvVarFrom(source=EnvVarSource.SECRET, name="my-secret", key="key")}, - ) - volumes = [ - PVCVolume( - name="pvc", - 
mount_path="tmp/pvc", - source="claim", - read_only=True, - mount_propagation=MountPropagationMode.BIDIRECTIONAL, - ), - EmptyDirVolume(name="emptyDir", mount_path="tmp/emptyDir"), - ] - spec = ClusterSpec( - head_node=HeadNodeSpec( - compute_template="template", - ray_start_params=DEFAULT_HEAD_START_PARAMS, - enable_ingress=True, - service_type=ServiceType.ClusterIP, - volumes=volumes, - environment=env_s, - annotations={"a_key": "a_val"}, - image="rayproject/ray:2.9.0-py310", - ), - worker_groups=[ - WorkerNodeSpec( - group_name="group", - compute_template="template", - replicas=2, - min_replicas=2, - max_replicas=2, - image="rayproject/ray:2.9.0-py310", - ray_start_params=DEFAULT_WORKER_START_PARAMS, - volumes=volumes, - environment=env_s, - labels={"key": "value"}, - ), - WorkerNodeSpec( - group_name="group1", - compute_template="template1", - replicas=2, - min_replicas=2, - max_replicas=2, - image="rayproject/ray:2.9.0-py310", - ray_start_params=DEFAULT_WORKER_START_PARAMS, - volumes=volumes, - environment=env_s, - labels={"key": "value"}, - ), - ], - ) - cluster = Cluster( - name="test", - namespace="default", - user="boris", - version="2.9.0", - cluster_spec=spec, - deployment_environment=Environment.DEV, - cluster_environment=env_s, - ) - print(f"cluster: {cluster.to_string()}") - cluster_json = json.dumps(cluster.to_dict()) - print(f"cluster JSON: {cluster_json}") - assert cluster_decoder(json.loads(cluster_json)).to_string() == cluster.to_string() - - cluster_dict = cluster.to_dict() - cluster_dict["created_at"] = "created" - cluster_dict["created_status"] = "status" - cluster_dict["events"] = [event] - print(f"cluster with output: {cluster_decoder(cluster_dict).to_string()}") - - -def test_submission(): - yaml = """ - pip: - - requests==2.26.0 - - pendulum==2.1.2 - env_vars: - counter_name: test_counter - """ - request = RayJobRequest(entrypoint="python /home/ray/samples/sample_code.py", runtime_env=yaml, num_cpu=0.5) - print(f"job request: {request.to_string()}") - request_json = json.dumps(request.to_dict()) - print(f"request JSON: {request_json}") - - info_json = """ - { - "entrypoint":"python /home/ray/samples/sample_code.py", - "jobId":"02000000", - "submissionId":"raysubmit_KWZLwme56esG3Wcr", - "status":"SUCCEEDED", - "message":"Job finished successfully.", - "startTime":"1699442662879", - "endTime":"1699442682405", - "runtimeEnv":{ - "env_vars":"map[counter_name:test_counter]", - "pip":"[requests==2.26.0 pendulum==2.1.2]" - } - } - """ - job_info = RayJobInfo(json.loads(info_json)) - print(job_info.to_string()) diff --git a/kfp/kfp_support_lib_v2/README.md b/kfp/kfp_support_lib_v2/README.md deleted file mode 100644 index 86f3f4360..000000000 --- a/kfp/kfp_support_lib_v2/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# KFP support library - -This provides support for implementing KFP pipelines automating transform's execution. -It comprises 2 main modules - -* [api server client](src/kfp_support/api_server_client/README.md) -* [workflow support](src/kfp_support/workflow_support/README.md) - -## Development - -### Requirements -1. python 3.10 or later -2. git command line tools -3. [pre-commit](https://pre-commit.com/) -4. twine (pip install twine) - * but on Mac you may have to include a dir in your PATH, such as `export PATH=$PATH:/Library/Frameworks/Python.framework/Versions/3.10/bin` - -### Git -Simple clone the repo and set up the pre-commit hooks. 
-```shell -git clone git@github.com:IBM/data-prep-kit.git -cd kfp/kfp_support_lib -pre-commit install -``` -If you don't have pre-commit, you can install from [here](https://pre-commit.com/) - -## Library Artifact Build and Publish - -The process of creating a release for `fm_data_processing_kfp` package involves the following steps: - -cd to the package directory. - -update the version in [requirements.env](../requirements.env) file. - -run `make build` and `make publish`. - -## Testing - -To run the package tests perform the following: - -To begin with, establish a Kind cluster and deploy all required components by executing the makfefile command in the main directory of this repository. As an alternative, you can manually execute the instructions provided in the [README.md](../../kind/README.md) file. - -```bash -make setup -``` - -The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. - -```bash -make build -``` - -lastly, execute the tests: - -```bash -make test -``` - -### Cleanup - -It is advisable to execute the following command prior to running `make test` once more. This will ensure that any -previous test runs resources are removed before starting new tests. - -```bash -kubectl delete workflows -n kubeflow --all -``` - - diff --git a/kfp/kfp_support_lib_v2/doc/kfp_support_library.md b/kfp/kfp_support_lib_v2/doc/kfp_support_library.md deleted file mode 100644 index 60494b9f9..000000000 --- a/kfp/kfp_support_lib_v2/doc/kfp_support_library.md +++ /dev/null @@ -1,10 +0,0 @@ -# KFP Support Library - -This library is aimed to simplify transform pipelines implementations and consists of 3 main parts: - -* [API Server Client](../src/kfp_support/api_server_client/README.md) -* [workflow support](../src/kfp_support/workflow_support/README.md) -* workflow support_v2 - -See also how this library is used for [kfp components](../../kfp_ray_components/README.md) implementation -and implementation of the actual [workflow](../../doc/simple_transform_pipeline.md) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md deleted file mode 100644 index 423f743a1..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# KubeRay API server APIs - -This is a copy of [Kuberay API server python APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) -Because these APIs are not exposed by any PyPi, we added them to the project \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py deleted file mode 100644 index 60cbbc2f2..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kfp_support.api_server_client.kuberay_apis import KubeRayAPIs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py deleted file mode 100644 index 270815e77..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/kuberay_apis.py +++ /dev/null @@ -1,636 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import time - -import requests -from data_processing.utils import get_logger -from kfp_support.api_server_client.params import ( - Cluster, - RayJobInfo, - RayJobRequest, - Template, - cluster_decoder, - clusters_decoder, - template_decoder, - templates_decoder, -) - - -logger = get_logger(__name__) - - -_headers = {"Content-Type": "application/json", "accept": "application/json"} - -CONNECT_TIMEOUT = 50 -READ_TIMEOUT = 50 -TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT) - - -class KubeRayAPIs: - """ - This class implements KubeRay APIs based on the API server. - To create a class, the following parameters are required: - base - the URL of the API server (default is set to the standalone API server) - wait interval - the amount of sec to wait between checking for cluster ready - """ - - def __init__( - self, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - token: str = None, - http_retries: int = 5, - wait_interval: int = 2, - ): - """ - Initializer - :param server_url: API server url - default assuming running it inside the cluster - :param token: token, only used for API server with security enabled - :param wait_interval: wait interval - :param http_retries: http retries - """ - self.server_url = server_url - if token is not None: - _headers["Authorization"] = token - self.wait_interval = wait_interval - self.api_base = "/apis/v1/" - self.http_retries = http_retries - - def list_compute_templates(self) -> tuple[int, str, list[Template]]: - """ - List compute templates across all namespaces of the k8 cluster - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + "compute_templates" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, templates_decoder(response.json()) - else: - logger.warning(f"Failed to list compute templates, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list compute templates, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_compute_templates_namespace(self, ns: str) -> tuple[int, str, list[Template]]: - """ - List compute templates across for a given namespaces of the k8 cluster - :param ns: namespace to query - :return: return tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return 
response.status_code, None, templates_decoder(response.json()) - else: - logger.warning( - f"Failed to list compute templates for namespace {ns}, status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list compute templates for namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def get_compute_template(self, ns: str, name: str) -> tuple[int, str, Template]: - """ - get a compute template - :param ns: namespace - :param name: template name - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - compute templates - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, template_decoder(response.json()) - else: - logger.warning( - f"Failed to get compute template {name} for namespace {ns}, status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get compute template {name} for namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def create_compute_template(self, template: Template) -> tuple[int, str]: - """ - Create a compute template - :param template - definition of a template - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{template.namespace}/compute_templates" - for i in range(self.http_retries): - try: - response = requests.post(url, json=template.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning(f"Failed to create compute template, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to create compute template, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def delete_compute_template(self, ns: str, name: str) -> tuple[int, str]: - """ - delete a compute template - :param ns: namespace - :param name: template name - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/compute_templates/{name}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - elif response.status_code == 404: - # not found - no need to retry - return response.status_code, response.json()["message"] - else: - logger.warning(f"Failed to delete compute template, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to delete compute template, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) 
- return status, message - - def list_clusters(self) -> tuple[int, str, list[Cluster]]: - """ - List clusters across all namespaces of the k8 cluster - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of clusters - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + "clusters" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, clusters_decoder(response.json()) - else: - logger.warning(f"Failed to list cluster, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list cluster, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_clusters_namespace(self, ns: str) -> tuple[int, str, list[Cluster]]: - """ - List clusters across for a given namespaces of the k8 cluster - :param ns: namespace to query - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of clusters - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, clusters_decoder(response.json()) - else: - logger.warning(f"Failed to list clusters in namespace {ns}, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list clusters in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def get_cluster(self, ns: str, name: str) -> tuple[int, str, Cluster]: - """ - get cluster - :param ns: namespace - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - clusters definition - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, cluster_decoder(response.json()) - else: - logger.warning(f"Failed to get cluster {name} in namespace {ns}, status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def create_cluster(self, cluster: Cluster) -> tuple[int, str]: - """ - create cluster - :param cluster: cluster definition - :return: tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{cluster.namespace}/clusters" - for i in range(self.http_retries): - try: - response = requests.post(url, json=cluster.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 
2: - return response.status_code, None - else: - logger.warning(f"Failed to create cluster , status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to create cluster , exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def get_cluster_status(self, ns: str, name: str) -> tuple[int, str, str]: - """ - get cluster status - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - cluster status - """ - # Execute HTTP request - status, error, cluster = self.get_cluster(ns=ns, name=name) - # Check execution status - if status // 100 != 2: - return status, error, None - cluster_status = "creating" - if cluster.cluster_status is not None: - cluster_status = cluster.cluster_status - return status, None, cluster_status - - def wait_cluster_ready(self, ns: str, name: str, wait: int = -1) -> tuple[int, str]: - """ - wait for cluster to be ready - :param ns: namespace of the cluster - :param name: name of the cluster - :param wait: wait time (-1 waits forever) - :returns: A tuple containing - http return code - message - only returned if http return code is not equal to 200 - cluster status - """ - current_wait = 0 - while True: - status, error, c_status = self.get_cluster_status(ns=ns, name=name) - # Check execution status - if status // 100 != 2: - return status, error - if c_status == "ready": - return status, None - if current_wait > wait > 0: - return 408, f"Timed out waiting for cluster ready in {current_wait} sec" - time.sleep(self.wait_interval) - current_wait += self.wait_interval - - def get_cluster_endpoints(self, ns: str, name: str, wait: int = -1) -> tuple[int, str, str]: - """ - get cluster endpoint - :param ns: namespace of the cluster - :param name: name of the cluster - :param wait: wait time (-1 waits forever) for cluster to be ready - :returns: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - endpoint (service for dashboard endpoint) - """ - # Ensure that the cluster is ready - status, error = self.wait_cluster_ready(ns=ns, name=name, wait=wait) - if status // 100 != 2: - return status, error, None - # Get cluster - status, error, cluster = self.get_cluster(ns=ns, name=name) - if status // 100 != 2: - return status, error, None - return status, None, f"{name}-head-svc.{ns}.svc.cluster.local:{cluster.service_endpoint['dashboard']}" - - def delete_cluster(self, ns: str, name: str) -> tuple[int, str]: - """ - delete cluster - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/clusters/{name}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers) - if response.status_code // 100 == 2: - return response.status_code, None - elif response.status_code == 404: - # not found - no need to retry - return response.status_code, response.json()["message"] - else: - logger.warning(f"Failed to delete cluster , status : {response.status_code}") - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to delete 
cluster , exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def submit_job(self, ns: str, name: str, job_request: RayJobRequest) -> tuple[int, str, str]: - """ - submit Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param job_request: job submission - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - submission id - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" - for i in range(self.http_retries): - try: - response = requests.post(url, json=job_request.to_dict(), headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, response.json()["submissionId"] - else: - logger.warning( - f"Failed to submit job to the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to submit job to the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(5) - return status, message, None - - def get_job_info(self, ns: str, name: str, sid: str) -> tuple[int, str, RayJobInfo]: - """ - get Ray job details - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - RayJobInfo object - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, RayJobInfo(response.json()) - else: - logger.warning( - f"Failed to get job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to get job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def list_job_info(self, ns: str, name: str) -> tuple[int, str, list[RayJobInfo]]: - """ - list Ray job details - :param ns: namespace of the cluster - :param name: name of the cluster - :return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - list of RayJobInfo object - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - job_info_array = response.json().get("submissions", None) - if job_info_array is None: - return response.status_code, None, [] - else: - return response.status_code, None, [RayJobInfo(i) for i in job_info_array] - else: - logger.warning( - f"Failed to list jobs from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to list jobs from the cluster {name} in 
namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(5) - return status, message, [] - - def get_job_log(self, ns: str, name: str, sid: str) -> tuple[int, str, str]: - """ - get Ray job log - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - log - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/log/{sid}" - for i in range(self.http_retries): - try: - response = requests.get(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None, response.json().get("log", "") - else: - logger.warning( - f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning( - f"Failed to get log for jobs {sid} from the cluster {name} in namespace {ns}, exception : {e}" - ) - status = 500 - message = str(e) - time.sleep(1) - return status, message, None - - def stop_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: - """ - stop Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.post(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning( - f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message - - def delete_ray_job(self, ns: str, name: str, sid: str) -> tuple[int, str]: - """ - delete Ray job - :param ns: namespace of the cluster - :param name: name of the cluster - :param sid: job submission id - return: a tuple containing - http return code - message - only returned if http return code is not equal to 200 - """ - status = 200 - message = None - # Execute HTTP request - url = self.server_url + self.api_base + f"namespaces/{ns}/jobsubmissions/{name}/{sid}" - for i in range(self.http_retries): - try: - response = requests.delete(url, headers=_headers, timeout=TIMEOUT) - if response.status_code // 100 == 2: - return response.status_code, None - else: - logger.warning( - f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, " - f"status : {response.status_code}" - ) - status = response.status_code - message = response.json()["message"] - except Exception as e: - logger.warning(f"Failed to stop job {sid} from the cluster {name} in namespace {ns}, exception : {e}") - status = 500 - message = str(e) - time.sleep(1) - return status, message diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py 
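The KubeRayAPIs client removed above is long, so a compact usage sketch may help: it strings together the constructor and a few of the methods whose signatures appear in the deleted kuberay_apis.py. The namespace, cluster name, wait time and entrypoint are placeholder values, and the RayJobRequest is reduced to the entrypoint used in the removed test_submission test:

```python
# Sketch of driving the removed KubeRayAPIs client. Every method call below
# appears in the deleted kuberay_apis.py; the concrete values are placeholders.
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import RayJobRequest

apis = KubeRayAPIs(
    server_url="http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888"
)

# most methods return (http_status, error_message[, payload]); 2xx means success
status, error, templates = apis.list_compute_templates()
assert status // 100 == 2, error

# wait for an existing cluster to become ready, then submit a job and read its log
status, error = apis.wait_cluster_ready(ns="default", name="test-cluster", wait=300)
status, error, sid = apis.submit_job(
    ns="default",
    name="test-cluster",
    job_request=RayJobRequest(entrypoint="python /home/ray/samples/sample_code.py"),
)
status, error, log = apis.get_job_log(ns="default", name="test-cluster", sid=sid)
print(log)
```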
b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py deleted file mode 100644 index e5a7d70fa..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -from kfp_support.api_server_client.params.templates import ( - TolerationOperation, - TolerationEffect, - Toleration, - Template, - toleration_decoder, - template_decoder, - templates_decoder, -) -from kfp_support.api_server_client.params.volumes import ( - HostPath, - MountPropagationMode, - AccessMode, - BaseVolume, - HostPathVolume, - PVCVolume, - EphemeralVolume, - EmptyDirVolume, - ConfigMapVolume, - SecretVolume, - volume_decoder, -) -from kfp_support.api_server_client.params.environmentvariables import ( - EnvVarSource, - EnvVarFrom, - EnvironmentVariables, - env_var_from_decoder, - environment_variables_decoder, -) -from kfp_support.api_server_client.params.headnode import ( - ServiceType, - HeadNodeSpec, - DEFAULT_HEAD_START_PARAMS, - head_node_spec_decoder, -) -from kfp_support.api_server_client.params.workernode import ( - WorkerNodeSpec, - DEFAULT_WORKER_START_PARAMS, - worker_node_spec_decoder, -) -from kfp_support.api_server_client.params.cluster import ( - Environment, - AutoscalerOptions, - ClusterSpec, - ClusterEvent, - Cluster, - UpscalingMode, - autoscaling_decoder, - cluster_spec_decoder, - cluster_decoder, - clusters_decoder, -) -from kfp_support.api_server_client.params.jobsubmission import RayJobRequest, RayJobInfo diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py deleted file mode 100644 index 922a14bef..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/cluster.py +++ /dev/null @@ -1,475 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - HeadNodeSpec, - WorkerNodeSpec, - environment_variables_decoder, - head_node_spec_decoder, - volume_decoder, - worker_node_spec_decoder, -) - - -class Environment(enum.Enum): - """ - Environment definitions - """ - - DEV = 0 # development - TESTING = 1 # testing - STAGING = 2 # staging - PRODUCTION = 3 # production - - -class UpscalingMode(enum.Enum): - """ - Enumeration of autoscaling mode - """ - - Conservative = ( - "Conservative" # Rate-limited; the number of pending worker pods is at most the size of the Ray cluster - ) - Default = "Default" # no rate limitations - Aggressive = "Aggressive" # same as default - - -class AutoscalerOptions: - """ - AutoscalerOptions is used to define Ray cluster autoscaling. - It provides APIs to create, stringify and convert to dict. 
- - Methods: - - Create autoscaling options specification: gets the following parameters: - idle_timeout - optional, number of seconds to wait before scaling down a worker pod which is not using Ray - resources. Default 60sec (one minute). - upscaling_mode - required autoscaler upscaling mode - image - optional, allows to override the autoscaler's container image - image_pull_policy - optional, allows to override the autoscaler's container image pull policy - cpus - optional, CPUs requirements for autoscaler - default "500m" - memory - optional, memory requirements for autoscaler - default "512Mi" - environment - optional, environment variables for autoscaler container - volumes - optional, a list of volumes to attach to autoscaler container. - This is needed for enabling TLS for the autoscaler container. - """ - - def __init__( - self, - upscaling_mode: UpscalingMode = UpscalingMode.Default, - idle_tmout: int = None, - image: str = None, - image_pull_policy: str = None, - cpus: str = None, - memory: str = None, - environment: EnvironmentVariables = None, - volumes: list[BaseVolume] = None, - ): - """ - Initialization - :param upscaling_mode: upscale mode - :param idle_tmout: idle timeout - :param image: image - :param image_pull_policy: image pull policy - :param cpus: cpu requirement for autoscaling - :param memory: memory requirement for autoscaling - :param environment: autoscaler environment - :param volumes: volumes for autoscaler - """ - self.upscaling_mode = upscaling_mode - self.idle_tmout = idle_tmout - self.image = image - self.image_pull_policy = image_pull_policy - self.cpus = cpus - self.memory = memory - self.environment = environment - self.volumes = volumes - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of the head node - """ - val = f"upscaling_mode = {self.upscaling_mode}" - if self.idle_tmout is not None: - val += f", idle_timeout = {self.idle_tmout}" - if self.image is not None: - val += f", image = {self.image}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.cpus is not None: - val += f", cpus = {self.cpus}" - if self.memory is not None: - val += f", memory = {self.memory}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of the head node - """ - dct = {"upscalingMode": self.upscaling_mode.value} - if self.idle_tmout is not None: - dct["idleTimeoutSeconds"] = self.idle_tmout - if self.image is not None: - dct["image"] = self.image - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.cpus is not None: - dct["cpu"] = self.cpus - if self.memory is not None: - dct["memory"] = self.memory - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["envs"] = self.environment.to_dict() - return dct - - -class ClusterSpec: - """ - ClusterSpec is used to define Ray cluster. - It provides APIs to create, stringify, convert to dict and json. 
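Because `to_dict()` renames several of the constructor arguments (for example `cpus` becomes the `cpu` key), a short round-trip sketch of the AutoscalerOptions class above may help; it relies only on the fields shown here and on the `autoscaling_decoder` defined further down in this file:

```python
from kfp_support.api_server_client.params import (
    AutoscalerOptions,
    UpscalingMode,
    autoscaling_decoder,
)

# cpus -> "cpu", memory -> "memory", upscaling_mode -> "upscalingMode" (enum value)
options = AutoscalerOptions(upscaling_mode=UpscalingMode.Default, cpus="500m", memory="512Mi")
print(options.to_dict())
# {'upscalingMode': 'Default', 'cpu': '500m', 'memory': '512Mi'}

# the decoder reverses the mapping, so the round trip preserves the options
assert autoscaling_decoder(options.to_dict()).to_string() == options.to_string()
```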
- - Methods: - - Create cluster spec from: gets the following parameters: - head_group_spec - required, specification of the head node - worker_group_spec - optional, list of worker group specs - autoscaler_options - optional, autoscaling options - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__( - self, - head_node: HeadNodeSpec, - worker_groups: list[WorkerNodeSpec] = None, - autoscaling_options: AutoscalerOptions = None, - ): - """ - Initialization - :param head_node - head node definition - :param worker_groups - worker group definition - :param autoscaling_options - autoscaler options - """ - self.head_node = head_node - self.worker_groups = worker_groups - self.autoscaling_options = autoscaling_options - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of cluster spec - """ - val = f"head_group_spec: {self.head_node.to_string()}" - if self.worker_groups is not None: - val += "\nworker groups: " - for w in self.worker_groups: - val += f"\nworker_group_spec = {w.to_string()}]" - if self.autoscaling_options is not None: - val += f"\nautoscaling options = {self.autoscaling_options.to_string()}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: Dictionary representation of cluster spec - """ - dst = {"headGroupSpec": self.head_node.to_dict()} - if self.worker_groups is not None: - dst["workerGroupSpec"] = [w.to_dict() for w in self.worker_groups] - if self.autoscaling_options is not None: - dst["enableInTreeAutoscaling"] = True - dst["autoscalerOptions"] = self.autoscaling_options.to_dict() - return dst - - -class ClusterEvent: - """ - Cluster event is used to define events emitted during cluster creation. - It provides APIs to create and stringify. Its output only data, so we do not need to implement to_dict - - Methods: - - Create event: gets the dictionary with the following parameters: - id - unique Event Id - name - human readable event name - created_at - event creation time - first_timestamp - first time the event occur - last_timestamp - last time the event occur - reason - reason for the transition into the object's current status - message - human-readable description of the status of this operation - type - type of this event (Normal, Warning), new types could be added in the future - count - number of times this event has occurred - """ - - def __init__(self, dst: dict[str, Any]): - """ - Initialization from dictionary - :param dst: dictionary representation of cluster event - """ - self.id = dst.get("id", "") - self.name = dst.get("name", "") - self.created_at = dst.get("created_at", "") - self.first_timestamp = dst.get("first_timestamp", "") - self.last_timestamp = dst.get("last_timestamp", "") - self.reason = dst.get("reason", "") - self.message = dst.get("message", "") - self.type = dst.get("type", "") - self.count = dst.get("count", "0") - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of cluster event - """ - return ( - f"id = {self.id}, name = {self.name}, created_at = {self.created_at}, " - f"first_timestamp = {self.first_timestamp}, last_timestamp = {self.last_timestamp}," - f"reason = {self.reason}, message = {self.message}, type = {self.type}, count = {self.count}" - ) - - -class Cluster: - """ - Cluster is used to define Ray cluster. - It provides APIs to create, stringify, convert to dict and json. 
- - Methods: - - Create env variable from: gets the following parameters: - name - required, unique (per namespace) cluster name - namespace - required, cluster's namespace (should exist) - user - required, user who owns the cluster - version - required, Ray cluster version - typically Ray version - deployment_environment - optional (see Environment) - cluster_spec - required, ray cluster configuration - annotations - optional, annotations, for example, "kubernetes.io/ingress.class" to define Ingress class - cluster_environment - optional, cluster environment variables - created_at - output, cluster creation ts - deleted_at - output, cluster deletion ts - cluster_status - output, cluster status - events - output, cluster events - service_endpoint - output, cluster service endpoints - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__( - self, - name: str, - namespace: str, - user: str, - version: str, - cluster_spec: ClusterSpec, - deployment_environment: Environment = None, - annotations: dict[str, str] = None, - cluster_environment: EnvironmentVariables = None, - created_at: str = None, - deleted_at: str = None, - cluster_status: str = None, - events: list[ClusterEvent] = None, - service_endpoint: dict[str, str] = None, - ): - """ - Initialization - :param name: cluster name - :param namespace: cluster namespace - :param user: user name - :param version: version - :param cluster_spec: cluster spec - :param deployment_environment: cluster deployment environment - :param annotations: cluster annotations - :param cluster_environment: cluster environment - :param created_at: created at - :param deleted_at: deleted at - :param cluster_status: status - :param events: cluster events - :param service_endpoint: service endpoint - """ - self.name = name - self.namespace = namespace - self.user = user - self.version = version - self.cluster_spec = cluster_spec - self.environment = deployment_environment - self.annotations = annotations - self.envs = cluster_environment - self.created_at = created_at - self.deleted_at = deleted_at - self.cluster_status = cluster_status - self.events = events - self.service_endpoint = service_endpoint - - def to_string(self) -> str: - """ - convert to string representation - :return: string representation of cluster - """ - val = ( - f"name: {self.name}, namespace = {self.namespace}, user = {self.user}, version = {self.version} " - f"cluster_spec = {self.cluster_spec.to_string()}" - ) - if self.environment is not None: - val += f"deployment environment = {self.environment.name}" - if self.annotations is not None: - val += f" ,annotations = {str(self.annotations)}" - if self.envs is not None: - val = val + f",cluster environment = {self.envs.to_string()}" - val += "\ncluster output\n" - if self.created_at is not None: - val += f" ,created_at = {self.created_at}" - if self.deleted_at is not None: - val += f" ,deleted_at = {self.deleted_at}" - if self.cluster_status is not None: - val += f" ,cluster status = {self.cluster_status}" - if self.events is not None: - val = val + ",\n cluster events = [" - first = True - for e in self.events: - if first: - first = False - else: - val += ", " - val = val + "{" + e.to_string() + "}" - val = val + "]" - if self.service_endpoint is not None: - val += f" ,service endpoints = {str(self.service_endpoint)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - convert to dictionary - :return: dictionary representation of cluster - """ - # only 
convert input variables - dst = { - "name": self.name, - "namespace": self.namespace, - "user": self.user, - "version": self.version, - "clusterSpec": self.cluster_spec.to_dict(), - } - if self.environment is not None: - dst["environment"] = self.environment.value - if self.annotations is not None: - dst["annotations"] = self.annotations - if self.envs is not None: - dst["envs"] = self.envs.to_dict() - return dst - - -""" - Creates new cluster from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def autoscaling_decoder(dct: dict[str, Any]) -> AutoscalerOptions: - """ - Create autoscaling options from its dictionary representation - :param dct: dictionary representation of cluster spec - :return: autoscaling options - """ - upscaling_mode = UpscalingMode.Default - if "upscalingMode" in dct: - upscaling_mode = UpscalingMode(dct.get("upscalingMode")) - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("envs")) > 0: - environments = environment_variables_decoder(dct.get("envs")) - return AutoscalerOptions( - upscaling_mode=upscaling_mode, - idle_tmout=dct.get("idleTimeoutSeconds", None), - image=dct.get("image", None), - image_pull_policy=dct.get("imagePullPolicy", None), - cpus=dct.get("cpu", None), - memory=dct.get("memory", None), - environment=environments, - volumes=volumes, - ) - - -def cluster_spec_decoder(dct: dict[str, Any]) -> ClusterSpec: - """ - Create cluster spec from its dictionary representation - :param dct: dictionary representation of cluster spec - :return: cluster spec - """ - workers = None - autoscaling_options = None - if "workerGroupSpec" in dct: - workers = [worker_node_spec_decoder(w) for w in dct["workerGroupSpec"]] - if "enableInTreeAutoscaling" in dct and dct.get("enableInTreeAutoscaling"): - autoscaling_options = autoscaling_decoder(dct.get("autoscalerOptions", {})) - return ClusterSpec( - head_node=head_node_spec_decoder(dct.get("headGroupSpec")), - worker_groups=workers, - autoscaling_options=autoscaling_options, - ) - - -def cluster_decoder(dct: dict[str, Any]) -> Cluster: - """ - Create cluster from its dictionary representation - :param dct: dictionary representation of cluster - :return: cluster - """ - environment = None - if "environment" in dct: - environment = Environment(int(dct.get("environment", "0"))) - events = None - if "events" in dct: - events = [ClusterEvent(c) for c in dct["events"]] - envs = None - if "envs" in dct: - envs = environment_variables_decoder(dct.get("envs")) - return Cluster( - name=dct.get("name", ""), - namespace=dct.get("namespace", ""), - user=dct.get("user", ""), - version=dct.get("version", ""), - cluster_spec=cluster_spec_decoder(dct.get("clusterSpec")), - deployment_environment=environment, - annotations=dct.get("annotations"), - cluster_environment=envs, - created_at=dct.get("createdAt"), - deleted_at=dct.get("deletedAt"), - cluster_status=dct.get("clusterState"), - events=events, - service_endpoint=dct.get("serviceEndpoint"), - ) - - -def clusters_decoder(dct: dict[str, any]) -> list[Cluster]: - """ - Create list of clusters from its dictionary representation - :param dct: dictionary representation of a list of clusters - :return: list of clusters - """ - return [cluster_decoder(cluster) for cluster in dct["clusters"]] diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py 
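The decoders above exist because these classes have no alternate constructors, so unmarshalling goes through module-level functions. A minimal round-trip sketch in the style of the removed api_params_test.py; the head node is trimmed to a bare minimum and assumes the remaining HeadNodeSpec parameters are optional:

```python
import json

from kfp_support.api_server_client.params import (
    DEFAULT_HEAD_START_PARAMS,
    Cluster,
    ClusterSpec,
    HeadNodeSpec,
    cluster_decoder,
)

# build a minimal cluster, marshal it to JSON, then rebuild it with the decoder
spec = ClusterSpec(
    head_node=HeadNodeSpec(
        compute_template="template",
        image="rayproject/ray:2.9.0-py310",
        ray_start_params=DEFAULT_HEAD_START_PARAMS,
    )
)
cluster = Cluster(name="test", namespace="default", user="boris", version="2.9.0", cluster_spec=spec)
restored = cluster_decoder(json.loads(json.dumps(cluster.to_dict())))
assert restored.to_string() == cluster.to_string()
```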
b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py deleted file mode 100644 index d1056f6f6..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/environmentvariables.py +++ /dev/null @@ -1,158 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class EnvVarSource(enum.Enum): - """ - Enumeration of environment sources - """ - - CONFIGMAP = 0 # config map - SECRET = 1 # secret - RESOURCE_FIELD = 2 # resource field - FIELD = 3 # field - - -class EnvVarFrom: - """ - EnvVarFrom is used to define an environment variable from one of the sources (EnvarSource). - It provides APIs to create, stringify, convert to dict and json. - - Methods: - - Create env variable from: gets the following parameters: - Source required - source of environment variable - name required name for config map or secret, container name for resource, path for field - key required Key for config map or secret, resource name for resource - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, source: EnvVarSource, name: str, key: str): - """ - Initialize - :param source - source - :param name source name - :param key source key - """ - self.source = source - self.name = name - self.key = key - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of environment from - """ - return f"source = {self.source.name}, name = {self.name}, key = {self.key}" - - def to_dict(self) -> dict[str, Any]: - """ - convert to dictionary - :return: dictionary representation of environment from - """ - return {"source": self.source.value, "name": self.name, "key": self.key} - - -class EnvironmentVariables: - """ - EnvironmentVariables is used to define environment variables. - It provides APIs to create, stringify, convert to dict and json. 
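# Illustrative usage sketch (not from the patch itself), assuming the repo's
# kfp_support package is installed and that EnvVarFrom/EnvVarSource are
# importable from the params package, as EnvironmentVariables is elsewhere in
# this patch. It shows how an environment spec is built and round-tripped
# through the decoder defined in this module.
from kfp_support.api_server_client.params import (
    EnvVarFrom,
    EnvVarSource,
    EnvironmentVariables,
    environment_variables_decoder,
)

env = EnvironmentVariables(
    key_value={"LOG_LEVEL": "info"},
    from_ref={"S3_SECRET": EnvVarFrom(source=EnvVarSource.SECRET, name="s3-secret", key="secret_key")},
)
serialized = env.to_dict()      # {"values": {...}, "valuesFrom": {"S3_SECRET": {...}}}
restored = environment_variables_decoder(serialized)   # rebuilds the object from its dict form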
- - Methods: - - Create env variable from: gets the following parameters: - key_value - optional, dictionary of key/value environment variables - from_ref - optional, dictionary of reference environment variables - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, key_value: dict[str, str] = None, from_ref: dict[str, EnvVarFrom] = None): - """ - Initialization - :param key_value: dictionary of key/value pairs for environment variables - :param from_ref: dictionary of key/value pairs for environment from variables - """ - self.key_val = key_value - self.from_ref = from_ref - - def to_string(self) -> str: - """ - convert to string - :return: string representation of environment variables - """ - val = "" - if self.key_val is not None: - val = f"values = {str(self.key_val)}" - if self.from_ref is not None: - if val != "": - val += " , " - val += "valuesFrom = {" - first = True - for k, v in self.from_ref.items(): - if not first: - val += ", " - else: - first = False - val += f"{k} = [{v.to_string()}]" - val += "}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of environment variables - """ - dst = {} - if self.key_val is not None: - dst["values"] = self.key_val - if self.from_ref is not None: - fr = {} - for k, v in self.from_ref.items(): - fr[k] = v.to_dict() - dst["valuesFrom"] = fr - return dst - - -""" - Creates new environment variable from from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def env_var_from_decoder(dct: dict[str, Any]) -> EnvVarFrom: - """ - Create environment from from dictionary - :param dct: dictionary representations of environment from - :return: environment from - """ - return EnvVarFrom(name=dct.get("name", ""), source=EnvVarSource(int(dct.get("source", 0))), key=dct.get("key", "")) - - -def environment_variables_decoder(dct: dict[str, Any]) -> EnvironmentVariables: - """ - Create environment variables from from dictionary - :param dct: dictionary representations of environment variables - :return: environment variables - """ - keyvalues = None - fr = None - if "values" in dct: - keyvalues = dct.get("values") - if "valuesFrom" in dct: - from_ref = dct.get("valuesFrom") - fr = {} - for k, v in from_ref.items(): - fr[k] = env_var_from_decoder(v) - return EnvironmentVariables(key_value=keyvalues, from_ref=fr) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py deleted file mode 100644 index 7a9d4120f..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/headnode.py +++ /dev/null @@ -1,202 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
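# Illustrative usage sketch (not from the patch itself), assuming the repo's
# kfp_support package is installed: constructing the head node specification
# defined in this module and serializing it for the API server, with values
# mirroring the cluster test further down in this patch.
from kfp_support.api_server_client.params import HeadNodeSpec, ServiceType

head = HeadNodeSpec(
    compute_template="default-template",
    image="rayproject/ray:2.9.3-py310",
    ray_start_params={"metrics-export-port": "8080", "num-cpus": "0"},
    service_type=ServiceType.ClusterIP,
    image_pull_policy="IfNotPresent",
)
payload = head.to_dict()   # {"computeTemplate": ..., "rayStartParams": ..., "serviceType": "ClusterIP", ...}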
-################################################################################ - -import enum -from typing import Any - -from kfp_support.api_server_client.params import ( - BaseVolume, - EnvironmentVariables, - environment_variables_decoder, - volume_decoder, -) - - -DEFAULT_HEAD_START_PARAMS = {"dashboard-host": "0.0.0.0", "metrics-export-port": "8080", "num-cpus": "0"} - - -class ServiceType(enum.Enum): - """ - Enumeration of head node service types - """ - - ClusterIP = "ClusterIP" # cluster IP - NodePort = "NodePort" # node port - LoadBalancer = "LoadBalancer" # load balancer - - -class HeadNodeSpec: - """ - HeadNodeSpec is used to define Ray cluster head node configuration. - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create head node specification: gets the following parameters: - compute_template - required, the computeTemplate of head node group - ray_start_params - required, Ray start parameters - image - optional, image used for head node - service_type - optional (ServiceType), service type foe headnode - enable_ingress - optional, allow to enable ingress for dashboard - volumes - optional, a list of volumes to attach to head node - service_account - optional, a service account (has to exist) to run head node - image_pull_secret - optional, secret to pull head node image from registry - environment - optional, environment variables for head pod - annotations - optional, annotations for head node - labels - optional, labels for head node - image_pull_policy - optional, head node pull image policy. Default IfNotPresent - """ - - def __init__( - self, - compute_template: str, - image: str, - ray_start_params: dict[str, str] = DEFAULT_HEAD_START_PARAMS, - service_type: ServiceType = ServiceType.ClusterIP, - enable_ingress: bool = False, - volumes: list[BaseVolume] = None, - service_account: str = None, - image_pull_secret: str = None, - environment: EnvironmentVariables = None, - annotations: dict[str, str] = None, - labels: dict[str, str] = None, - image_pull_policy: str = None, - ): - """ - Initialization - :param compute_template: compute template - :param ray_start_params: ray start parameters - :param image: node image - :param service_type: service type - :param enable_ingress: enable ingress flag - :param volumes: volumes for head node - :param service_account: service account - :param image_pull_secret: image pull secret - :param environment: head node environment - :param annotations: head node annotation - :param labels: labels - :param image_pull_policy: image pull policy - """ - - self.compute_template = compute_template - self.ray_start_params = ray_start_params - self.ray_start_params.update(DEFAULT_HEAD_START_PARAMS) - self.image = image - self.service_type = service_type - self.enable_ingress = enable_ingress - self.volumes = volumes - self.service_account = service_account - self.image_pull_secret = image_pull_secret - self.environment = environment - self.annotations = annotations - self.labels = labels - self.image_pull_policy = image_pull_policy - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of the head node - """ - val = f"compute template = {self.compute_template}, ray start params = {str(self.ray_start_params)}" - if self.image is not None: - val += f", image = {self.image}" - if self.service_type is not None: - val += f", service_type = {self.service_type.name}" - if self.enable_ingress: - val += ", enable_ingress = True" - if self.service_account is not None: - val += f", 
service_account = {self.service_account}" - if self.image_pull_secret is not None: - val += f", image_pull_secret = {self.image_pull_secret}" - if self.image_pull_policy is not None: - val += f", image_pull_policy = {self.image_pull_policy}" - if self.volumes is not None: - val = val + ",\n volumes = [" - first = True - for v in self.volumes: - if first: - first = False - else: - val += ", " - val = val + "{" + v.to_string() + "}" - val = val + "]" - if self.environment is not None: - val = val + f",\n environment = {self.environment.to_string()}" - if self.annotations is not None: - val = val + f",\n annotations = {str(self.annotations)}" - if self.labels is not None: - val = val + f",\n labels = {str(self.labels)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of the head node - """ - dct = {"computeTemplate": self.compute_template, "rayStartParams": self.ray_start_params} - if self.image is not None: - dct["image"] = self.image - if self.service_type is not None: - dct["serviceType"] = self.service_type.value - if self.enable_ingress: - dct["enableIngress"] = True - if self.service_account is not None: - dct["service_account"] = self.service_account - if self.image_pull_secret is not None: - dct["image_pull_secret"] = self.image_pull_secret - if self.image_pull_policy is not None: - dct["imagePullPolicy"] = self.image_pull_policy - if self.volumes is not None: - dct["volumes"] = [v.to_dict() for v in self.volumes] - if self.environment is not None: - dct["environment"] = self.environment.to_dict() - if self.annotations is not None: - dct["annotations"] = self.annotations - if self.labels is not None: - dct["labels"] = self.labels - return dct - - -""" - Creates new head node from dictionary, used for unmarshalling json. Python does not - support multiple constructors, so do it this way -""" - - -def head_node_spec_decoder(dct: dict[str, Any]) -> HeadNodeSpec: - """ - Create head node spec from dictionary - :param dct: dictionary representation of head node spec - :return: Head node spec - """ - service_type = None - if "serviceType" in dct: - service_type = ServiceType(dct.get("serviceType", "ClusterIP")) - volumes = None - if "volumes" in dct: - volumes = [volume_decoder(v) for v in dct["volumes"]] - environments = None - if "environment" in dct and len(dct.get("environment")) > 0: - environments = environment_variables_decoder(dct.get("environment")) - return HeadNodeSpec( - compute_template=dct.get("computeTemplate"), - ray_start_params=dct.get("rayStartParams"), - image=dct.get("image"), - service_type=service_type, - enable_ingress=dct.get("enableIngress", False), - volumes=volumes, - service_account=dct.get("service_account", None), - image_pull_secret=dct.get("imagePullSecret", None), - image_pull_policy=dct.get("imagePullPolicy", None), - environment=environments, - annotations=dct.get("annotations", None), - labels=dct.get("labels", None), - ) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py deleted file mode 100644 index a0b2bfcb0..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/jobsubmission.py +++ /dev/null @@ -1,163 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
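# Illustrative usage sketch (not from the patch itself), assuming the repo's
# kfp_support package is installed and a KubeRay API server is reachable at the
# URL below: building the RayJobRequest defined in this module and submitting
# it, mirroring the job-submission test later in this patch.
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import RayJobRequest

runtime_env = """
pip:
  - requests==2.26.0
env_vars:
  counter_name: test_counter
"""
job = RayJobRequest(
    entrypoint="python /home/ray/samples/sample_code.py",
    runtime_env=runtime_env,
    num_cpu=0.5,   # note: the constructor argument is num_cpu, not num_cpus
)
apis = KubeRayAPIs(server_url="http://localhost:8080/ray")
status, error, submission_id = apis.submit_job(ns="default", name="test-job", job_request=job)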
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import datetime -from typing import Any - - -class RayJobRequest: - """ - RayJobRequest used to define job to be submitted to a Ray cluster - It provides APIs to create, stringify and convert to dict. - - Methods: - - Create RayJobRequest: gets the following parameters: - entrypoint - required, the command to start a job on the cluster - submission_id - optional, submission id for the job submission - runtime_env - optional, yaml string specifying job runtime environment - metadata - optional, dictionary of the submission metadata - num_cpus - optional, number of cpus for job execution - num_gpus - optional, number of gpus for job execution - resources - optional, dictionary of the resources for job execution - """ - - def __init__( - self, - entrypoint: str, - submission_id: str = None, - runtime_env: str = None, - metadata: dict[str, str] = None, - num_cpu: float = -1.0, - num_gpu: float = -1.0, - resources: dict[str, str] = None, - ): - """ - Initialization see https://docs.ray.io/en/latest/cluster/running-applications/job-submission/api.html - :param entrypoint: entrypoint - :param submission_id: submission id - :param runtime_env: runtime environment - :param metadata: submission metadata - :param num_cpu: job number cpus - :param num_gpu: job number gpus - :param resources: job custom resources - """ - self.entrypoint = entrypoint - self.submission_id = submission_id - self.runtime_env = runtime_env - self.metadata = metadata - self.num_cpu = num_cpu - self.num_gpu = num_gpu - self.resources = resources - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of job submission - """ - val = f"entrypoint = {self.entrypoint}" - if self.submission_id is not None: - val += f", submission_id = {self.submission_id}" - if self.num_cpu > 0: - val += f", num_cpu = {self.num_cpu}" - if self.num_gpu > 0: - val += f", num_gpu = {self.num_gpu}" - if self.runtime_env is not None: - val += f", runtime_env = {self.runtime_env}" - if self.metadata is not None: - val += f", metadata = {self.metadata}" - if self.resources is not None: - val += f", resources = {self.resources}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of job submission - """ - dct = {"entrypoint": self.entrypoint} - if self.submission_id is not None: - dct["submissionId"] = self.submission_id - if self.runtime_env is not None: - dct["runtimeEnv"] = self.runtime_env - if self.metadata is not None: - dct["metadata"] = self.metadata - if self.num_cpu > 0: - dct["numCpus"] = self.num_cpu - if self.num_gpu > 0: - dct["numGpus"] = self.num_gpu - if self.resources is not None: - dct["resources"] = self.resources - return dct - - -class RayJobInfo: - """ - RayJobInfo used to define information about the job in a Ray cluster - It provides APIs to create and stringify. 
Its output only data, so we do not need to implement to_dict - - Methods: - - Create RayJobRequest: gets the following parameters: - entrypoint - the command to start a job on the cluster - job_id - job execution id - submission_id - submission id for the job submission - runtime_env - job runtime environment - status - job execution status - message - status message - start_time - job start time - end-time - job ind time - error_type - type of error - metadata - optional, dictionary of the submission metadata - """ - - def __init__(self, dct: dict[str, Any]): - """ - Initialize from dictionary - :param dct: dictionary representation of Ray job info - """ - self.entrypoint = dct.get("entrypoint", "") - self.job_id = dct.get("jobId", "") - self.submission_id = dct.get("submissionId", "") - self.status = dct.get("status", "") - self.message = dct.get("message", None) - self.start_time = int(dct.get("startTime", "0")) - self.end_time = int(dct.get("endTime", "0")) - self.error_type = dct.get("ErrorType", None) - self.metadata = dct.get("Metadata", None) - self.runtime_env = dct.get("runtimeEnv", None) - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of Ray job info - """ - val = ( - f"entrypoint = {self.entrypoint}, job id {self.job_id}, submission id = {self.submission_id}," - f" status = {self.status}" - ) - if self.message is not None: - val += f" message = {self.message}" - if self.start_time > 0: - val += ( - f" start time = " - f"{datetime.datetime.fromtimestamp(self.start_time /1.e3).strftime('%Y-%m-%d %H:%M:%S')}" - ) - if self.end_time > 0: - val += ( - f" end time = " f"{datetime.datetime.fromtimestamp(self.end_time / 1e3).strftime('%Y-%m-%d %H:%M:%S')}" - ) - if self.error_type is not None: - val += f" error type = {self.error_type}" - if self.runtime_env is not None: - val += f" runtime env = {str(self.runtime_env)}" - if self.metadata is not None: - val += f" metadata = {str(self.metadata)}" - return val diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py deleted file mode 100644 index 0ef4c1583..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/templates.py +++ /dev/null @@ -1,224 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class TolerationOperation(enum.Enum): - """ - Toleration operation types - """ - - Exists = "Exists" # exists - Equal = "Equal" # equal - - -class TolerationEffect(enum.Enum): - """ - Toleration effect - """ - - NoSchedule = "NoSchedule" # not schedule - PreferNoSchedule = "PreferNoSchedule" # prefer not schedule - NoExecute = "NoExecute" # not execute - - -class Toleration: - """ - Toleration is used by compute template to pick specific nodes for placing pods. 
- It provides APIs to create, stringify and convert to dict. - - Methods: - - Create toleration: gets the following parameters: - key - required, key created by the node's taint - operator - required, operator to apply, supported operators are "Exists" and "Equal" - effect - required, toleration effect supported effects are "NoSchedule", "PreferNoSchedule", "NoExecute" - value - optional, value - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - """ - - def __init__(self, key: str, operator: TolerationOperation, effect: TolerationEffect, value: str = None): - """ - Initialization - :param key: key - :param operator: operator - :param effect: effect - :param value: value - """ - self.key = key - self.operator = operator - self.value = value - self.effect = effect - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of toleration - """ - val = f"key = {self.key}, operator = {self.operator.name}, effect = {self.effect.name}" - if self.value is None: - return val - else: - return val + f", value = {self.value}" - - def to_dict(self) -> dict[str, Any]: - """ - Convert to string - :return: string representation of toleration - """ - dct = {"key": self.key, "operator": self.operator.value, "effect": self.effect.value} - if self.value is not None: - dct["value"] = self.value - return dct - - -# Here the default gpu-accelerator is "nvidia.com/gpu", that is used for generating limits. -# If it is specified, it has to be in the format that is understood by kubernetes as a valid -# The following devices are currently supported by kubernetes: -# AMD - gpu accelerator amd.com/gpu -# Intel - gpu accelerator gpu.intel.com/i915 -# NVIDIA - gpu accelerator nvidia.com/gpu - - -class Template: - """ - Template is used to define specific nodes configuration. - It provides APIs to create, stringify and convert to dict. 
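# Illustrative usage sketch (not from the patch itself), assuming the repo's
# kfp_support package is installed and a KubeRay API server is reachable at the
# URL below: defining a compute template with a toleration and registering it,
# following the same calls as the template test later in this patch.
from kfp_support.api_server_client import KubeRayAPIs
from kfp_support.api_server_client.params import (
    Template,
    Toleration,
    TolerationEffect,
    TolerationOperation,
)

toleration = Toleration(key="gpu-node", operator=TolerationOperation.Exists, effect=TolerationEffect.NoSchedule)
template = Template(name="gpu-template", namespace="default", cpu=4, memory=16, gpu=1, tolerations=[toleration])
apis = KubeRayAPIs(server_url="http://localhost:8080/ray")
status, error = apis.create_compute_template(template)   # 200 / None on success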
- - Methods: - - Create templates: gets the following parameters: - name - required, template name - namespace - required, template namespace - cpus - required, template number of cpus - memory - required, template memory (GB) - gpus - optional, number of GPUs, default 0 - gpu_accelerator - optional, if not defined nvidia.com/gpu is assumed - tolerations - optional, tolerations for pod placing, default none - - to_string() -> str: convert toleration to string for printing - - to_dict() -> dict[str, Any] convert to dict - - to_json() -> str convert to json string - """ - - def __init__( - self, - name: str, - namespace: str, - cpu: int, - memory: int, - gpu: int = 0, - gpu_accelerator: str = None, - tolerations: list[Toleration] = None, - ): - """ - Initialization - :param name: name - :param namespace: namespace - :param cpu: cpu - :param memory: memory - :param gpu: gpu - :param gpu_accelerator: accelerator type - :param tolerations: tolerations - """ - self.name = name - self.namespace = namespace - self.cpu = cpu - self.memory = memory - self.gpu = gpu - self.gpu_accelerator = gpu_accelerator - self.tolerations = tolerations - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of template - """ - val = f"name = {self.name}, namespace = {self.namespace}, cpu = {self.cpu}, memory = {self.memory}" - if self.gpu > 0: - val = val + f", gpu {self.gpu}" - if self.gpu_accelerator is not None: - val = val + f", gpu accelerator {self.gpu_accelerator}" - if self.tolerations is None: - return val - val = val + ", tolerations [" - first = True - for tol in self.tolerations: - if first: - first = False - val = val + "{" + tol.to_string() + "}" - else: - val = val + ", {" + tol.to_string() + "}" - return val + "]" - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of template - """ - dct = {"name": self.name, "namespace": self.namespace, "cpu": self.cpu, "memory": self.memory} - if self.gpu > 0: - dct["gpu"] = self.gpu - if self.gpu_accelerator is not None: - dct["gpu accelerator"] = self.gpu_accelerator - if self.tolerations is not None: - dct["tolerations"] = [tl.to_dict() for tl in self.tolerations] - return dct - - -""" - Creates new toleration from dictionary, used for unmarshalling json. 
Python does not - support multiple constructors, so do it this way -""" - - -def toleration_decoder(dct: dict[str, Any]) -> Toleration: - """ - Create toleration from dictionary - :param dct: dictionary representation of toleration - :return: toleration - """ - return Toleration( - key=dct.get("key"), - operator=TolerationOperation(dct.get("operator", "Exists")), - effect=TolerationEffect(dct.get("effect", "NoSchedule")), - value=dct.get("value"), - ) - - -def template_decoder(dct: dict[str, Any]) -> Template: - """ - Create template from dictionary - :param dct: dictionary representation of template - :return: template - """ - tolerations = None - if "tolerations" in dct: - tolerations = [toleration_decoder(d) for d in dct["tolerations"]] - return Template( - name=dct.get("name"), - namespace=dct.get("namespace"), - cpu=int(dct.get("cpu", "0")), - memory=int(dct.get("memory", "0")), - gpu=int(dct.get("gpu", "0")), - gpu_accelerator=dct.get("gpu_accelerator"), - tolerations=tolerations, - ) - - -def templates_decoder(dct: dict[str, Any]) -> list[Template]: - """ - Create list of template from dictionary - :param dct: dictionary representation of list of template - :return: list of template - """ - return [template_decoder(tmp) for tmp in dct["computeTemplates"]] diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py b/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py deleted file mode 100644 index fee0e1ea4..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/api_server_client/params/volumes.py +++ /dev/null @@ -1,449 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import enum -from typing import Any - - -class HostPath(enum.Enum): - """ - Host path enumeration - """ - - DIRECTORY = 0 # directory - FILE = 1 # files - - -class MountPropagationMode(enum.Enum): - """ - Mount propagation enumeration - """ - - NONE = 0 # None - HOSTTOCONTAINER = 1 # host to container - BIDIRECTIONAL = 2 # bi directional - - -class AccessMode(enum.Enum): - """ - Access mode enumeration - """ - - RWO = 0 # read write once - ROX = 1 # read only many - RWX = 2 # read write many - - -class BaseVolume: - """ - KubeRay currently support several types of volumes, including hostPat, PVC, - ephemeral volumes, config maps, secrets and empty dir. All of them use slightly - different parameters. Base Volume is a base class for all different volume types. - """ - - def to_string(self) -> str: - """ - Convert to string - :return: string representation of base volume - """ - raise Exception(f"Base volume cannot be used directly. Pls use one of the derived classes") - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: dictionary representation of base volume - """ - raise Exception(f"Base volume cannot be used directly. 
Pls use one of the derived classes") - - -class HostPathVolume(BaseVolume): - """ - This class implements HostPath volume. In addition to name and mount path it requires host - path volume specific parameters: - source - data location on host - hostPathType - host path type: directory (0) or file (1) - mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) - - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - host_path_type: HostPath = None, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param host_path_type: host path type - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.host_path_type = host_path_type - self.volume_type = 1 - self.mount_propagation = mount_propagation - - def to_string(self) -> str: - """ - Convert to string - :return: HostPathVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = hostPath" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - if self.host_path_type is not None: - val += f", host path type = {self.host_path_type.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: HostPathVolume dictionary representation - """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.mount_propagation is not None: - dst["mountPropagationMode"] = self.mount_propagation.value - if self.host_path_type is not None: - dst["hostPathType"] = self.host_path_type.value - return dst - - -class PVCVolume(BaseVolume): - """ - This class implements PVC volume. In addition to name and mount path it requires - PVC volume specific parameters: - source - PVC claim name - read_only - read only flag - mountPropagationMode - mount propagation: None (0), host to container (1) or bidirectional (2) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - read_only: bool = False, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param read_only: read only - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.volume_type = 0 - self.mount_propagation = mount_propagation - self.readonly = read_only - - def to_string(self) -> str: - """ - Convert to string - :return: PVCVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = PVC" - if self.readonly: - val += ", read only = True" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: PVCVolume dictionary representation - """ - dst = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.readonly: - dst["readOnly"] = True - if self.mount_propagation is not None: - dst["mountPropagationMode"] = self.mount_propagation.value - return dst - - -class EphemeralVolume(BaseVolume): - """ - This class implements Ephemeral volume. 
In addition to name and mount path it requires - Ephemeral volume specific parameters: - storage - disk size (valid k8 value, for example 5Gi) - storageClass - storage class - optional, if not specified, use default - accessMode - access mode RWO - optional ReadWriteOnce (0), ReadOnlyMAny (1), ReadWriteMany (2) - mountPropagationMode - optional mount propagation: None (0), host to container (1) or bidirectional (2) - """ - - def __init__( - self, - name: str, - mount_path: str, - storage: str, - storage_class: str = None, - access_mode: AccessMode = None, - mount_propagation: MountPropagationMode = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param storage: storage - :param storage_class: storage class - :param access_mode: access mode - :param mount_propagation: mount propagation - """ - self.name = name - self.mount_path = mount_path - self.storage = storage - self.volume_type = 2 - self.mount_propagation = mount_propagation - self.storage_class = storage_class - self.access_mode = access_mode - - def to_string(self) -> str: - """ - Convert to string - :return: EphemeralVolume string representation - """ - val = ( - f"name = {self.name}, mount_path = {self.mount_path}, storage = {self.storage} " f"volume type = ephemeral" - ) - if self.storage_class is not None: - val += f", storage class = {self.storage_class}" - if self.access_mode is not None: - val += f", access mode = {self.access_mode.name}" - if self.mount_propagation is not None: - val += f", mount propagation = {self.mount_propagation.name}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: EphemeralVolume dictionary representation - """ - dct = { - "name": self.name, - "mountPath": self.mount_path, - "storage": self.storage, - "volumeType": self.volume_type, - } - if self.storage_class is not None: - dct["storageClassName"] = self.storage_class - if self.access_mode is not None: - dct["accessMode"] = self.access_mode.value - if self.mount_propagation is not None: - dct["mountPropagationMode"] = self.mount_propagation.value - return dct - - -class EmptyDirVolume(BaseVolume): - """ - This class implements EmptyDir volume. In addition to name and mount path it requires - Empty Dir specific parameters: - storage - optional max storage size (valid k8 value, for example 5Gi) - """ - - def __init__(self, name: str, mount_path: str, storage: str = None): - """ - Initialization - :param name: name - :param mount_path: mount_path - :param storage: storage - """ - self.name = name - self.mount_path = mount_path - self.storage = storage - self.volume_type = 5 - - def to_string(self) -> str: - """ - Convert to string - :return: EmptyDirVolume string representation - """ - val = f"name = {self.name}, mount_path = {self.mount_path}, volume type = emptyDir" - if self.storage is not None: - val += f", storage = {self.storage}" - return val - - def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "volumeType": self.volume_type} - if self.storage is not None: - dct["storage"] = self.storage - return dct - - -class ConfigMapVolume(BaseVolume): - """ - This class implements ConfigMap volume. 
In addition to name and mount path it requires - configMap volume specific parameters: - source - required, config map name - items - optional, key/path items (optional) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - items: dict[str, str] = None, - ): - """ - Initialization - :param name: name - :param mount_path: mount path - :param source: source - :param items: items - """ - self.name = name - self.mount_path = mount_path - self.source = source - self.items = items - self.volume_type = 3 - - def to_string(self) -> str: - """ - Convert to string - :return: ConfigMapVolume string representation - """ - val = ( - f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = configmap" - ) - if self.items is not None: - val = val + f", items = {str(self.items)}" - return val - - def to_dict(self) -> dict[str, Any]: - """ - Convert to dictionary - :return: ConfigMapVolume dictionary representation - """ - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.items is not None: - dct["items"] = self.items - return dct - - -class SecretVolume(BaseVolume): - """ - This class implements Secret volume. In addition to name and mount path it requires - Secret volume specific parameters: - source - required, secret name - items - optional, key/path items (optional) - """ - - def __init__( - self, - name: str, - mount_path: str, - source: str, - items: dict[str, str] = None, - ): - self.name = name - self.mount_path = mount_path - self.source = source - self.items = items - self.volume_type = 4 - - def to_string(self) -> str: - val = f"name = {self.name}, mount_path = {self.mount_path}, source = {self.source}, " f"volume type = secret" - if self.items is not None: - val = val + f", items = {str(self.items)}" - return val - - def to_dict(self) -> dict[str, Any]: - dct = {"name": self.name, "mountPath": self.mount_path, "source": self.source, "volumeType": self.volume_type} - if self.items is not None: - dct["items"] = self.items - return dct - - -""" - Creates new Volume from dictionary, used for unmarshalling json. 
Python does not - support multiple constructors, so do it this way -""" - - -def volume_decoder(dst: dict[str, Any]) -> BaseVolume: - def _get_mount_propagation() -> MountPropagationMode: - if "mountPropagationMode" in dst: - return MountPropagationMode(int(dst.get("mountPropagationMode", "0"))) - return None - - def _get_host_path() -> HostPath: - if "hostPathType" in dst: - return HostPath(int(dst.get("hostPathType", "0"))) - return None - - def _get_access_mode() -> AccessMode: - if "accessMode" in dst: - return AccessMode(int(dst.get("accessMode", "0"))) - return None - - match dst["volumeType"]: - case 0: - # PVC - return PVCVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - read_only=dst.get("readOnly", False), - mount_propagation=_get_mount_propagation(), - ) - case 1: - # host path - return HostPathVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - host_path_type=_get_host_path(), - mount_propagation=_get_mount_propagation(), - ) - case 2: - # Ephemeral volume - return EphemeralVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - storage=dst.get("storage", ""), - storage_class=dst.get("storageClassName"), - access_mode=_get_access_mode(), - mount_propagation=_get_mount_propagation(), - ) - case 3: - # ConfigMap Volume - return ConfigMapVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - items=dst.get("items"), - ) - case 4: - # Secret Volume - return SecretVolume( - name=dst.get("name", ""), - mount_path=dst.get("mountPath", ""), - source=dst.get("source", ""), - items=dst.get("items"), - ) - case 5: - # Empty dir volume - return EmptyDirVolume( - name=dst.get("name", ""), mount_path=dst.get("mountPath", ""), storage=dst.get("storage") - ) - case _: - raise Exception(f"Unknown volume type in {dst}") diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md deleted file mode 100644 index 4943a0b06..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Workflow Utils for KFP v1 - -This library provides 3 main classes: -* KFPUtils - helper utilities for KFP implementations -* PipelinesUtils - helper class for pipeline management based on KFP client -* RayRemoteJobs - class supporting Ray remote jobs - -## KFPUtils - -This class contains a collection of functions useful for KFP pipelines implementation, which include: -* credentials - get S3 credentials from the environment -* get_namespace - get the name of the kubernetes namespace we are running in -* runtime_name - generates unique runtime name -* dict_to_req - convert dictionary of request parameters to a proper formatted JSON string -* load_from_json - convert json string to dictionary and exit with error if conversion fails - -## PipelinesUtils - -This class provides some higher level functionality based on the capabilities of the python KFP client, including" -* get_experiment_by_name obtains KFP experiment object based on its name -* get_pipeline_by_name obtains KFP pipeline object based on its name -* start_pipeline start a pipeline represented by pipeline object in experiment represented by experiment object and a -dictionary of parameters. 
It returns kfp run ID -* wait_pipeline_completion - waits for the completion of the pipeline run with the given ID - -## RayRemoteJobs - -At the moment there is no "standard" approach for KubeRay remote APIs. There are several options available, -including [codeflareSDK](https://github.com/project-codeflare/codeflare-sdk/tree/1fe04c3022d98bc286454dea2cd1e31709961bd2/src/codeflare_sdk) -[KubeRay Python Apis](https://github.com/ray-project/kuberay/tree/master/clients/python-client) and -[KubeRay API server APIs](https://github.com/ray-project/kuberay/tree/master/clients/python-apiserver-client) to name a few. -We are using here KubeRay API server APIs, but in order to simplify possible transition to another APIs. this class -implements 4 high-level methods, that allow to hide the specifics of the particular APIs. This methods are: -* create_ray_cluster - creates Ray cluster. -* delete_ray_cluster - deletes Ray cluster. -* submit_job - submits Ray job to the cluster -* follow_execution - watching job execution to completion, periodically printing out the job log -These basic methods can be used as a foundation of any KFP pipeline implementation - -## ComponentUtils - -This class provides some methods to simplify building pipelines: -* add_settings_to_component - adds settings to component, including timeout, image_pull_policy and cache strategy -* set_cos_env_vars_to_component - sets environment variables to support S3 -* default_compute_execution_params - default implementation of compute execution parameters (based on CPU, GPU and memory requirements) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py deleted file mode 100644 index bbe1476fb..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from kfp_support.workflow_support.compile_utils.component import ( - ComponentUtils -) diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py deleted file mode 100644 index 1f66bf59f..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/compile_utils/component.py +++ /dev/null @@ -1,101 +0,0 @@ -import kfp.dsl as dsl -from kfp import kubernetes -from typing import Dict - -RUN_NAME = "KFP_RUN_NAME" - -class ComponentUtils: - """ - Class containing methods supporting building pipelines - """ - - @staticmethod - def add_settings_to_component( - task: dsl.PipelineTask, - timeout: int, - image_pull_policy: str = "IfNotPresent", - cache_strategy: bool = False, - ) -> None: - """ - Add settings to kfp task - :param task: kfp task - :param timeout: timeout to set to the component in seconds - :param image_pull_policy: pull policy to set to the component - :param cache_strategy: cache strategy - """ - - kubernetes.use_field_path_as_env(task, env_name=RUN_NAME, - field_path="metadata.annotations['pipelines.kubeflow.org/run_name']") - # Set cashing - task.set_caching_options(enable_caching=cache_strategy) - # image pull policy - kubernetes.set_image_pull_policy(task, image_pull_policy) - # Set the timeout for the task to one day (in seconds) - kubernetes.set_timeout(task, seconds=timeout) - - @staticmethod - def set_s3_env_vars_to_component( - task: dsl.PipelineTask, - secret: str = '', - env2key: Dict[str, str] = {'s3-key': 'S3_KEY', 
's3-secret': 'S3_SECRET', 's3-endpoint': 'ENDPOINT'}, - prefix: str = None, - ) -> None: - """ - Set S3 env variables to KFP component - :param task: kfp task - :param secret: secret name with the S3 credentials - :param env2key: dict with mapping each env variable to a key in the secret - :param prefix: prefix to add to env name - """ - - if prefix is not None: - for env_name, _ in env2key.items(): - env2key[prefix + "_" + env_name] = env2key.pop(env_name) - kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) - - @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py deleted file mode 100644 index d2301bd0a..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils -from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py deleted file mode 100644 index ef00b0e92..000000000 --- a/kfp/kfp_support_lib_v2/src/kfp_support/workflow_support/runtime_utils/kfp_utils.py +++ /dev/null @@ -1,113 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
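# Illustrative usage sketch (not from the patch itself), assuming the repo's
# kfp_support package is installed: exercising the KFPUtils helpers defined in
# this module. The values in the comments are worked examples, not recorded
# outputs.
from kfp_support.workflow_support.runtime_utils import KFPUtils

name = KFPUtils.runtime_name(ray_name="noop_transform", run_id="12345678")
# '_' is replaced by '-', the name is lower-cased and truncated, e.g. "noop-tran-12345"
cmd = KFPUtils.dict_to_req({"run_id": name, "num_workers": 5})
# -> 'python transformer_launcher.py --run_id="noop-tran-12345" --num_workers=5 '
params = KFPUtils.load_from_json('{"num_cpus": 0.5, "memory": 2}')   # plain json string -> dict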
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import json -import os -import re -import sys -from typing import Any - -from data_processing.utils import get_logger - - -logger = get_logger(__name__) - - -class KFPUtils: - """ - Helper utilities for KFP implementations - """ - - @staticmethod - def credentials( - access_key: str = "S3_KEY", secret_key: str = "S3_SECRET", endpoint: str = "ENDPOINT" - ) -> tuple[str, str, str]: - """ - Get credentials from the environment - :param access_key: environment variable for access key - :param secret_key: environment variable for secret key - :param endpoint: environment variable for S3 endpoint - :return: - """ - s3_key = os.getenv(access_key, None) - s3_secret = os.getenv(secret_key, None) - s3_endpoint = os.getenv(endpoint, None) - if s3_key is None or s3_secret is None or s3_endpoint is None: - logger.warning("Failed to load s3 credentials") - return s3_key, s3_secret, s3_endpoint - - @staticmethod - def get_namespace() -> str: - """ - Get k8 namespace that we are running it - :return: - """ - ns = "" - try: - file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") - except Exception as e: - logger.warning( - f"Failed to open /var/run/secrets/kubernetes.io/serviceaccount/namespace file, " f"exception {e}" - ) - else: - with file: - ns = file.read() - return ns - - @staticmethod - def runtime_name(ray_name: str = "", run_id: str = "") -> str: - """ - Get unique runtime name - :param ray_name: - :param run_id: - :return: runtime name - """ - # K8s objects cannot contain special characters, except '_', All characters should be in lower case. - if ray_name != "": - ray_name = ray_name.replace("_", "-").lower() - pattern = r"[^a-zA-Z0-9-]" # the ray_name cannot contain upper case here, but leave it just in case. - ray_name = re.sub(pattern, "", ray_name) - else: - ray_name = "a" - # the return value plus namespace name will be the name of the Ray Route, - # which length is restricted to 64 characters, - # therefore we restrict the return name by 15 character. 
- if run_id != "": - return f"{ray_name[:9]}-{run_id[:5]}" - return ray_name[:15] - - @staticmethod - def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str: - res = f"python {executor} " - for key, value in d.items(): - if str(value) != "": - if isinstance(value, str): - if '"' in value: - logger.warning(f"can't parse inputs with double quotation marks, please use single quotation marks instead") - res += f'--{key}="{value}" ' - elif isinstance(value, bool): - if value: - res += f"--{key} " - else: - res += f"--{key}={value} " - - logger.info(f"request to execute: {res}") - return res - - # Load a string that represents a json to python dictionary - @staticmethod - def load_from_json(js: str) -> dict[str, Any]: - try: - return json.loads(js) - except Exception as e: - logger.warning(f"Failed to load parameters {js} with error {e}") - sys.exit(1) diff --git a/kfp/kfp_support_lib_v2/test/configmaps.py b/kfp/kfp_support_lib_v2/test/configmaps.py deleted file mode 100644 index 65e53e828..000000000 --- a/kfp/kfp_support_lib_v2/test/configmaps.py +++ /dev/null @@ -1,72 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -from kubernetes import client, config - - -CMAP_VALUE = """ -import ray -import os -import requests - -ray.init() - -@ray.remote -class Counter: - def __init__(self): - # Used to verify runtimeEnv - self.name = os.getenv("counter_name") - assert self.name == "test_counter" - self.counter = 0 - - def inc(self): - self.counter += 1 - - def get_counter(self): - return "{} got {}".format(self.name, self.counter) - -counter = Counter.remote() - -for _ in range(5): - ray.get(counter.inc.remote()) - print(ray.get(counter.get_counter.remote())) - -# Verify that the correct runtime env was used for the job. -assert requests.__version__ == "2.26.0" -""" -CMAP_NAME = "ray-job-code-sample" - - -class ConfigmapsManager: - """ - Simple support class to manage config maps. 
Assumes local access to Kubectl - """ - - def __init__(self): - config.load_kube_config() - self.api_instance = client.CoreV1Api() - - def list_configmaps(self) -> list[str]: - cm_list = self.api_instance.list_namespaced_config_map(namespace="default").items - return [cm.metadata.name for cm in cm_list] - - def create_code_map(self) -> None: - cmap = client.V1ConfigMap() - cmap.metadata = client.V1ObjectMeta(name=CMAP_NAME) - cmap.data = {"sample_code.py": CMAP_VALUE} - self.api_instance.create_namespaced_config_map(namespace="default", body=cmap) - - def delete_code_map(self) -> None: - try: - self.api_instance.delete_namespaced_config_map(name="ray-job-code-sample", namespace="default") - except Exception as e: - print("config map ray-job-code-sample does not exist") diff --git a/kfp/kfp_support_lib_v2/test/kuberay_api_test.py b/kfp/kfp_support_lib_v2/test/kuberay_api_test.py deleted file mode 100644 index b2a444ce3..000000000 --- a/kfp/kfp_support_lib_v2/test/kuberay_api_test.py +++ /dev/null @@ -1,297 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import time - -from configmaps import ConfigmapsManager -from kfp_support.api_server_client import KubeRayAPIs -from kfp_support.api_server_client.params import ( - DEFAULT_WORKER_START_PARAMS, - Cluster, - ClusterSpec, - ConfigMapVolume, - EnvironmentVariables, - HeadNodeSpec, - RayJobRequest, - ServiceType, - Template, - Toleration, - TolerationEffect, - TolerationOperation, - WorkerNodeSpec, -) - - -def test_templates(): - """ - Test template - """ - # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") - # cleanup - _, _ = apis.delete_compute_template(ns="default", name="default-template") - # create - toleration = Toleration(key="blah1", operator=TolerationOperation.Exists, effect=TolerationEffect.NoExecute) - template = Template(name="default-template", namespace="default", cpu=2, memory=8, tolerations=[toleration]) - status, error = apis.create_compute_template(template) - assert status == 200 - assert error is None - # duplicate create should fail - status, error = apis.create_compute_template(template) - assert status != 200 - assert error is not None - print(f"\nstatus {status}, error code: {str(error)}") - # get - status, error, t = apis.get_compute_template(ns="default", name="default-template") - assert status == 200 - assert error is None - assert template.to_string() == t.to_string() - # list - status, error, template_array = apis.list_compute_templates() - assert status == 200 - assert error is None - assert template.to_string() == template_array[0].to_string() - # list ns - status, error, template_array = apis.list_compute_templates_namespace(ns="default") - assert status == 200 - assert error is None - assert template.to_string() == template_array[0].to_string() - # delete - status, error = apis.delete_compute_template(ns="default", name="default-template") - assert status == 200 - 
assert error is None - # duplicate delete should fail - status, error = apis.delete_compute_template(ns="default", name="default-template") - assert status != 200 - assert error is not None - print(f"status: {status}, err = {str(error)}") - - -def test_cluster(): - """ - Test cluster - """ - # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") - # cleanup - _, _ = apis.delete_compute_template(ns="default", name="default-template") - _, _ = apis.delete_cluster(ns="default", name="test") - # Create configmap - cm_manager = ConfigmapsManager() - cm_manager.delete_code_map() - cm_manager.create_code_map() - # Create template first - template = Template(name="default-template", namespace="default", cpu=2, memory=4) - status, error = apis.create_compute_template(template) - assert status == 200 - assert error is None - # cluster - volume = ConfigMapVolume( - name="code-sample", - mount_path="/home/ray/samples", - source="ray-job-code-sample", - items={"sample_code.py": "sample_code.py"}, - ) - environment = EnvironmentVariables(key_value={"key": "value"}) - head = HeadNodeSpec( - compute_template="default-template", - ray_start_params={"metrics-export-port": "8080", "num-cpus": "0"}, - image="rayproject/ray:2.9.3-py310", - service_type=ServiceType.ClusterIP, - volumes=[volume], - environment=environment, - image_pull_policy="Always", - ) - worker = WorkerNodeSpec( - group_name="small", - compute_template="default-template", - replicas=1, - min_replicas=1, - max_replicas=1, - ray_start_params=DEFAULT_WORKER_START_PARAMS, - image="rayproject/ray:2.9.3-py310", - volumes=[volume], - environment=environment, - image_pull_policy="Always", - ) - t_cluster = Cluster( - name="test", - namespace="default", - user="boris", - version="2.9.0", - cluster_spec=ClusterSpec(head_node=head, worker_groups=[worker]), - ) - # create - status, error = apis.create_cluster(t_cluster) - assert status == 200 - assert error is None - # get - status, error, c = apis.get_cluster(ns="default", name="test") - assert status == 200 - assert error is None - print(f"\ngot cluster: {c.to_string()}") - # list - status, error, clusters = apis.list_clusters() - assert status == 200 - assert error is None - assert len(clusters) == 1 - print(f"got cluster: {clusters[0].to_string()}") - # list namespace - status, error, clusters = apis.list_clusters_namespace(ns="default") - assert status == 200 - assert error is None - assert len(clusters) == 1 - print(f"got cluster: {clusters[0].to_string()}") - # get cluster status - status, error, cs = apis.get_cluster_status(ns="default", name="test") - assert status == 200 - assert error is None - print(f"cluster status is {cs}") - # Wait for the cluster to get ready - status, error = apis.wait_cluster_ready(ns="default", name="test") - assert status == 200 - assert error is None - # get endpoints - status, error, endpoint = apis.get_cluster_endpoints(ns="default", name="test") - assert status == 200 - assert error is None - print(f"cluster endpoints is {endpoint}") - # delete cluster - status, error = apis.delete_cluster(ns="default", name="test") - assert status == 200 - assert error is None - # delete template - status, error = apis.delete_compute_template(ns="default", name="default-template") - assert status == 200 - assert error is None - - -def test_job_submission(): - """ - Test job submission - :return: - """ - # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") - # cleanup - _, _ = apis.delete_compute_template(ns="default", 
name="default-template") - _, _ = apis.delete_cluster(ns="default", name="test-job") - # Create configmap - cm_manager = ConfigmapsManager() - cm_manager.delete_code_map() - cm_manager.create_code_map() - # Create template first - template = Template(name="default-template", namespace="default", cpu=2, memory=4) - status, error = apis.create_compute_template(template) - assert status == 200 - assert error is None - # cluster - volume = ConfigMapVolume( - name="code-sample", - mount_path="/home/ray/samples", - source="ray-job-code-sample", - items={"sample_code.py": "sample_code.py"}, - ) - environment = EnvironmentVariables(key_value={"key": "value"}) - head = HeadNodeSpec( - compute_template="default-template", - ray_start_params={"metrics-export-port": "8080", "num-cpus": "0"}, - image="rayproject/ray:2.9.3-py310", - service_type=ServiceType.ClusterIP, - volumes=[volume], - environment=environment, - image_pull_policy="IfNotPresent", - ) - worker = WorkerNodeSpec( - group_name="small", - compute_template="default-template", - replicas=1, - min_replicas=1, - max_replicas=1, - ray_start_params=DEFAULT_WORKER_START_PARAMS, - image="rayproject/ray:2.9.3-py310", - volumes=[volume], - environment=environment, - image_pull_policy="IfNotPresent", - ) - t_cluster = Cluster( - name="test-job", - namespace="default", - user="boris", - version="2.9.0", - cluster_spec=ClusterSpec(head_node=head, worker_groups=[worker]), - ) - # create - status, error = apis.create_cluster(t_cluster) - assert status == 200 - assert error is None - # Wait for the cluster to get ready - status, error = apis.wait_cluster_ready(ns="default", name="test-job") - assert status == 200 - assert error is None - # submit Ray job - resource_yaml = """ - pip: - - requests==2.26.0 - - pendulum==2.1.2 - env_vars: - counter_name: test_counter - """ - job_request = RayJobRequest( - entrypoint="python /home/ray/samples/sample_code.py", runtime_env=resource_yaml, num_cpu=0.5 - ) - # To ensure that Ray cluster HTTP is ready try to get jobs info from the cluster - status, error, job_info_array = apis.list_job_info(ns="default", name="test-job") - assert status == 200 - assert error is None - print("\n initial jobs info") - for inf in job_info_array: - print(f" {inf.to_string()}") - time.sleep(5) - status, error, sid = apis.submit_job(ns="default", name="test-job", job_request=job_request) - assert status == 200 - assert error is None - time.sleep(10) - # get Ray job info - status, error, jinfo = apis.get_job_info(ns="default", name="test-job", sid=sid) - assert status == 200 - assert error is None - print(f"\njobs info {jinfo.to_string()}") - # get Ray jobs info - status, error, job_info_array = apis.list_job_info(ns="default", name="test-job") - assert status == 200 - assert error is None - print("jobs info") - for inf in job_info_array: - print(f" {inf.to_string()}") - # get Ray job log - time.sleep(5) # wait till log is available - status, error, jlog = apis.get_job_log(ns="default", name="test-job", sid=sid) - assert status == 200 - assert error is None - print(f"job log {jlog}") - # stop Ray job - status, error = apis.stop_ray_job(ns="default", name="test-job", sid=sid) - assert status == 200 - assert error is None - # delete Ray job - status, error = apis.delete_ray_job(ns="default", name="test-job", sid=sid) - assert status == 200 - assert error is None - # delete cluster - status, error = apis.delete_cluster(ns="default", name="test-job") - assert status == 200 - assert error is None - # delete template - status, error = 
apis.delete_compute_template(ns="default", name="default-template") - assert status == 200 - assert error is None diff --git a/kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py deleted file mode 100644 index f9a5cfee8..000000000 --- a/kfp/kfp_support_lib_v2/test/ray_remote_jobs_test.py +++ /dev/null @@ -1,90 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -from configmaps import ConfigmapsManager -from kfp_support.api_server_client.params import ConfigMapVolume -from kfp_support.workflow_support.runtime_utils import RayRemoteJobs - - -def test_ray_remote_jobs(): - """ - Test the full cycle of job submission - :return: - """ - # This shows how to create volumes dictionary - volumes = [ - ConfigMapVolume( - name="code-sample", - mount_path="/home/ray/samples", - source="ray-job-code-sample", - items={"sample_code.py": "sample_code.py"}, - ) - ] - dct_volumes = {"volumes": [v.to_dict() for v in volumes]} - - head_node = { - "cpu": 2, - "memory": 4, - "image": "rayproject/ray:2.9.3-py310", - # Ray start params, just to show - "ray_start_params": {"metrics-export-port": "8080", "num-cpus": "0", "dashboard-host": "0.0.0.0"}, - "image_pull_policy": "Always", - } | dct_volumes - - worker_node = { - "cpu": 2, - "memory": 4, - "image": "rayproject/ray:2.9.3-py310", - "replicas": 1, - "min_replicas": 1, - "max_replicas": 1, - "image_pull_policy": "Always", - } | dct_volumes - - # Create configmap for testing - cm_manager = ConfigmapsManager() - cm_manager.delete_code_map() - cm_manager.create_code_map() - - # create cluster - remote_jobs = RayRemoteJobs(server_url="http://localhost:8080/ray") - status, error = remote_jobs.create_ray_cluster( - name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] - ) - print(f"Created cluster - status: {status}, error: {error}") - assert status == 200 - assert error is None - # submitting ray job - runtime_env = """ - pip: - - requests==2.26.0 - - pendulum==2.1.2 - env_vars: - counter_name: test_counter - """ - status, error, submission = remote_jobs.submit_job( - name="job-test", - namespace="default", - request={}, - runtime_env=runtime_env, - executor="/home/ray/samples/sample_code.py", - ) - print(f"submit job - status: {status}, error: {error}, submission id {submission}") - assert status == 200 - assert error is None - # print execution log - remote_jobs.follow_execution(name="job-test", namespace="default", submission_id=submission, print_timeout=20) - # cleanup - status, error = remote_jobs.delete_ray_cluster(name="job-test", namespace="default") - print(f"Deleted cluster - status: {status}, error: {error}") - assert status == 200 - assert error is None diff --git a/kfp/requirements.env b/kfp/requirements.env index 7a9c3f360..3b08a3109 100644 --- a/kfp/requirements.env +++ b/kfp/requirements.env @@ -1,2 +1,3 @@ RAY=2.9.3 -KFP=2.7.0 +KFP_v2=2.7.0 +KFP_v1=1.8.5 \ 
No newline at end of file diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index b3aba7cdb..02a4bea6a 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -29,23 +29,23 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0-kfp-v21" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1-kfp-v21" + +# path to kfp component specifications files +component_spec_path = "../../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. @dsl.component(base_image=base_kfp_image) -def compute_exec_params_op(worker_options: str, actor_options: str) -> str: - from kfp_support.workflow_support.runtime_utils import ComponentUtils - - return ComponentUtils.default_compute_execution_params(worker_options, actor_options) - - +compute_exec_params_op = comp.func_to_container_op( + func=ComponentUtils.default_compute_execution_params +) # create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
TASK_NAME: str = "noop" @@ -115,11 +115,11 @@ def noop( # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params -# compute_exec_params = compute_exec_params_op( - # worker_options=ray_worker_options, - # actor_options=runtime_actor_options, - # ) - # ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2,image_pull_policy="Always") + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2,image_pull_policy="Always") # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, @@ -139,14 +139,14 @@ def noop( # note that the parameters below are specific for NOOP transform exec_params={ "data_s3_config": "{'input_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/input/', 'output_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/output/'}", - "data_max_files": -1, - "data_num_samples": -1, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, "runtime_num_workers": "1", "runtime_worker_options": "{'num_cpus': 0.8}", - "runtime_pipeline_id": "pipeline_id", + "runtime_pipeline_id": runtime_actor_options, "runtime_job_id": RUN_ID, - "runtime_code_location": "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - "noop_sleep_sec": 10, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, }, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, From eeadb25a61e40bc7a45326c088a60d2cf29bdc8c Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Sun, 2 Jun 2024 10:35:47 +0300 Subject: [PATCH 13/64] add kfp_support_lib/Makefile Signed-off-by: Alexey Roytman --- kfp/kfp_support_lib/Makefile | 48 ++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 kfp/kfp_support_lib/Makefile diff --git a/kfp/kfp_support_lib/Makefile b/kfp/kfp_support_lib/Makefile new file mode 100644 index 000000000..60e5d6ac9 --- /dev/null +++ b/kfp/kfp_support_lib/Makefile @@ -0,0 +1,48 @@ +################################################################################################################# +# +# This is the top level makefile, which is intended to be able to process a common set of rules on all +# sub-projects underneath this directory. Currently, the common/standardized set of rules are as follows +# and supported by .make.defaults +# +# setup: +# clean: +# build: +# test: +# +# When finally getting to a makefile that requires a rule implementation, for example to test the build, +# that makefile should override/implement the rule to meet its needs. Such a rule may continue to recurse +# using "$(MAKE) -recurse", for example "$(MAKE) test-recurse". +# +# Each rule is called recursively on sub-directories and if a similar inclusion is done in the sub-Makefiles, +# the rules will be applied/executed recursively in their sub-directories. +# +################################################################################################################# + +REPOROOT=../.. + +# Get some common rules for the whole repo +include $(REPOROOT)/.make.defaults + +########## ########## ########## ########## ########## ########## ########## ########## +# Global rules that are generally to be implemented in the sub-directories and can +# be overridden there (the double colon on the rule makes the overridable). 
+ +clean:: + @# Help: Recursively $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +setup:: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +build:: + @# Help: Recursively $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +test:: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +image:: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse \ No newline at end of file From e343bdbdef2db72528400163bd22615dfdc75ef9 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Sun, 2 Jun 2024 14:41:15 +0300 Subject: [PATCH 14/64] kfpv1 intermediate impelmentation Signed-off-by: Alexey Roytman --- .make.versions | 2 +- kfp/kfp_ray_components/Dockerfile | 8 ++++++-- kfp/kfp_ray_components/Makefile | 12 +++++++----- .../kfp_v1_workflow_support/README.md | 0 .../kfp_v1_workflow_support/pyproject.toml | 2 +- .../kfp_v2_workflow_support/README.md | 0 kfp/requirements.env | 2 +- 7 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/README.md create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/README.md diff --git a/.make.versions b/.make.versions index 070fc9519..0d6a5a531 100644 --- a/.make.versions +++ b/.make.versions @@ -5,7 +5,7 @@ ################################################################################ # Data prep lab wheel version -DPK_LIB_VERSION=0.2.0 +DPK_LIB_VERSION=0.1.1 DPK_LIB_KFP_VERSION=0.2.0 DPK_LIB_KFP_VERSION_v2=0.1.1-dev1 diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index c19799977..65bf3fda8 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -17,8 +17,12 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . -COPY --chown=ray:users kfp_support_lib/ kfp_support_lib/ -RUN cd kfp_support_lib && pip install --no-cache-dir -e . +COPY --chown=ray:users python_apiserver_client python_apiserver_client/ +RUN cd python_apiserver_client && pip install --no-cache-dir -e . + +COPY --chown=ray:users python_apiserver_client workflow_support_lib/ +RUN cd workflow_support_lib && pip install --no-cache-dir -e . 
+ # remove credentials-containing file RUN rm requirements.txt # components diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index e489da4ac..a3cc92cf0 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -11,15 +11,15 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed ' include makeenv ifeq ($(KFPv2), 1) -DOCKER_FILE=Dockerfile_v2 +DOCKER_FILE=Dockerfile DOCKER_IMAGE_NAME=kfp-data-processing_v2 DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION_v2} -KFP_SUPPORT_LIB=kfp_support_lib_v2 +WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support else DOCKER_FILE=Dockerfile DOCKER_IMAGE_NAME=kfp-data-processing DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION} -KFP_SUPPORT_LIB=kfp_support_lib +WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support endif @@ -31,11 +31,13 @@ DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) .lib-src-image:: $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python - # $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib LIB_NAME=kfp_support_lib + $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client LIB_NAME=python_apiserver_client + $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB) LIB_NAME=workflow_support_lib $(MAKE) .defaults.image rm -rf data-processing-lib-ray rm -rf data-processing-lib-python - # rm -rf kfp_support_lib + rm -rf python_api_server_client + rm -rf workflow_support_lib .PHONY: image image: Dockerfile Dockerfile_v2 requirements.txt diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/README.md b/kfp/kfp_support_lib/kfp_v1_workflow_support/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 06a4aab9b..451d4c2a9 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "kfp==1.8.22", "ray==2.9.3", "requests", - "data-prep-toolkit==0.2.0", + "data-prep-toolkit==0.1.1", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/README.md b/kfp/kfp_support_lib/kfp_v2_workflow_support/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/kfp/requirements.env b/kfp/requirements.env index 3b08a3109..57b4dc817 100644 --- a/kfp/requirements.env +++ b/kfp/requirements.env @@ -1,3 +1,3 @@ RAY=2.9.3 KFP_v2=2.7.0 -KFP_v1=1.8.5 \ No newline at end of file +KFP_v1=1.8.22 \ No newline at end of file From 9fc3070c6f9a905ce4ffc55f3bb8e6885a7d0069 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Sun, 2 Jun 2024 22:47:35 +0300 Subject: [PATCH 15/64] lib fixes Signed-off-by: Alexey Roytman --- .make.versions | 2 +- data-processing-lib/ray/pyproject.toml | 3 +-- kfp/kfp_ray_components/Dockerfile | 2 +- kfp/kfp_ray_components/Makefile | 2 +- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- .../executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 ++- .../src/utils/__init__.py | 4 --- .../src/workflow_support/utils/__init__.py | 4 +++ .../utils/components_utils.py | 0 .../{ => workflow_support}/utils/kfp_utils.py | 0 .../utils/pipeline_utils.py | 0 
.../utils/pipelines_tests_utils.py | 0 .../utils/remote_jobs_utils.py | 6 ++--- .../test/pipeline_utils_test.py | 2 +- kfp/requirements.env | 10 ++++++- kind/Makefile | 3 ++- transforms/.make.transforms_workflows | 26 ++++--------------- .../noop/kfp_ray/v1/noop_multiple_wf.py | 2 +- .../universal/noop/kfp_ray/v1/noop_wf.py | 10 +++---- 23 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py create mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/{ => workflow_support}/utils/components_utils.py (100%) rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/{ => workflow_support}/utils/kfp_utils.py (100%) rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/{ => workflow_support}/utils/pipeline_utils.py (100%) rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/{ => workflow_support}/utils/pipelines_tests_utils.py (100%) rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/{ => workflow_support}/utils/remote_jobs_utils.py (99%) diff --git a/.make.versions b/.make.versions index 0d6a5a531..ee20974e0 100644 --- a/.make.versions +++ b/.make.versions @@ -29,4 +29,4 @@ DOC_QUALITY_VERSION=0.4.0 INGEST_TO_PARQUET_VERSION=0.4.0 KFP_DOCKER_VERSION_v2=0.1.1 -KFP_DOCKER_VERSION=0.2.0 +KFP_DOCKER_VERSION=0.2.0-v2 diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index 88f193e7d..29565b73e 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_ray" -version = "0.2.0" +version = "0.1.1" requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Ray" license = {text = "Apache-2.0"} @@ -10,7 +10,6 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.0", "ray[default]==2.9.3", # These two are to fix security issues identified by quay.io "fastapi>=0.109.1", diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 65bf3fda8..ab0ef7588 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -20,7 +20,7 @@ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . COPY --chown=ray:users python_apiserver_client python_apiserver_client/ RUN cd python_apiserver_client && pip install --no-cache-dir -e . -COPY --chown=ray:users python_apiserver_client workflow_support_lib/ +COPY --chown=ray:users workflow_support_lib workflow_support_lib/ RUN cd workflow_support_lib && pip install --no-cache-dir -e . 
# remove credentials-containing file diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index a3cc92cf0..717ad7754 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -36,7 +36,7 @@ DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) $(MAKE) .defaults.image rm -rf data-processing-lib-ray rm -rf data-processing-lib-python - rm -rf python_api_server_client + rm -rf python_apiserver_client rm -rf workflow_support_lib .PHONY: image diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 26e466aa0..71df1893a 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index 00fc1493e..41d03fd5d 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -8,7 +8,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 59c8ac381..b6589dcfb 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index cbc0bf707..ca8f44a55 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. 
diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index 3db8962b4..d4b862747 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 451d4c2a9..49f005a03 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "ray==2.9.3", "requests", "data-prep-toolkit==0.1.1", + "python_apiserver_client", ] [build-system] @@ -37,7 +38,8 @@ dev = [ package_dir = ["src"] [options.packages.find] -where = ["src/kfp_support"] +where = ["src/workflow_support"] + [tool.pytest.ini_options] addopts = "--cov --cov-report term-missing --cov-fail-under 10" diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py deleted file mode 100644 index 8536bacd6..000000000 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from kfp_support.utils import KFPUtils -from kfp_support.utils.pipeline_utils import PipelinesUtils -from kfp_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC -from kfp_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py new file mode 100644 index 000000000..dc57ef4f4 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py @@ -0,0 +1,4 @@ +from workflow_support.utils.kfp_utils import KFPUtils +from workflow_support.utils.pipeline_utils import PipelinesUtils +from workflow_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC +from workflow_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/components_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/components_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/kfp_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/kfp_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/kfp_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipeline_utils.py similarity index 100% 
rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipeline_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipeline_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/pipelines_tests_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/remote_jobs_utils.py similarity index 99% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/remote_jobs_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/remote_jobs_utils.py index e3cef883d..fdbab4af3 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/remote_jobs_utils.py @@ -17,8 +17,8 @@ from data_processing.data_access import DataAccess, DataAccessFactory from data_processing.utils import ParamsUtils, get_logger -from kfp_support.api_server_client import KubeRayAPIs -from kfp.kfp_support_lib.python_apiserver_client.src.python_apiserver_client.params import ( +from python_apiserver_client import KubeRayAPIs +from python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, Cluster, @@ -30,7 +30,7 @@ environment_variables_decoder, volume_decoder, ) -from kfp_support.workflow_support.utils import KFPUtils +from workflow_support.utils import KFPUtils from ray.job_submission import JobStatus diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py index 449dbd79d..000e3e9dc 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py @@ -10,7 +10,7 @@ # limitations under the License. ################################################################################ -from kfp_support.utils import PipelinesUtils +from workflow_support.utils import PipelinesUtils def test_pipelines(): diff --git a/kfp/requirements.env b/kfp/requirements.env index 57b4dc817..c5f60ed03 100644 --- a/kfp/requirements.env +++ b/kfp/requirements.env @@ -1,3 +1,11 @@ RAY=2.9.3 KFP_v2=2.7.0 -KFP_v1=1.8.22 \ No newline at end of file +KFP_v1=1.8.22 + +ifeq ($(KFPv2), 1) +KFP=$(KFP_v2) +WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support +else +KFP=$(KFP_v1) +WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support +endif \ No newline at end of file diff --git a/kind/Makefile b/kind/Makefile index 2a8fea1da..2d3006b17 100644 --- a/kind/Makefile +++ b/kind/Makefile @@ -1,6 +1,6 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. -export ROOT_DIR=${CURDIR}/../ +export REPOROOT=${CURDIR}/../ ifneq ($(KFPv2), 1) @@ -12,6 +12,7 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kind/requirements.env | sed include makeenv +export ROOT_DIR=${CURDIR} # Include the common rules. # Use "make help" to see them. 
include ../.make.defaults diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index e5b97d2b5..9f5517015 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -55,29 +55,13 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen rm -rf ${REPOROOT}/transforms/venv $(MAKE) -C ${REPOROOT}/transforms .defaults.python-lib-src-venv . ${WORKFLOW_VENV_ACTIVATE}; \ - pip install -e $(REPOROOT)/kfp/kfp_support_lib/; + pip install -e $(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client; \ + pip install -e $(DPK_PYTHON_LIB_DIR); \ + pip install -e $(DPK_RAY_LIB_DIR); \ + pip install kfp==$(KFP) --extra-index-url https://pypi.org/simple; \ + pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); @# Help: Create the virtual environment common to all workflows - pip install -e $(DPK_RAY_LIB_DIR) -ifeq ($(KFPv2), 1) - . ${WORKFLOW_VENV_ACTIVATE} && pip install -e $(REPOROOT)/kfp/kfp_support_lib_v2/ -else - . ${WORKFLOW_VENV_ACTIVATE} && pip install -e $(REPOROOT)/kfp/kfp_support_lib/ -endif - pip install kfp==${KFP} --extra-index-url https://pypi.org/simple; \ - pip install -e $($DPK_RAY_LIB_DIR); \ - pip install -e $(REPOROOT)/kfp/kfp_support_lib/; - -#TODO KFPv2 -${VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt - @# Help: Create the virtual environment common to all workflows - rm -rf ${REPOROOT}/kfp/transform_workflows/venv - $(PYTHON) -m venv ${REPOROOT}/kfp/transform_workflows/venv - . ${VENV_ACTIVATE}; \ - pip install kfp==${KFP} --extra-index-url https://pypi.org/simple; \ - pip install kfp-kubernetes --extra-index-url https://pypi.org/simple; \ - pip install -e $(REPOROOT)/kfp/kfp_support_lib/ - .PHONY: .transforms_workflows.upload-pipeline .transforms_workflows.upload-pipeline: $(call set_env_var, CLUSTER_EXISTS, $(shell kind get clusters | grep ${KIND_CLUSTER_NAME})) diff --git a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py index dd8eaa513..aad211b06 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py @@ -26,7 +26,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" # path to kfp component specifications files component_spec_path = "../../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/v1/noop_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_wf.py index 872e98238..be629a3f6 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v1/noop_wf.py @@ -13,10 +13,10 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, +from workflow_support.utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, ) @@ -26,7 +26,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" # path to kfp component specifications files component_spec_path = "../../../../../kfp/kfp_ray_components/" From 
d67d8a977e9e55dafdc350f633caf1a0ef456eeb Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Sun, 2 Jun 2024 23:09:44 +0300 Subject: [PATCH 16/64] fix import --- .../src/workflow_support/utils/components_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py index 46e55024d..ab0c310e1 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py @@ -107,7 +107,7 @@ def default_compute_execution_params( import sys from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.utils import KFPUtils + from workflow_support.utils import KFPUtils logger = get_logger(__name__) From 0417995c031bd647327fbe62939d98f559bf398b Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Mon, 3 Jun 2024 07:57:24 +0300 Subject: [PATCH 17/64] fix python_appserver_client tests --- .../test/ray_remote_jobs_test.py | 8 ++++---- .../test/kuberay_api_test.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) rename kfp/kfp_support_lib/{python_apiserver_client => kfp_v1_workflow_support}/test/ray_remote_jobs_test.py (90%) diff --git a/kfp/kfp_support_lib/python_apiserver_client/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py similarity index 90% rename from kfp/kfp_support_lib/python_apiserver_client/test/ray_remote_jobs_test.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py index 2e8588d7e..54e7169c6 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py @@ -10,9 +10,9 @@ # limitations under the License. 
################################################################################ -from kfp.kfp_support_lib.python_apiserver_client.test.configmaps import ConfigmapsManager -from kfp.kfp_support_lib.python_apiserver_client.src.python_apiserver_client.params import ConfigMapVolume -from kfp_support.workflow_support.utils import RayRemoteJobs +from configmaps import ConfigmapsManager +from python_apiserver_client.params import ConfigMapVolume +from workflow_support.utils import RayRemoteJobs def test_ray_remote_jobs(): @@ -56,7 +56,7 @@ def test_ray_remote_jobs(): cm_manager.create_code_map() # create cluster - remote_jobs = RayRemoteJobs(server_url="http://localhost:8080/ray") + remote_jobs = RayRemoteJobs(server_url="http://alexy100.sl.cloud9.ibm.com:8080/ray") status, error = remote_jobs.create_ray_cluster( name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] ) diff --git a/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py b/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py index ad0c2b766..d4dd12a5e 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py +++ b/kfp/kfp_support_lib/python_apiserver_client/test/kuberay_api_test.py @@ -12,9 +12,9 @@ import time -from python_apiserver_client.test.configmaps import ConfigmapsManager -from python_apiserver_client.src.api_server_client import KubeRayAPIs -from python_apiserver_client.src.api_server_client.params import ( +from configmaps import ConfigmapsManager +from python_apiserver_client import KubeRayAPIs +from python_apiserver_client.params import ( DEFAULT_WORKER_START_PARAMS, Cluster, ClusterSpec, @@ -30,13 +30,15 @@ WorkerNodeSpec, ) +server_url = "http://localhost:8080/ray" + def test_templates(): """ Test template """ # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + apis = KubeRayAPIs(server_url=server_url) # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") # create @@ -81,7 +83,7 @@ def test_cluster(): Test cluster """ # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + apis = KubeRayAPIs(server_url=server_url) # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") _, _ = apis.delete_cluster(ns="default", name="test") @@ -181,7 +183,7 @@ def test_job_submission(): :return: """ # create API server - apis = KubeRayAPIs(server_url="http://localhost:8080/ray") + apis = KubeRayAPIs(server_url=server_url) # cleanup _, _ = apis.delete_compute_template(ns="default", name="default-template") _, _ = apis.delete_cluster(ns="default", name="test-job") From d280247a9bfa1347f1942752cad0a6bbe1bd086c Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Mon, 3 Jun 2024 09:20:36 +0300 Subject: [PATCH 18/64] some fixes --- .make.versions | 2 +- kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile | 3 ++- .../kfp_v1_workflow_support/pyproject.toml | 2 +- .../kfp_v1_workflow_support/test/pipeline_utils_test.py | 3 ++- .../kfp_v1_workflow_support/test/ray_remote_jobs_test.py | 3 ++- kfp/kfp_support_lib/python_apiserver_client/Makefile | 8 +++++++- .../python_apiserver_client/pyproject.toml | 2 +- transforms/.make.transforms_workflows | 6 +++--- 8 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.make.versions b/.make.versions index ee20974e0..8900a58d4 100644 --- a/.make.versions +++ b/.make.versions @@ -5,7 +5,7 @@ ################################################################################ # Data prep lab wheel 
version -DPK_LIB_VERSION=0.1.1 +DPK_LIB_VERSION=0.2.0 DPK_LIB_KFP_VERSION=0.2.0 DPK_LIB_KFP_VERSION_v2=0.1.1-dev1 diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 5516a1df4..581c2e305 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -49,6 +49,8 @@ venv:: pyproject.toml .check-env rm -rf venv $(PYTHON) -m venv venv . ${VENV_ACTIVATE}; \ + pip install -e ../python_apiserver_client; \ + pip install -e ../../../data-processing-lib/python; \ pip install -e .; \ pip install ray==${RAY} \ pip install pytest pytest-cov @@ -56,7 +58,6 @@ venv:: pyproject.toml .check-env test:: venv @# Help: Use the already-built virtual environment to run pytest on the test directory. ifeq ($(DEPLOY_KUBEFLOW),1) - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; endif diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 49f005a03..cc3d6e625 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "kfp==1.8.22", "ray==2.9.3", "requests", - "data-prep-toolkit==0.1.1", + "data-prep-toolkit==0.2.0", "python_apiserver_client", ] diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py index 000e3e9dc..77cca5635 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py @@ -12,12 +12,13 @@ from workflow_support.utils import PipelinesUtils +server_url = "http://localhost:8080/" def test_pipelines(): """ Test pipelines utils """ - utils = PipelinesUtils(host="http://localhost:8080") + utils = PipelinesUtils(host=server_url) # get pipeline by name pipeline = utils.get_pipeline_by_name("[Tutorial] Data passing in python components") assert pipeline is not None diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py index 54e7169c6..7b9ad2c13 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py @@ -14,6 +14,7 @@ from python_apiserver_client.params import ConfigMapVolume from workflow_support.utils import RayRemoteJobs +server_url = "http:localhost:8080/ray/" def test_ray_remote_jobs(): """ @@ -56,7 +57,7 @@ def test_ray_remote_jobs(): cm_manager.create_code_map() # create cluster - remote_jobs = RayRemoteJobs(server_url="http://alexy100.sl.cloud9.ibm.com:8080/ray") + remote_jobs = RayRemoteJobs(server_url=server_url) status, error = remote_jobs.create_ray_cluster( name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] ) diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile index 70c9365f6..75807e5b6 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/Makefile +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -45,9 +45,15 @@ 
venv::pyproject.toml .check-env . ${VENV_ACTIVATE}; \ pip install --upgrade pip; pip install ray==${RAY}; \ += pip install -e ../../../data-processing-lib/python; \ pip install -e .; \ pip install pytest pytest-cov test:: venv @# Help: Use the already-built virtual environment to run pytest on the test directory. - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; pip list | grep python_apiserver_client ; cd test; $(PYTEST) api_params_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) configmaps.py; +ifeq ($(DEPLOY_KUBEFLOW),1) + . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; +endif + diff --git a/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml index ea992c823..1ea8f9238 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml +++ b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml @@ -9,7 +9,7 @@ version = "0.0.1" dependencies = [ "requests", "kubernetes", - "data-prep-toolkit==0.1.1", + "data-prep-toolkit==0.2.0", ] authors = [ { name="KubeRay project"}, diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index 9f5517015..30195050b 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -56,9 +56,9 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen $(MAKE) -C ${REPOROOT}/transforms .defaults.python-lib-src-venv . ${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client; \ - pip install -e $(DPK_PYTHON_LIB_DIR); \ - pip install -e $(DPK_RAY_LIB_DIR); \ - pip install kfp==$(KFP) --extra-index-url https://pypi.org/simple; \ + # pip install -e $(DPK_PYTHON_LIB_DIR); \ + # pip install -e $(DPK_RAY_LIB_DIR); \ + # pip install kfp==$(KFP) --extra-index-url https://pypi.org/simple; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); @# Help: Create the virtual environment common to all workflows From 2ae0984a077844d1d42e067e99ada8b05f54c37e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 3 Jun 2024 02:22:26 -0500 Subject: [PATCH 19/64] Fixes after testing. 
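The runtime scripts touched here keep selecting their imports with kfp_v2 = os.getenv("KFP_v2", 0) followed by if kfp_v2 == 1. Because environment variables always arrive as strings, a string-aware comparison is one way to make the v2 branch reachable. The sketch below is illustrative only; the module paths are the ones introduced in this series, and the "1"/"0" values for KFP_v2 are an assumption about how the v2 image would set the variable.

    import os

    # Environment variables are strings, so compare against "1" rather than the int 1.
    if os.getenv("KFP_v2", "0") == "1":
        # package layout used by the KFP v2 support library
        from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs
        print("Loaded KFP v2 workflow_support")
    else:
        # package layout used by the KFP v1 support library
        from workflow_support.utils import KFPUtils, execute_ray_jobs
        print("Loaded KFP v1 workflow_support")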
Signed-off-by: Revital Sur --- kfp/kfp_ray_components/src/create_ray_cluster.py | 2 +- kfp/kfp_ray_components/src/delete_ray_cluster.py | 2 +- kfp/kfp_ray_components/src/execute_ray_job.py | 4 ++-- kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py | 4 ++-- kfp/kfp_ray_components/src/subworkflow.py | 4 ++-- transforms/.make.transforms_workflows | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index c9ba1c16e..42cace863 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -17,7 +17,7 @@ from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs print(f"Load KFPv2 libs") else: - from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + from workflow_support.utils import KFPUtils, RayRemoteJobs print(f"Load KFPv1 libs") def start_ray_cluster( diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index 724945fa3..886799453 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -18,7 +18,7 @@ from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs print(f"Load KFPv2 libs") else: - from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + from workflow_support.utils import KFPUtils, RayRemoteJobs print(f"Load KFPv1 libs") # Cleans and shutdowns the Ray cluster diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index 62c252400..4a80f3ae0 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -14,10 +14,10 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + from workflow_support.utils import KFPUtils, execute_ray_jobs print(f"Load KFPv2 libs") else: - from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + from workflow_support.utils import KFPUtils, execute_ray_jobs print(f"Load KFPv1 libs") diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index dac66f778..123c5a8e7 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -14,10 +14,10 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + from workflow_support.utils import KFPUtils, execute_ray_jobs print(f"Load KFPv2 libs") else: - from kfp_v1_workflow_support.utils import KFPUtils, execute_ray_jobs + from workflow_support.utils import KFPUtils, execute_ray_jobs print(f"Load KFPv1 libs") diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index 2e9616562..4771390c2 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -3,10 +3,10 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from kfp_v1_workflow_support.utils import KFPUtils, PipelinesUtils + from workflow_support.utils import KFPUtils, PipelinesUtils print(f"Load KFPv2 libs") else: - from kfp_v1_workflow_support.utils import KFPUtils, PipelinesUtils + from workflow_support.utils import KFPUtils, PipelinesUtils print(f"Load KFPv1 libs") from data_processing.utils import ParamsUtils diff --git 
a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index 30195050b..e39176d7a 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -49,7 +49,7 @@ ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) load-image endif - . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m kfp_support.workflow_support.utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${REPOROOT}/kfp/kfp_support_lib/ rm -rf ${REPOROOT}/transforms/venv @@ -68,5 +68,5 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen @if [ -z ${CLUSTER_EXISTS} ]; then \ cd ${REPOROOT} && make setup; \ fi - . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m kfp_support.workflow_support.utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} From 96f8dd45d89bc9167d3542e159ace65a2a7271af Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Mon, 3 Jun 2024 11:08:49 +0300 Subject: [PATCH 20/64] update noop kfpv2 --- .make.versions | 4 +- .../src/create_ray_cluster.py | 2 +- .../src/delete_ray_cluster.py | 2 +- kfp/kfp_ray_components/src/execute_ray_job.py | 2 +- .../src/execute_ray_job_multi_s3.py | 2 +- kfp/kfp_ray_components/src/subworkflow.py | 2 +- .../utils/pipelines_tests_utils.py | 2 +- .../kfp_v2_workflow_support/Makefile | 2 + .../kfp_v2_workflow_support/pyproject.toml | 8 +- .../compile_utils/__init__.py | 0 .../compile_utils/component.py | 0 .../pipeline_utils/__init__.py | 1 + .../pipeline_utils/pipeline_utils.py | 173 ++++++++++++++++++ .../pipeline_utils/pipelines_tests_utils.py | 75 ++++++++ .../runtime_utils/__init__.py | 0 .../runtime_utils/kfp_utils.py | 0 .../runtime_utils/remote_jobs_utils.py | 0 .../test/pipeline_utils_test.py | 34 ++++ .../test/ray_remote_jobs_test.py | 91 +++++++++ .../universal/noop/kfp_ray/v2/noop_wf.py | 2 +- 20 files changed, 390 insertions(+), 12 deletions(-) rename kfp/kfp_support_lib/kfp_v2_workflow_support/src/{ => workflow_support}/compile_utils/__init__.py (100%) rename kfp/kfp_support_lib/kfp_v2_workflow_support/src/{ => workflow_support}/compile_utils/component.py (100%) create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py rename kfp/kfp_support_lib/kfp_v2_workflow_support/src/{ => workflow_support}/runtime_utils/__init__.py (100%) rename kfp/kfp_support_lib/kfp_v2_workflow_support/src/{ => workflow_support}/runtime_utils/kfp_utils.py (100%) rename kfp/kfp_support_lib/kfp_v2_workflow_support/src/{ => workflow_support}/runtime_utils/remote_jobs_utils.py (100%) create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py create mode 100644 kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py diff --git 
a/.make.versions b/.make.versions index 8900a58d4..73bb98ce0 100644 --- a/.make.versions +++ b/.make.versions @@ -7,7 +7,7 @@ # Data prep lab wheel version DPK_LIB_VERSION=0.2.0 DPK_LIB_KFP_VERSION=0.2.0 -DPK_LIB_KFP_VERSION_v2=0.1.1-dev1 +DPK_LIB_KFP_VERSION_v2=0.2.0 # Begin transform versions/tags BLOCKLIST_VERSION=0.4.0 @@ -28,5 +28,5 @@ CODE_QUALITY_VERSION=0.4.0 DOC_QUALITY_VERSION=0.4.0 INGEST_TO_PARQUET_VERSION=0.4.0 -KFP_DOCKER_VERSION_v2=0.1.1 +KFP_DOCKER_VERSION_v2=0.2.0-v2 KFP_DOCKER_VERSION=0.2.0-v2 diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index 42cace863..ee8312b1b 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -14,7 +14,7 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs print(f"Load KFPv2 libs") else: from workflow_support.utils import KFPUtils, RayRemoteJobs diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index 886799453..ccbb31b93 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -15,7 +15,7 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from kfp_v1_workflow_support.utils import KFPUtils, RayRemoteJobs + from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs print(f"Load KFPv2 libs") else: from workflow_support.utils import KFPUtils, RayRemoteJobs diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index 4a80f3ae0..037a3baaa 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -14,7 +14,7 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from workflow_support.utils import KFPUtils, execute_ray_jobs + from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs print(f"Load KFPv2 libs") else: from workflow_support.utils import KFPUtils, execute_ray_jobs diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 123c5a8e7..7a9246cdf 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -14,7 +14,7 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from workflow_support.utils import KFPUtils, execute_ray_jobs + from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs print(f"Load KFPv2 libs") else: from workflow_support.utils import KFPUtils, execute_ray_jobs diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index 4771390c2..a57e1406d 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -3,7 +3,7 @@ kfp_v2 = os.getenv("KFP_v2", 0) if kfp_v2 == 1: - from workflow_support.utils import KFPUtils, PipelinesUtils + from workflow_support.runtime_utils import KFPUtils, PipelinesUtils print(f"Load KFPv2 libs") else: from workflow_support.utils import KFPUtils, PipelinesUtils diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py index 1e7ff9cf7..5fd43ca6b 100644 --- 
a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py @@ -3,7 +3,7 @@ from data_processing.utils import get_logger, str2bool -from . import PipelinesUtils +from workflow_support.utils import PipelinesUtils logger = get_logger(__name__) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile index 135e29514..6d6540d84 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -49,6 +49,8 @@ venv:: pyproject.toml .check-env rm -rf venv $(PYTHON) -m venv venv . ${VENV_ACTIVATE}; \ + pip install -e ../python_apiserver_client; \ + pip install -e ../../../data-processing-lib/python; \ pip install -e .; \ pip install ray==${RAY} \ pip install pytest pytest-cov diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 4238e0417..bedc6f334 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -12,9 +12,11 @@ authors = [ { name = "Revital Eres", email = "eres@il.ibm.com" }, ] dependencies = [ - "kfp==2.2.0", + "kfp==2.7.0", + "ray==2.9.3", "requests", - "data-prep-toolkit==0.1.1", + "data-prep-toolkit==0.2.0", + "python_apiserver_client", ] [build-system] @@ -36,7 +38,7 @@ dev = [ package_dir = ["src"] [options.packages.find] -where = ["src/kfp_support"] +where = ["src/workflow_support"] [tool.pytest.ini_options] addopts = "--cov --cov-report term-missing --cov-fail-under 10" diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/__init__.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v2_workflow_support/src/compile_utils/component.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py new file mode 100644 index 000000000..0e80d97a2 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/__init__.py @@ -0,0 +1 @@ +from workflow_support.pipeline_utils.pipeline_utils import PipelinesUtils diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py new file mode 100644 index 000000000..7566f6b2e --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py @@ -0,0 +1,173 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import datetime +import time +from typing import Any, Optional + +from data_processing.utils import get_logger +from kfp_server_api import models + +from kfp import Client + + +logger = get_logger(__name__) + + +class PipelinesUtils: + """ + Helper class for pipeline management + """ + + def __init__(self, host: str = "http://localhost:8080"): + """ + Initialization + :param host: host to connect to + """ + self.kfp_client = Client(host=host) + + def upload_pipeline( + self, + pipeline_package_path: str = None, + pipeline_name: str = None, + overwrite: bool = False, + description: str = None, + ) -> models.api_pipeline.ApiPipeline: + """ + Uploads the pipeline + :param pipeline_package_path: Local path to the pipeline package. + :param pipeline_name: Optional. Name of the pipeline to be shown in the UI + :param overwrite: Optional. If pipeline exists, delete it before creating a new one. + :param description: Optional. Description of the pipeline to be shown in the UI. + :return: Server response object containing pipeline id and other information. + """ + if overwrite: + pipeline = self.get_pipeline_by_name(name=pipeline_name) + if pipeline is not None: + try: + logger.info(f"pipeline {pipeline_name} already exists. Trying to delete it.") + self.kfp_client.delete_pipeline(pipeline_id=pipeline.id) + except Exception as e: + logger.warning(f"Exception deleting pipeline {e} before uploading") + return None + try: + pipeline = self.kfp_client.upload_pipeline( + pipeline_package_path=pipeline_package_path, pipeline_name=pipeline_name, description=description + ) + except Exception as e: + logger.warning(f"Exception uploading pipeline {e}") + return None + if pipeline is None: + logger.warning(f"Failed to upload pipeline {pipeline_name}.") + return None + logger.info("Pipeline uploaded") + return pipeline + + def delete_pipeline(self, pipeline_id): + """ + Delete pipeline. + :param pipeline_id: id of the pipeline. + :return + Returns: + Object. If the method is called asynchronously, returns the request thread. + Raises: + kfp_server_api.ApiException: If pipeline is not found. + """ + return self.kfp_client.delete_pipeline(pipeline_id) + + def start_pipeline( + self, + pipeline: models.api_pipeline.ApiPipeline, + experiment: models.api_experiment.ApiExperiment, + params: Optional[dict[str, Any]], + ) -> str: + """ + Start a specified pipeline. 
+ :param pipeline: pipeline definition + :param experiment: experiment to use + :param params: pipeline parameters + :return: the id of the run object + """ + job_name = pipeline.name + " " + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + try: + run_id = self.kfp_client.run_pipeline( + experiment_id=experiment.id, job_name=job_name, pipeline_id=pipeline.id, params=params + ) + logger.info(f"Pipeline run {job_name} submitted") + return run_id.id + except Exception as e: + logger.warning(f"Exception starting pipeline {e}") + return None + + def get_experiment_by_name(self, name: str = "Default") -> models.api_experiment.ApiExperiment: + """ + Get experiment by name + :param name: name + :return: experiment + """ + try: + return self.kfp_client.get_experiment(experiment_name=name) + except Exception as e: + logger.warning(f"Exception getting experiment {e}") + return None + + def get_pipeline_by_name(self, name: str, np: int = 100) -> models.api_pipeline.ApiPipeline: + """ + Given pipeline name, return the pipeline + :param name: pipeline name + :param np: page size for pipeline query. For large clusters with many pipelines, you might need to + increase this number + :return: pipeline + """ + try: + # Get all pipelines + pipelines = self.kfp_client.list_pipelines(page_size=np).pipelines + required = list(filter(lambda p: name in p.name, pipelines)) + if len(required) != 1: + logger.warning(f"Failure to get pipeline. Number of pipelines with name {name} is {len(required)}") + return None + return required[0] + + except Exception as e: + logger.warning(f"Exception getting pipeline {e}") + return None + + def wait_pipeline_completion(self, run_id: str, timeout: int = -1, wait: int = 600) -> tuple[str, str]: + """ + Waits for a pipeline run to complete + :param run_id: run id + :param timeout: timeout (sec) (-1 wait forever) + :param wait: internal wait (sec) + :return: Completion status and an error message if such exists + """ + try: + if timeout > 0: + end = time.time() + timeout + else: + end = 2**63 - 1 + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + while status is None or status.lower() not in ["succeeded", "completed", "failed", "skipped", "error"]: + time.sleep(wait) + if (end - time.time()) < 0: + return "failed", f"Execution is taking too long" + run_details = self.kfp_client.get_run(run_id=run_id) + status = run_details.run.status + logger.info(f"Got pipeline execution status {status}") + + if status.lower() in ["succeeded", "completed"]: + return status, "" + return status, run_details.run.error + + except Exception as e: + logger.warning(f"Failed waiting pipeline completion {e}") + return "failed", str(e) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py new file mode 100644 index 000000000..1e7ff9cf7 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -0,0 +1,75 @@ +import os +import sys + +from data_processing.utils import get_logger, str2bool + +from . import PipelinesUtils + + +logger = get_logger(__name__) + + +def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/", overwrite: bool = True): + """ + Upload and run a single pipeline + + :param pipeline_package_path: Local path to the pipeline package. + :param endpoint: endpoint to kfp service. 
+ :return the pipeline name as it appears in the kfp GUI. + """ + tmout: int = 800 + wait: int = 60 + file_name = os.path.basename(pipeline_package_path) + pipeline_name = os.path.splitext(file_name)[0] + utils = PipelinesUtils(host=endpoint) + pipeline = utils.upload_pipeline( + pipeline_package_path=pipeline_package_path, + pipeline_name=pipeline_name, + overwrite=overwrite, + ) + if pipeline is None: + return None + experiment = utils.get_experiment_by_name() + run_id = utils.start_pipeline(pipeline, experiment, params=[]) + status, error = utils.wait_pipeline_completion(run_id=run_id, timeout=tmout, wait=wait) + if status.lower() not in ["succeeded", "completed"]: + # Execution failed + logger.warning(f"Pipeline {pipeline_name} failed with error {error} and status {status}") + return None + logger.info(f"Pipeline {pipeline_name} successfully completed") + return pipeline_name + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Run sanity test") + parser.add_argument("-c", "--command", type=str, choices=["upload", "sanity-test"]) + parser.add_argument("-e", "--endpoint", type=str, default="http://localhost:8080/") + parser.add_argument("-p", "--pipeline_package_path", type=str, default="") + parser.add_argument("-o", "--overwrite", type=str, default="True") + + args = parser.parse_args() + match args.command: + case "upload": + file_name = os.path.basename(args.pipeline_package_path) + pipeline_name = os.path.splitext(file_name)[0] + utils = PipelinesUtils(host=args.endpoint) + pipeline = utils.upload_pipeline( + pipeline_package_path=args.pipeline_package_path, + pipeline_name=pipeline_name, + overwrite=str2bool(args.overwrite), + ) + if pipeline is None: + sys.exit(1) + case "sanity-test": + run = run_test( + endpoint=args.endpoint, + pipeline_package_path=args.pipeline_package_path, + overwrite=str2bool(args.overwrite), + ) + if run is None: + sys.exit(1) + case _: + logger.warning("Unsupported command") + exit(1) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/__init__.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/kfp_utils.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v2_workflow_support/src/runtime_utils/remote_jobs_utils.py rename to kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py new file mode 100644 index 000000000..77cca5635 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py @@ -0,0 +1,34 @@ +# (C) Copyright IBM Corp. 
2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from workflow_support.utils import PipelinesUtils + +server_url = "http://localhost:8080/" + +def test_pipelines(): + """ + Test pipelines utils + """ + utils = PipelinesUtils(host=server_url) + # get pipeline by name + pipeline = utils.get_pipeline_by_name("[Tutorial] Data passing in python components") + assert pipeline is not None + # get default experiment + experiment = utils.get_experiment_by_name() + assert experiment is not None + # start pipeline + run = utils.start_pipeline(pipeline=pipeline, experiment=experiment, params={}) + assert run is not None + # wait for completion + status, error = utils.wait_pipeline_completion(run_id=run, wait=10) + assert status.lower() == "succeeded" + assert error == "" diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py new file mode 100644 index 000000000..7b9ad2c13 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py @@ -0,0 +1,91 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from configmaps import ConfigmapsManager +from python_apiserver_client.params import ConfigMapVolume +from workflow_support.utils import RayRemoteJobs + +server_url = "http:localhost:8080/ray/" + +def test_ray_remote_jobs(): + """ + Test the full cycle of job submission + :return: + """ + # This shows how to create volumes dictionary + volumes = [ + ConfigMapVolume( + name="code-sample", + mount_path="/home/ray/samples", + source="ray-job-code-sample", + items={"sample_code.py": "sample_code.py"}, + ) + ] + dct_volumes = {"volumes": [v.to_dict() for v in volumes]} + + head_node = { + "cpu": 2, + "memory": 4, + "image": "rayproject/ray:2.9.3-py310", + # Ray start params, just to show + "ray_start_params": {"metrics-export-port": "8080", "num-cpus": "0", "dashboard-host": "0.0.0.0"}, + "image_pull_policy": "Always", + } | dct_volumes + + worker_node = { + "cpu": 2, + "memory": 4, + "image": "rayproject/ray:2.9.3-py310", + "replicas": 1, + "min_replicas": 1, + "max_replicas": 1, + "image_pull_policy": "Always", + } | dct_volumes + + # Create configmap for testing + cm_manager = ConfigmapsManager() + cm_manager.delete_code_map() + cm_manager.create_code_map() + + # create cluster + remote_jobs = RayRemoteJobs(server_url=server_url) + status, error = remote_jobs.create_ray_cluster( + name="job-test", namespace="default", head_node=head_node, worker_nodes=[worker_node] + ) + print(f"Created cluster - status: {status}, error: {error}") + assert status == 200 + assert error is None + # submitting ray job + runtime_env = """ + pip: + - requests==2.26.0 + - pendulum==2.1.2 + env_vars: + counter_name: test_counter + """ + status, error, submission = remote_jobs.submit_job( + name="job-test", + namespace="default", + request={}, + runtime_env=runtime_env, + executor="/home/ray/samples/sample_code.py", + ) + print(f"submit job - status: {status}, error: {error}, submission id {submission}") + assert status == 200 + assert error is None + # print execution log + remote_jobs.follow_execution(name="job-test", namespace="default", submission_id=submission, print_timeout=20) + # cleanup + status, error = remote_jobs.delete_ray_cluster(name="job-test", namespace="default") + print(f"Deleted cluster - status: {status}, error: {error}") + assert status == 200 + assert error is None diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index 02a4bea6a..613b362fd 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -13,7 +13,7 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( +from workflow_support.compile_utils import ( ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, From a0028b2aa110c6672ce5f728a1f7764c9055c1e7 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 3 Jun 2024 15:06:46 +0300 Subject: [PATCH 21/64] Fixes after testing. 
Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 3 +- kfp/kfp_ray_components/Makefile | 13 ++++- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- .../executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- .../src/create_ray_cluster.py | 3 +- .../src/delete_ray_cluster.py | 2 +- kfp/kfp_ray_components/src/execute_ray_job.py | 2 +- .../src/execute_ray_job_multi_s3.py | 2 +- .../kfp_v2_workflow_support/pyproject.toml | 1 + .../compile_utils/__init__.py | 5 +- .../compile_utils/component.py | 6 ++- .../runtime_utils/__init__.py | 4 +- .../runtime_utils/remote_jobs_utils.py | 6 +-- kfp/requirements.env | 10 ++-- transforms/universal/noop/Makefile | 18 ++++++- .../universal/noop/kfp_ray/v2/noop_wf.py | 47 ++++++++++--------- 19 files changed, 85 insertions(+), 47 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index ab0ef7588..81b391eec 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -2,6 +2,7 @@ FROM docker.io/rayproject/ray:2.9.3-py310 ARG BUILD_DATE ARG GIT_COMMIT +ARG KFP_v2 LABEL build-date=$BUILD_DATE LABEL git-commit=$GIT_COMMIT @@ -22,7 +23,7 @@ RUN cd python_apiserver_client && pip install --no-cache-dir -e . COPY --chown=ray:users workflow_support_lib workflow_support_lib/ RUN cd workflow_support_lib && pip install --no-cache-dir -e . - +ENV KFP_v2=$KFP_v2 # remove credentials-containing file RUN rm requirements.txt # components diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 717ad7754..a1e4fe730 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -26,6 +26,17 @@ endif #DOCKER_IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_VERSION} DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) +.PHONY: .kfp_image +.kfp_image:: # Must be called with a DOCKER_IMAGE= settings. + @# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE) + $(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) \ + -f $(DOCKER_FILE) \ + --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ + --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ + --build-arg KFP_v2=$(KFPv2) \ + --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . 
+ $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) .PHONY: .lib-src-image .lib-src-image:: @@ -33,7 +44,7 @@ DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client LIB_NAME=python_apiserver_client $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB) LIB_NAME=workflow_support_lib - $(MAKE) .defaults.image + $(MAKE) .kfp_image rm -rf data-processing-lib-ray rm -rf data-processing-lib-python rm -rf python_apiserver_client diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 71df1893a..f86af3991 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index 41d03fd5d..d62312d0c 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -8,7 +8,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index b6589dcfb..d339bd05b 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index ca8f44a55..0c6c549fa 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. 
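The KFP_v2 flag travels from the Makefile as a docker build argument into the image as an ENV variable, so by the time the component scripts read it the value is a string, not an integer. A minimal sketch of the selection logic that the src/*.py changes below converge on (default value and print text are illustrative only):

import os

# The build arg becomes ENV KFP_v2 inside the image; os.getenv therefore yields a string.
kfp_v2 = os.getenv("KFP_v2", "0")
if kfp_v2 == "1":
    print("Load KFPv2 libs")   # the v2 image imports from workflow_support.runtime_utils
else:
    print("Load KFPv1 libs")   # the v1 image keeps importing from workflow_support.utils
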
diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index d4b862747..4187d0893 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index ee8312b1b..900e482ca 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -13,7 +13,8 @@ import sys kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == 1: +print(kfp_v2) +if kfp_v2 == "1": from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs print(f"Load KFPv2 libs") else: diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index ccbb31b93..02eeeb650 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -14,7 +14,7 @@ import sys kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == 1: +if kfp_v2 == "1": from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs print(f"Load KFPv2 libs") else: diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index 037a3baaa..efbb8e723 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -13,7 +13,7 @@ import os kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == 1: +if kfp_v2 == "1": from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs print(f"Load KFPv2 libs") else: diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 7a9246cdf..7493c247f 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -13,7 +13,7 @@ import os kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == 1: +if kfp_v2 == "1": from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs print(f"Load KFPv2 libs") else: diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index bedc6f334..05e39be76 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -13,6 +13,7 @@ authors = [ ] dependencies = [ "kfp==2.7.0", + "kfp-kubernetes==1.2.0", "ray==2.9.3", "requests", "data-prep-toolkit==0.2.0", diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py index bbe1476fb..6b99a6be1 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/__init__.py @@ -1,3 +1,6 @@ -from kfp_support.workflow_support.compile_utils.component import ( +from workflow_support.compile_utils.component import ( + 
ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, ComponentUtils ) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py index 1f66bf59f..93a604d22 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -4,6 +4,10 @@ RUN_NAME = "KFP_RUN_NAME" +ONE_HOUR_SEC = 60 * 60 +ONE_DAY_SEC = ONE_HOUR_SEC * 24 +ONE_WEEK_SEC = ONE_DAY_SEC * 7 + class ComponentUtils: """ Class containing methods supporting building pipelines @@ -67,7 +71,7 @@ def default_compute_execution_params( import sys from data_processing.utils import GB, get_logger - from kfp_support.workflow_support.runtime_utils import KFPUtils + from workflow_support.runtime_utils import KFPUtils logger = get_logger(__name__) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py index d2301bd0a..8d2cdd648 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/__init__.py @@ -1,2 +1,2 @@ -from kfp_support.workflow_support.runtime_utils.kfp_utils import KFPUtils -from kfp_support.workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs +from workflow_support.runtime_utils.kfp_utils import KFPUtils +from workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py index c7e7cbe45..0b20b28c4 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py @@ -17,8 +17,8 @@ from data_processing.data_access import DataAccess, DataAccessFactory from data_processing.utils import ParamsUtils, get_logger -from kfp_support.api_server_client import KubeRayAPIs -from kfp.kfp_support_lib.python_apiserver_client.src.python_apiserver_client.params import ( +from python_apiserver_client import KubeRayAPIs +from python_apiserver_client.params import ( DEFAULT_HEAD_START_PARAMS, DEFAULT_WORKER_START_PARAMS, Cluster, @@ -30,7 +30,7 @@ environment_variables_decoder, volume_decoder, ) -from kfp_support.workflow_support.runtime_utils import KFPUtils +from workflow_support.runtime_utils import KFPUtils from ray.job_submission import JobStatus diff --git a/kfp/requirements.env b/kfp/requirements.env index c5f60ed03..6fa707df5 100644 --- a/kfp/requirements.env +++ b/kfp/requirements.env @@ -3,9 +3,9 @@ KFP_v2=2.7.0 KFP_v1=1.8.22 ifeq ($(KFPv2), 1) -KFP=$(KFP_v2) -WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support + KFP=$(KFP_v2) + WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support else -KFP=$(KFP_v1) -WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support -endif \ No newline at end of file + KFP=$(KFP_v1) + WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support +endif diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 02fd06dc2..6ca460863 100644 --- a/transforms/universal/noop/Makefile 
+++ b/transforms/universal/noop/Makefile @@ -47,15 +47,29 @@ workflow-venv: .PHONY: workflow-build workflow-build: +ifeq ($(KFPv2), 0) $(MAKE) -C kfp_ray/v1 workflow-build +else + $(MAKE) -C kfp_ray/v2 workflow-build +endif + .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test +ifeq ($(KFPv2), 0) + $(MAKE) -C kfp_ray/v2 workflow-test +else + $(MAKE) -C kfp_ray/v2 workflow-test +endif .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload +ifeq ($(KFPv2), 0) + $(MAKE) -C kfp_ray/v1 workflow-upload +else + $(MAKE) -C kfp_ray/v2 workflow-upload +endif + .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index 613b362fd..a77e3a2b4 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -23,13 +23,13 @@ # FIXME: create a component to get run id RUN_ID = uuid.uuid4().hex -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1-kfp-v21" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2" # path to kfp component specifications files component_spec_path = "../../../../../kfp/kfp_ray_components/" @@ -37,9 +37,12 @@ # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. @dsl.component(base_image=base_kfp_image) -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params -) +def compute_exec_params(worker_options: str, actor_options: str) -> str: + from workflow_support.compile_utils import ComponentUtils + + return ComponentUtils.default_compute_execution_params(worker_options, actor_options) + + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -111,49 +114,49 @@ def noop( """ # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60, image_pull_policy="Always") + ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( + # compute execution params + compute_exec_params_task = compute_exec_params( worker_options=ray_worker_options, actor_options=runtime_actor_options, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2,image_pull_policy="Always") + ComponentUtils.add_settings_to_component(compute_exec_params_task, ONE_HOUR_SEC * 2) # start Ray cluster - ray_cluster = create_ray_op( + ray_cluster = create_ray_op( ray_name=ray_name, run_id=RUN_ID, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2, image_pull_policy="Always") + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) #ray_cluster.after(compute_exec_params) # Execute job - execute_job = execute_ray_jobs_op( + execute_job = 
execute_ray_jobs_op( ray_name=ray_name, run_id=RUN_ID, additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params={ "data_s3_config": "{'input_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/input/', 'output_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/output/'}", - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, + "data_max_files": -1, + "data_num_samples": -1, "runtime_num_workers": "1", "runtime_worker_options": "{'num_cpus': 0.8}", - "runtime_pipeline_id": runtime_actor_options, + "runtime_pipeline_id": "{'num_cpus': 0.8}", "runtime_job_id": RUN_ID, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, + "runtime_code_location": "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + "noop_sleep_sec": 10, }, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC,image_pull_policy="Always") - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) # Configure the pipeline level to one week (in seconds) # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) From 47c4c687786e671879c7390948a2062d040275cf Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 3 Jun 2024 17:30:52 +0300 Subject: [PATCH 22/64] Fix compute_exec_params Signed-off-by: Revital Sur --- .../compile_utils/component.py | 47 ------------------- .../runtime_utils/kfp_utils.py | 47 +++++++++++++++++++ .../universal/noop/kfp_ray/v2/noop_wf.py | 4 +- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py index 93a604d22..4fa47290f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -56,50 +56,3 @@ def set_s3_env_vars_to_component( for env_name, _ in env2key.items(): env2key[prefix + "_" + env_name] = env2key.pop(env_name) kubernetes.use_secret_as_env(task=task, secret_name='s3-secret', secret_key_to_env=env2key) - - @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import GB, get_logger - from workflow_support.runtime_utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of 
actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) \ No newline at end of file diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py index ef00b0e92..0e9951282 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py @@ -111,3 +111,50 @@ def load_from_json(js: str) -> dict[str, Any]: except Exception as e: logger.warning(f"Failed to load parameters {js} with error {e}") sys.exit(1) + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import GB, get_logger + from workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index a77e3a2b4..4c9ce970e 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -38,9 +38,9 @@ # a result, instead of creating a 
component we are creating it in place here. @dsl.component(base_image=base_kfp_image) def compute_exec_params(worker_options: str, actor_options: str) -> str: - from workflow_support.compile_utils import ComponentUtils + from workflow_support.runtime_utils import KFPUtils - return ComponentUtils.default_compute_execution_params(worker_options, actor_options) + return KFPUtils.default_compute_execution_params(worker_options, actor_options) # create Ray cluster From 8ff943493beff1b2dc7cdb70b20e3f83f39a1297 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 3 Jun 2024 17:35:09 +0300 Subject: [PATCH 23/64] Minor fix. Signed-off-by: Revital Sur --- transforms/universal/noop/kfp_ray/v2/noop_wf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index 4c9ce970e..4107fee2b 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -133,7 +133,7 @@ def noop( additional_params=additional_params, ) ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - #ray_cluster.after(compute_exec_params) + ray_cluster.after(compute_exec_params) # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, From 97e51c17c4e1059805af9e299f168c5820965c3a Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Tue, 4 Jun 2024 11:37:46 +0300 Subject: [PATCH 24/64] add input paramters to the pipeline Signed-off-by: Alexey Roytman --- transforms/.make.transforms_workflows | 6 +- .../universal/noop/kfp_ray/v2/noop_wf.py | 117 ++++++++++-------- 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index e39176d7a..b4abf02b0 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -56,9 +56,9 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen $(MAKE) -C ${REPOROOT}/transforms .defaults.python-lib-src-venv . ${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client; \ - # pip install -e $(DPK_PYTHON_LIB_DIR); \ - # pip install -e $(DPK_RAY_LIB_DIR); \ - # pip install kfp==$(KFP) --extra-index-url https://pypi.org/simple; \ + pip install -e $(DPK_PYTHON_LIB_DIR); \ + pip install -e $(DPK_RAY_LIB_DIR); \ + pip install kfp==$(KFP) --extra-index-url https://pypi.org/simple; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); @# Help: Create the virtual environment common to all workflows diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py index 4107fee2b..e158e8e3b 100644 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v2/noop_wf.py @@ -23,7 +23,7 @@ # FIXME: create a component to get run id RUN_ID = uuid.uuid4().hex -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" @@ -34,13 +34,33 @@ # path to kfp component specifications files component_spec_path = "../../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
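A quick worked example of the sizing helper used here, assuming the workflow_support package from this series is installed and using the noop pipeline defaults shown below (2 workers with 2 CPUs and 4 GB each, actors requesting 0.8 CPU):

from workflow_support.runtime_utils import KFPUtils

# Defaults taken from the noop pipeline: 2 workers x (2 CPU, 4 GB), actors requesting 0.8 CPU.
worker_options = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4}'
actor_options = "{'num_cpus': 0.8}"
# CPU bound: int(4 * 0.85 / 0.8) = 4; memory bound: int(8 * 0.85 / 1) = 6; min gives "4"
print(KFPUtils.default_compute_execution_params(worker_options, actor_options))

With these defaults the component returns the string "4" for runtime_num_workers.
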
@dsl.component(base_image=base_kfp_image) -def compute_exec_params(worker_options: str, actor_options: str) -> str: +def compute_exec_params(worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, + ) -> dict: from workflow_support.runtime_utils import KFPUtils - - return KFPUtils.default_compute_execution_params(worker_options, actor_options) + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, + actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, + } # create Ray cluster @@ -58,25 +78,25 @@ def compute_exec_params(worker_options: str, actor_options: str) -> str: description="Pipeline for noop", ) def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', + # Ray cluster + ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # noop parameters + noop_sleep_sec: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', ): """ Pipeline to execute NOOP transform @@ -117,48 +137,47 @@ def noop( ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): - # 
compute execution params - compute_exec_params_task = compute_exec_params( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params_task, ONE_HOUR_SEC * 2) + # compute execution params + compute_exec_params_task = compute_exec_params( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=RUN_ID, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, + ) + ComponentUtils.add_settings_to_component(compute_exec_params_task, ONE_HOUR_SEC * 2) # start Ray cluster - ray_cluster = create_ray_op( + ray_cluster = create_ray_op( ray_name=ray_name, run_id=RUN_ID, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) # Execute job - execute_job = execute_ray_jobs_op( + execute_job = execute_ray_jobs_op( ray_name=ray_name, run_id=RUN_ID, additional_params=additional_params, # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": "{'input_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/input/', 'output_folder': 'dev-code-datasets/data-prep-labs/kfp-v2/noop/output/'}", - "data_max_files": -1, - "data_num_samples": -1, - "runtime_num_workers": "1", - "runtime_worker_options": "{'num_cpus': 0.8}", - "runtime_pipeline_id": "{'num_cpus': 0.8}", - "runtime_job_id": RUN_ID, - "runtime_code_location": "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - "noop_sleep_sec": 10, - }, + exec_params=compute_exec_params_task.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) # Configure the pipeline level to one week (in seconds) + + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) From e5127bdffbbffac79663eb7aa6065a700e7c320d Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 4 Jun 2024 13:58:44 +0300 Subject: [PATCH 25/64] Change kfp_v1_workflow_support. 
Signed-off-by: Revital Sur --- data-processing-lib/ray/pyproject.toml | 1 + .../src/workflow_support/compile_utils/__init__.py | 6 ++++++ .../components_utils.py => compile_utils/component.py} | 0 .../src/workflow_support/pipeline_utils/__init__.py | 1 + .../{utils => pipeline_utils}/pipeline_utils.py | 0 .../{utils => pipeline_utils}/pipelines_tests_utils.py | 0 .../src/workflow_support/runtime_utils/__init__.py | 2 ++ .../{utils => runtime_utils}/kfp_utils.py | 0 .../{utils => runtime_utils}/remote_jobs_utils.py | 0 .../src/workflow_support/utils/__init__.py | 4 ---- transforms/universal/noop/kfp_ray/v1/noop_wf.py | 9 ++++----- 11 files changed, 14 insertions(+), 9 deletions(-) create mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/{utils/components_utils.py => compile_utils/component.py} (100%) create mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/{utils => pipeline_utils}/pipeline_utils.py (100%) rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/{utils => pipeline_utils}/pipelines_tests_utils.py (100%) create mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/{utils => runtime_utils}/kfp_utils.py (100%) rename kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/{utils => runtime_utils}/remote_jobs_utils.py (100%) delete mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index 29565b73e..3acaf7197 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -10,6 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ + "data-prep-toolkit==0.2.0", "ray[default]==2.9.3", # These two are to fix security issues identified by quay.io "fastapi>=0.109.1", diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py new file mode 100644 index 000000000..6b99a6be1 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/__init__.py @@ -0,0 +1,6 @@ +from workflow_support.compile_utils.component import ( + ONE_HOUR_SEC, + ONE_DAY_SEC, + ONE_WEEK_SEC, + ComponentUtils +) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/components_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py new file mode 100644 index 000000000..0e80d97a2 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/__init__.py @@ -0,0 +1 @@ +from 
workflow_support.pipeline_utils.pipeline_utils import PipelinesUtils diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipeline_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipeline_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipeline_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/pipelines_tests_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py new file mode 100644 index 000000000..8d2cdd648 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/__init__.py @@ -0,0 +1,2 @@ +from workflow_support.runtime_utils.kfp_utils import KFPUtils +from workflow_support.runtime_utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/kfp_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py similarity index 100% rename from kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/remote_jobs_utils.py rename to kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py deleted file mode 100644 index dc57ef4f4..000000000 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from workflow_support.utils.kfp_utils import KFPUtils -from workflow_support.utils.pipeline_utils import PipelinesUtils -from workflow_support.utils.components_utils import ComponentUtils, ONE_HOUR_SEC, ONE_DAY_SEC, ONE_WEEK_SEC -from workflow_support.utils.remote_jobs_utils import RayRemoteJobs, execute_ray_jobs diff --git a/transforms/universal/noop/kfp_ray/v1/noop_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_wf.py index be629a3f6..b30b2a403 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/v1/noop_wf.py @@ -13,13 +13,12 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from workflow_support.utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, +from workflow_support.compile_utils import ( + ONE_HOUR_SEC, + ONE_WEEK_SEC, + ComponentUtils, 
) - task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" # the name of the job script From 8627a46b536ef8bbc8f27dd5049337a9fecaae75 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Tue, 4 Jun 2024 15:06:26 +0300 Subject: [PATCH 26/64] remove Dockerfile_v2, update .make.transforms_workflows Signed-off-by: Alexey Roytman --- kfp/kfp_ray_components/Dockerfile_v2 | 25 ------------------------- transforms/.make.transforms_workflows | 3 --- 2 files changed, 28 deletions(-) delete mode 100644 kfp/kfp_ray_components/Dockerfile_v2 diff --git a/kfp/kfp_ray_components/Dockerfile_v2 b/kfp/kfp_ray_components/Dockerfile_v2 deleted file mode 100644 index 922ac070e..000000000 --- a/kfp/kfp_ray_components/Dockerfile_v2 +++ /dev/null @@ -1,25 +0,0 @@ -FROM docker.io/rayproject/ray:2.9.3-py310 - -ARG BUILD_DATE -ARG GIT_COMMIT - -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT - -# install libraries -COPY requirements.txt requirements.txt -RUN pip install kfp==2.7.0 --extra-index-url https://pypi.org/simple -RUN pip install kfp-kubernetes --extra-index-url https://pypi.org/simple -RUN pip install --no-cache-dir -r requirements.txt - -# install data processing and kfp support libs -# Copy in the frameworks source/project and install them -# This is expected to be placed in the docker context before this is run (see the make image). -COPY --chown=ray:users data-processing-lib/ data-processing-lib/ -RUN cd data-processing-lib && pip install --no-cache-dir -e . -COPY --chown=ray:users kfp_support_lib_v2/ kfp_support_lib_v2/ -RUN cd kfp_support_lib_v2 && pip install --no-cache-dir -e . -# remove credentials-containing file -RUN rm requirements.txt -# components -COPY ./src /pipelines/component/src diff --git a/transforms/.make.transforms_workflows b/transforms/.make.transforms_workflows index b4abf02b0..b0c73c60b 100644 --- a/transforms/.make.transforms_workflows +++ b/transforms/.make.transforms_workflows @@ -56,9 +56,6 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen $(MAKE) -C ${REPOROOT}/transforms .defaults.python-lib-src-venv . 
${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client; \ - pip install -e $(DPK_PYTHON_LIB_DIR); \ - pip install -e $(DPK_RAY_LIB_DIR); \ - pip install kfp==$(KFP) --extra-index-url https://pypi.org/simple; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); @# Help: Create the virtual environment common to all workflows From 245610ddf7464e60611995273a003cf5acae5a1c Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Tue, 4 Jun 2024 21:09:38 +0300 Subject: [PATCH 27/64] first universal pipeline Signed-off-by: Alexey Roytman --- .make.versions | 2 +- kfp/kfp_ray_components/Makefile | 2 +- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- .../executeRayJobComponent.yaml | 4 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../runtime_utils/remote_jobs_utils.py | 2 +- transforms/universal/noop/kfp_ray/Makefile | 32 +++ transforms/universal/noop/kfp_ray/noop_wf.py | 194 ++++++++++++++++++ 9 files changed, 234 insertions(+), 8 deletions(-) create mode 100644 transforms/universal/noop/kfp_ray/Makefile create mode 100644 transforms/universal/noop/kfp_ray/noop_wf.py diff --git a/.make.versions b/.make.versions index 73bb98ce0..e4b38765d 100644 --- a/.make.versions +++ b/.make.versions @@ -17,7 +17,7 @@ EDEDUP_VERSION=0.4.0 FDEDUP_VERSION=0.4.0 FILTER_VERSION=0.4.0 FILTER_SPARK_VERSION=0.2.0 -NOOP_VERSION=0.9.0 +NOOP_VERSION=0.8.0 NOOP_SPARK_VERSION=0.2.0 RESIZE_VERSION=0.4.0 LANG_ID_VERSION=0.4.0 diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index a1e4fe730..6d60fef6d 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -51,7 +51,7 @@ DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) rm -rf workflow_support_lib .PHONY: image -image: Dockerfile Dockerfile_v2 requirements.txt +image: Dockerfile requirements.txt $(MAKE) reconcile-requirements $(MAKE) .lib-src-image diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index f86af3991..71df1893a 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index d62312d0c..41d03fd5d 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -8,7 +8,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. 
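The image target in kfp/kfp_ray_components/Makefile now invokes reconcile-requirements before building, which is what keeps the image tags hard-coded in the component specs (the kfp-data-processing:0.2.0-v2 references changed just above) aligned with the KFP_DOCKER_VERSION variables used by the Makefiles. What follows is a minimal sketch of that kind of reconciliation step, written in Python purely for illustration: only KFP_DOCKER_VERSION, .make.versions and the component spec file names come from this patch series, while the function names, the regex and the overall approach are assumptions, not the repository's actual implementation (which is driven from make).

    # Illustrative sketch only: align the image tag in the KFP component specs
    # with the KFP_DOCKER_VERSION value taken from .make.versions. Paths, names
    # and the regex here are assumptions made for the example.
    import re
    from pathlib import Path

    def read_kfp_docker_version(versions_file: str = ".make.versions") -> str:
        # Pull KFP_DOCKER_VERSION=<tag> out of the make variables file.
        for line in Path(versions_file).read_text().splitlines():
            if line.startswith("KFP_DOCKER_VERSION="):
                return line.split("=", 1)[1].strip()
        raise ValueError("KFP_DOCKER_VERSION not found in " + versions_file)

    def reconcile_component_image(spec_path: str, tag: str) -> None:
        # Rewrite the tag that follows "kfp-data-processing:" in the spec file.
        text = Path(spec_path).read_text()
        text = re.sub(r"(kfp-data-processing:)[\w.\-]+", r"\g<1>" + tag, text)
        Path(spec_path).write_text(text)

    if __name__ == "__main__":
        tag = read_kfp_docker_version()
        for spec in ("createRayClusterComponent.yaml",
                     "deleteRayClusterComponent.yaml",
                     "executeRayJobComponent.yaml",
                     "executeRayJobComponent_multi_s3.yaml"):
            reconcile_component_image(spec, tag)
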
diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index d339bd05b..2804886ef 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -6,13 +6,13 @@ inputs: - { name: run_id, type: String, description: "The KFP Run ID" } - { name: additional_params, type: String, description: "additional parameters" } # The component converts the dictionary to json string - - { name: exec_params, type: dict, description: "job parameters" } + - { name: exec_params, type: JsonObject, description: "job parameters" } - { name: exec_script_name, type: String, description: "transform script name" } - { name: server_url, type: String, default: "", description: "url of api server" } implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 0c6c549fa..ca8f44a55 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. 
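The change of exec_params from dict to JsonObject in executeRayJobComponent.yaml reflects how the structured input reaches the container: as the spec's own comment says, the dictionary produced by the compute_exec_params step is converted to a JSON string, and the job entry point decodes it back into transform parameters. In the actual components this is handled by the workflow_support runtime helpers (KFPUtils, execute_ray_jobs); the round trip below is only a minimal sketch, with parameter values borrowed from the noop pipeline and the helper name invented for the example.

    import json

    def decode_exec_params(exec_params: str) -> dict:
        # The exec_params input arrives in the container serialized as JSON,
        # so the first step is simply to parse it back into a dict.
        return json.loads(exec_params)

    # What an upstream compute_exec_params step might hand over (values are
    # the noop defaults used elsewhere in this patch series).
    serialized = json.dumps({
        "data_s3_config": "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}",
        "runtime_num_workers": "2",
        "runtime_worker_options": "{'num_cpus': 0.8}",
        "noop_sleep_sec": 10,
    })

    params = decode_exec_params(serialized)
    assert params["noop_sleep_sec"] == 10
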
diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py index fdbab4af3..0b20b28c4 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/remote_jobs_utils.py @@ -30,7 +30,7 @@ environment_variables_decoder, volume_decoder, ) -from workflow_support.utils import KFPUtils +from workflow_support.runtime_utils import KFPUtils from ray.job_submission import JobStatus diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile new file mode 100644 index 000000000..7dfb5941e --- /dev/null +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -0,0 +1,32 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.transforms_workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: workflow-build +workflow-build: workflow-venv + @for file in $(YAML_WF); do \ + $(MAKE) $$file; \ + done + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=noop_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py new file mode 100644 index 000000000..189b8ad4f --- /dev/null +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -0,0 +1,194 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os +import uuid + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + +import kfp.compiler as compiler +import kfp.dsl as dsl + +# if os.getenv("KFPv2", "0") == "1": +import kfp.components as comp + +# else: +# import kfp.components as comp + +RUN_ID = uuid.uuid4().hex + +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" + +# the name of the job script +EXEC_SCRIPT_NAME: str = "noop_transform.py" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" + +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" + + +# compute execution parameters. Here different tranforms might need different implementations. 
As +# a result, instead of creating a component we are creating it in place here. +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, + } + + +if os.getenv("KFPv2", "0") == "1": + compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + +# create Ray cluster +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "noop" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for noop", +) +def noop( + # Ray cluster + ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # noop parameters + noop_sleep_sec: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute NOOP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the 
following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param noop_sleep_sec - noop sleep time + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=RUN_ID, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=RUN_ID, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=RUN_ID, + additional_params=additional_params, + # note that the parameters below are specific for NOOP transform + exec_params=compute_exec_params.output, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # TODO + # Configure the pipeline level to one week (in seconds) + + +# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) From 3ada1576752b0666b051b130f1d048577303bee4 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Tue, 4 Jun 2024 21:39:44 +0300 Subject: [PATCH 28/64] move default_compute_execution_params --- .../compile_utils/component.py | 47 ------------------- .../runtime_utils/kfp_utils.py | 47 +++++++++++++++++++ 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py index ab0c310e1..460b20e23 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/compile_utils/component.py @@ -92,50 +92,3 @@ def add_secret_volume_to_com_function(component: dsl.ContainerOp, secretName: st secret=k8s_client.V1SecretVolumeSource(secret_name=secretName, optional=optional), ) 
component.add_pvolumes({mountPoint: vol}) - - @staticmethod - def default_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # cpus per actor - ) -> str: - """ - This is the most simplistic transform execution parameters computation - :param worker_options: configuration of ray workers - :param actor_options: actor request requirements - :return: number of actors - """ - import sys - - from data_processing.utils import GB, get_logger - from workflow_support.utils import KFPUtils - - logger = get_logger(__name__) - - # convert input - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - # Compute available cluster resources - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_mem = w_options["replicas"] * w_options["memory"] - cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) - logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") - # compute number of actors - n_actors_cpu = int(cluster_cpu * 0.85 / a_options.get("num_cpus", 0.5)) - n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) - n_actors = min(n_actors_cpu, n_actors_memory) - # Check if we need gpu calculations as well - actor_gpu = a_options.get("num_gpus", 0) - if actor_gpu > 0: - n_actors_gpu = int(cluster_gpu / actor_gpu) - n_actors = min(n_actors, n_actors_gpu) - logger.info(f"Number of actors - {n_actors}") - if n_actors < 1: - logger.warning( - f"Not enough cpu/gpu/memory to run transform, " - f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " - f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " - f"required cpu {actor_gpu}, available {cluster_gpu}" - ) - sys.exit(1) - - return str(n_actors) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py index ef00b0e92..feb081dd2 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/runtime_utils/kfp_utils.py @@ -111,3 +111,50 @@ def load_from_json(js: str) -> dict[str, Any]: except Exception as e: logger.warning(f"Failed to load parameters {js} with error {e}") sys.exit(1) + + @staticmethod + def default_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # cpus per actor + ) -> str: + """ + This is the most simplistic transform execution parameters computation + :param worker_options: configuration of ray workers + :param actor_options: actor request requirements + :return: number of actors + """ + import sys + + from data_processing.utils import GB, get_logger + from workflow_support.runtime_utils import KFPUtils + + logger = get_logger(__name__) + + # convert input + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + # Compute available cluster resources + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_mem = w_options["replicas"] * w_options["memory"] + cluster_gpu = w_options["replicas"] * w_options.get("gpu", 0.0) + logger.info(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_mem}, GPUs {cluster_gpu}") + # compute number of actors + n_actors_cpu = int(cluster_cpu * 0.85 / 
a_options.get("num_cpus", 0.5)) + n_actors_memory = int(cluster_mem * 0.85 / (a_options.get("memory", GB) / GB)) + n_actors = min(n_actors_cpu, n_actors_memory) + # Check if we need gpu calculations as well + actor_gpu = a_options.get("num_gpus", 0) + if actor_gpu > 0: + n_actors_gpu = int(cluster_gpu / actor_gpu) + n_actors = min(n_actors, n_actors_gpu) + logger.info(f"Number of actors - {n_actors}") + if n_actors < 1: + logger.warning( + f"Not enough cpu/gpu/memory to run transform, " + f"required cpu {a_options.get('num_cpus', .5)}, available {cluster_cpu}, " + f"required memory {a_options.get('memory', 1)}, available {cluster_mem}, " + f"required cpu {actor_gpu}, available {cluster_gpu}" + ) + sys.exit(1) + + return str(n_actors) \ No newline at end of file From e92803a4e4047d61d27cf0807d59d29de3e1a2d2 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Tue, 4 Jun 2024 22:06:27 +0300 Subject: [PATCH 29/64] fix imports in ray_components --- kfp/kfp_ray_components/Dockerfile | 1 - kfp/kfp_ray_components/Makefile | 3 +-- kfp/kfp_ray_components/src/create_ray_cluster.py | 10 +--------- kfp/kfp_ray_components/src/delete_ray_cluster.py | 11 +---------- kfp/kfp_ray_components/src/execute_ray_job.py | 12 +----------- .../src/execute_ray_job_multi_s3.py | 11 +---------- kfp/kfp_ray_components/src/subworkflow.py | 11 +++-------- 7 files changed, 8 insertions(+), 51 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 81b391eec..90bd04549 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -2,7 +2,6 @@ FROM docker.io/rayproject/ray:2.9.3-py310 ARG BUILD_DATE ARG GIT_COMMIT -ARG KFP_v2 LABEL build-date=$BUILD_DATE LABEL git-commit=$GIT_COMMIT diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 6d60fef6d..78c3dccb0 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -34,8 +34,7 @@ DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ - --build-arg KFP_v2=$(KFPv2) \ - --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . + --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) .PHONY: .lib-src-image diff --git a/kfp/kfp_ray_components/src/create_ray_cluster.py b/kfp/kfp_ray_components/src/create_ray_cluster.py index 900e482ca..a2b16d577 100644 --- a/kfp/kfp_ray_components/src/create_ray_cluster.py +++ b/kfp/kfp_ray_components/src/create_ray_cluster.py @@ -9,17 +9,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ -import os import sys +from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs -kfp_v2 = os.getenv("KFP_v2", 0) -print(kfp_v2) -if kfp_v2 == "1": - from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs - print(f"Load KFPv2 libs") -else: - from workflow_support.utils import KFPUtils, RayRemoteJobs - print(f"Load KFPv1 libs") def start_ray_cluster( name: str, # name of Ray cluster diff --git a/kfp/kfp_ray_components/src/delete_ray_cluster.py b/kfp/kfp_ray_components/src/delete_ray_cluster.py index 02eeeb650..55cf2f34b 100644 --- a/kfp/kfp_ray_components/src/delete_ray_cluster.py +++ b/kfp/kfp_ray_components/src/delete_ray_cluster.py @@ -9,17 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - -import os import sys - -kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == "1": - from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs - print(f"Load KFPv2 libs") -else: - from workflow_support.utils import KFPUtils, RayRemoteJobs - print(f"Load KFPv1 libs") +from workflow_support.runtime_utils import KFPUtils, RayRemoteJobs # Cleans and shutdowns the Ray cluster def cleanup_ray_cluster( diff --git a/kfp/kfp_ray_components/src/execute_ray_job.py b/kfp/kfp_ray_components/src/execute_ray_job.py index efbb8e723..173ccb06a 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job.py +++ b/kfp/kfp_ray_components/src/execute_ray_job.py @@ -9,17 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - -import os - -kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == "1": - from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs - print(f"Load KFPv2 libs") -else: - from workflow_support.utils import KFPUtils, execute_ray_jobs - print(f"Load KFPv1 libs") - +from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": import argparse diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 7493c247f..b7b5d9863 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -10,16 +10,7 @@ # limitations under the License. 
################################################################################ -import os - -kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == "1": - from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs - print(f"Load KFPv2 libs") -else: - from workflow_support.utils import KFPUtils, execute_ray_jobs - print(f"Load KFPv1 libs") - +from workflow_support.runtime_utils import KFPUtils, execute_ray_jobs if __name__ == "__main__": import argparse diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index a57e1406d..f15877d86 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -1,13 +1,8 @@ -import os import sys -kfp_v2 = os.getenv("KFP_v2", 0) -if kfp_v2 == 1: - from workflow_support.runtime_utils import KFPUtils, PipelinesUtils - print(f"Load KFPv2 libs") -else: - from workflow_support.utils import KFPUtils, PipelinesUtils - print(f"Load KFPv1 libs") +from workflow_support.runtime_utils import KFPUtils +from workflow_support.pipeline_utils import PipelinesUtils + from data_processing.utils import ParamsUtils From b815751b6ab8f812e83cc5a496948432111af8b6 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 09:41:08 +0300 Subject: [PATCH 30/64] Fixes after testing. Signed-off-by: Revital Sur --- .make.defaults | 7 - .make.versions | 1 + data-processing-lib/ray/pyproject.toml | 2 +- kfp/kfp_ray_components/Makefile | 15 +- kfp/kfp_support_lib/Makefile_old | 63 ------ .../kfp_v1_workflow_support/Makefile | 17 +- .../kfp_v1_workflow_support/pyproject.toml | 2 +- .../test/pipeline_utils_test.py | 2 +- .../test/ray_remote_jobs_test.py | 2 +- .../kfp_v2_workflow_support/Makefile | 12 +- .../python_apiserver_client/Makefile | 12 +- .../python_apiserver_client/pyproject.toml | 2 +- transforms/universal/noop/Makefile | 6 +- .../noop/kfp_ray/{v1 => }/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/v1/Makefile | 32 --- .../universal/noop/kfp_ray/v1/noop_wf.py | 160 --------------- transforms/universal/noop/kfp_ray/v2/Makefile | 32 --- .../universal/noop/kfp_ray/v2/noop_wf.py | 186 ------------------ 18 files changed, 40 insertions(+), 515 deletions(-) delete mode 100644 kfp/kfp_support_lib/Makefile_old rename transforms/universal/noop/kfp_ray/{v1 => }/noop_multiple_wf.py (99%) delete mode 100644 transforms/universal/noop/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/noop/kfp_ray/v1/noop_wf.py delete mode 100644 transforms/universal/noop/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/noop/kfp_ray/v2/noop_wf.py diff --git a/.make.defaults b/.make.defaults index 46bf3dab2..88a45621e 100644 --- a/.make.defaults +++ b/.make.defaults @@ -56,12 +56,6 @@ DPK_SPARK_LIB_DIR=$(REPOROOT)/data-processing-lib/spark KFPv2?=0 -ifeq ($(KFPv2), 0) - PIPELINE_PATH="kfp_ray/v1" -else - PIPELINE_PATH="kfp_ray/v2" -endif - ####################################################################################### # Lists all targets and optional help text found in the target. # Adapted from https://stackoverflow.com/a/65243296/45375 @@ -191,7 +185,6 @@ __check_defined = \ .defaults.image:: # Must be called with a DOCKER_IMAGE= settings. 
@# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE) $(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) \ - -f $(DOCKER_FILE) \ --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ diff --git a/.make.versions b/.make.versions index e4b38765d..bebeb8f1a 100644 --- a/.make.versions +++ b/.make.versions @@ -8,6 +8,7 @@ DPK_LIB_VERSION=0.2.0 DPK_LIB_KFP_VERSION=0.2.0 DPK_LIB_KFP_VERSION_v2=0.2.0 +DPK_LIB_KUBERAY_CLIENT=0.1.0 # Begin transform versions/tags BLOCKLIST_VERSION=0.4.0 diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index 3acaf7197..88f193e7d 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_ray" -version = "0.1.1" +version = "0.2.0" requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Ray" license = {text = "Apache-2.0"} diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 78c3dccb0..096ac0a99 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -11,12 +11,10 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed ' include makeenv ifeq ($(KFPv2), 1) -DOCKER_FILE=Dockerfile DOCKER_IMAGE_NAME=kfp-data-processing_v2 DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION_v2} WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support else -DOCKER_FILE=Dockerfile DOCKER_IMAGE_NAME=kfp-data-processing DOCKER_IMAGE_VERSION=${KFP_DOCKER_VERSION} WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support @@ -26,24 +24,13 @@ endif #DOCKER_IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_VERSION} DOCKER_IMG=$(DOCKER_LOCAL_IMAGE) -.PHONY: .kfp_image -.kfp_image:: # Must be called with a DOCKER_IMAGE= settings. - @# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE) - $(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) \ - -f $(DOCKER_FILE) \ - --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ - --build-arg BASE_IMAGE=$(BASE_IMAGE) \ - --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ - --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . - $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) - .PHONY: .lib-src-image .lib-src-image:: $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray $(MAKE) .defaults.copy-lib LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/python_apiserver_client LIB_NAME=python_apiserver_client $(MAKE) .defaults.copy-lib LIB_PATH=$(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB) LIB_NAME=workflow_support_lib - $(MAKE) .kfp_image + $(MAKE) .defaults.image rm -rf data-processing-lib-ray rm -rf data-processing-lib-python rm -rf python_apiserver_client diff --git a/kfp/kfp_support_lib/Makefile_old b/kfp/kfp_support_lib/Makefile_old deleted file mode 100644 index 2b3b3f428..000000000 --- a/kfp/kfp_support_lib/Makefile_old +++ /dev/null @@ -1,63 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../.. -include ${REPOROOT}/.make.versions -include ${REPOROOT}/kfp/requirements.env - -# Include the common rules. -# Use "make help" to see them. 
-include ../../.make.defaults - -# Command to run pytest -PYTHON_VERSION=$(shell $(PYTHON) --version) -VENV_ACTIVATE=venv/bin/activate - -DEPLOY_KUBEFLOW ?= 0 - -clean:: - @# Help: Clean up the distribution build and the venv - rm -r dist venv || true - rm -rf src/*egg-info || true - rm -rf *.back || true - - -.check-env:: .check_python_version - @echo "Checks passed" - -update-toml:: .check-env - @# Help: Copy the Makefile distribution version into the pyproject.toml - sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml - sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml - sed -i.back 's/kfp==[0-9].*/kfp==${KFP}",/' pyproject.toml - sed -i.back 's/ray==[0-9].*/ray==${RAY}",/' pyproject.toml - -build:: update-toml venv - @# Help: Build the distribution for publishing to a pypi - rm -r dist || true - rm -rf src/*egg-info || true - ${PYTHON} -m pip install --upgrade build - ${PYTHON} -m build - -publish:: .check-env -publish:: - @# Help: Publish the wheel to testpypi - if [ -d "dist"]; then rm -r dist; fi - ${PYTHON} -m pip install --upgrade build - ${PYTHON} -m twine check dist/* - ${PYTHON} -m twine upload --verbose --non-interactive dist/* - -venv:: pyproject.toml .check-env .defaults.venv - $(MAKE) .defaults.install-python-lib-src-venv - . ${VENV_ACTIVATE}; \ - pip install -e .; \ - pip install pytest pytest-cov; - @# Help: Create the virtual environment using pyproject.toml - -test:: venv - @# Help: Use the already-built virtual environment to run pytest on the test directory. - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; -ifeq ($(DEPLOY_KUBEFLOW),1) - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; -endif diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 581c2e305..51cbb7396 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -28,14 +28,20 @@ update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/python_apiserver_client==[0-9].*/python_apiserver_client==${DPK_LIB_KUBERAY_CLIENT}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v1}",/' pyproject.toml + sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml build:: update-toml venv +ifeq ($(KFPv2), 1) + echo "Skipping build as KFPv2 is defined" +else @# Help: Build the distribution for publishing to a pypi rm -r dist || true rm -rf src/*egg-info || true ${PYTHON} -m pip install --upgrade build ${PYTHON} -m build +endif publish:: .check-env @# Help: Publish the wheel to testpypi @@ -49,15 +55,18 @@ venv:: pyproject.toml .check-env rm -rf venv $(PYTHON) -m venv venv . 
${VENV_ACTIVATE}; \ - pip install -e ../python_apiserver_client; \ pip install -e ../../../data-processing-lib/python; \ - pip install -e .; \ - pip install ray==${RAY} \ + pip install -e ../python_apiserver_client; \ + pip install -e .; \ pip install pytest pytest-cov -test:: venv +test:: venv +ifeq ($(KFPv2), 1) + echo "Skipping test as KFPv2 is defined" +else @# Help: Use the already-built virtual environment to run pytest on the test directory. ifeq ($(DEPLOY_KUBEFLOW),1) . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; endif +endif diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index cc3d6e625..cc94adf9b 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ "ray==2.9.3", "requests", "data-prep-toolkit==0.2.0", - "python_apiserver_client", + "python_apiserver_client==0.1.0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py index 77cca5635..200bf1676 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/pipeline_utils_test.py @@ -10,7 +10,7 @@ # limitations under the License. ################################################################################ -from workflow_support.utils import PipelinesUtils +from workflow_support.pipeline_utils import PipelinesUtils server_url = "http://localhost:8080/" diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py index 7b9ad2c13..f409550e9 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py @@ -12,7 +12,7 @@ from configmaps import ConfigmapsManager from python_apiserver_client.params import ConfigMapVolume -from workflow_support.utils import RayRemoteJobs +from workflow_support.runtime_utils import RayRemoteJobs server_url = "http:localhost:8080/ray/" diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile index 6d6540d84..b8565bb0c 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -29,13 +29,18 @@ update-toml:: .check-env sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION_v2}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v2}",/' pyproject.toml + sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml build:: update-toml venv +ifeq ($(KFPv2), 0) + echo "Skipping build as KFPv2 is not defined" +else @# Help: Build the distribution for publishing to a pypi rm -r dist || true rm -rf src/*egg-info || true ${PYTHON} -m pip install --upgrade build ${PYTHON} -m build +endif publish:: .check-env @# Help: Publish the wheel to testpypi @@ -49,16 +54,19 @@ venv:: pyproject.toml .check-env rm -rf venv $(PYTHON) -m venv venv . 
${VENV_ACTIVATE}; \ - pip install -e ../python_apiserver_client; \ pip install -e ../../../data-processing-lib/python; \ + pip install -e ../python_apiserver_client; \ pip install -e .; \ - pip install ray==${RAY} \ pip install pytest pytest-cov test:: venv +ifeq ($(KFPv2), 0) + echo "Skipping test as KFPv2 is not defined" +else @# Help: Use the already-built virtual environment to run pytest on the test directory. ifeq ($(DEPLOY_KUBEFLOW),1) . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) ray_remote_jobs_test.py; . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) pipeline_utils_test.py; endif +endif diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile index 75807e5b6..808cdbc6a 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/Makefile +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -24,7 +24,12 @@ clean:: .check-env:: .check_python_version @echo "Checks passed" -build:: .check-env venv +update-toml:: .check-env + @# Help: Copy the Makefile distribution version into the pyproject.toml + sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KUBERAY_CLIENT}'"/' pyproject.toml + sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + +build:: update-toml venv @# Help: Build the distribution for publishing to a pypi rm -r dist || true rm -rf src/*egg-info || true @@ -43,9 +48,8 @@ venv::pyproject.toml .check-env rm -rf venv $(PYTHON) -m venv venv . ${VENV_ACTIVATE}; \ - pip install --upgrade pip; - pip install ray==${RAY}; \ -= pip install -e ../../../data-processing-lib/python; \ + pip install --upgrade pip; \ + pip install -e ../../../data-processing-lib/python; \ pip install -e .; \ pip install pytest pytest-cov diff --git a/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml index 1ea8f9238..e15bd6583 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml +++ b/kfp/kfp_support_lib/python_apiserver_client/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" package_dir = ["src"] [project] name = "python_apiserver_client" -version = "0.0.1" +version = "0.1.0" dependencies = [ "requests", "kubernetes", diff --git a/transforms/universal/noop/Makefile b/transforms/universal/noop/Makefile index 6ca460863..f3f7ce71e 100644 --- a/transforms/universal/noop/Makefile +++ b/transforms/universal/noop/Makefile @@ -47,11 +47,7 @@ workflow-venv: .PHONY: workflow-build workflow-build: -ifeq ($(KFPv2), 0) - $(MAKE) -C kfp_ray/v1 workflow-build -else - $(MAKE) -C kfp_ray/v2 workflow-build -endif + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test diff --git a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py similarity index 99% rename from transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py rename to transforms/universal/noop/kfp_ray/noop_multiple_wf.py index aad211b06..2710c8c30 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -20,7 +20,7 @@ ) -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" diff --git a/transforms/universal/noop/kfp_ray/v1/Makefile 
b/transforms/universal/noop/kfp_ray/v1/Makefile deleted file mode 100644 index 1a49cbd49..000000000 --- a/transforms/universal/noop/kfp_ray/v1/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -PYTHON_WF := $(shell find ./ -name '*_wf.py') -YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=noop_wf.yaml - -.PHONY: workflow-upload -workflow-upload: workflow-build - @for file in $(YAML_WF); do \ - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - @for file in $(PYTHON_WF); do \ - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ - done diff --git a/transforms/universal/noop/kfp_ray/v1/noop_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_wf.py deleted file mode 100644 index b30b2a403..000000000 --- a/transforms/universal/noop/kfp_ray/v1/noop_wf.py +++ /dev/null @@ -1,160 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from workflow_support.compile_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "noop" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for noop", -) -def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' - '"image": "' + task_image + '"}', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param noop_sleep_sec - noop sleep time - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - 
ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/noop/kfp_ray/v2/Makefile b/transforms/universal/noop/kfp_ray/v2/Makefile deleted file mode 100644 index 1a49cbd49..000000000 --- a/transforms/universal/noop/kfp_ray/v2/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -PYTHON_WF := $(shell find ./ -name '*_wf.py') -YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=noop_wf.yaml - -.PHONY: workflow-upload -workflow-upload: workflow-build - @for file in $(YAML_WF); do \ - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - @for file in $(PYTHON_WF); do \ - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ - done diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py deleted file mode 100644 index e158e8e3b..000000000 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ /dev/null @@ -1,186 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from workflow_support.compile_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -import uuid - -# FIXME: create a component to get run id -RUN_ID = uuid.uuid4().hex - -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. -@dsl.component(base_image=base_kfp_image) -def compute_exec_params(worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: str, - noop_sleep_sec: int, - ) -> dict: - from workflow_support.runtime_utils import KFPUtils - return { - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, - actor_options), - "runtime_worker_options": actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - } - - -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "noop" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for noop", -) -def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param noop_sleep_sec - noop sleep time - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params_task = compute_exec_params( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=RUN_ID, - runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, - ) - ComponentUtils.add_settings_to_component(compute_exec_params_task, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=RUN_ID, - 
ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=RUN_ID, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params=compute_exec_params_task.output, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - - -# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) From 48a51657a5b8b50114bf7ccefbec728d51187dd9 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 10:13:40 +0300 Subject: [PATCH 31/64] add noop_multiple_wf.py --- .../noop/kfp_ray/{v1 => }/noop_multiple_wf.py | 83 +++++--- transforms/universal/noop/kfp_ray/noop_wf.py | 20 +- transforms/universal/noop/kfp_ray/v1/Makefile | 32 --- .../universal/noop/kfp_ray/v1/noop_wf.py | 160 --------------- transforms/universal/noop/kfp_ray/v2/Makefile | 32 --- .../universal/noop/kfp_ray/v2/noop_wf.py | 186 ------------------ 6 files changed, 64 insertions(+), 449 deletions(-) rename transforms/universal/noop/kfp_ray/{v1 => }/noop_multiple_wf.py (72%) delete mode 100644 transforms/universal/noop/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/noop/kfp_ray/v1/noop_wf.py delete mode 100644 transforms/universal/noop/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/noop/kfp_ray/v2/noop_wf.py diff --git a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py similarity index 72% rename from transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py rename to transforms/universal/noop/kfp_ray/noop_multiple_wf.py index aad211b06..90245427f 100644 --- a/transforms/universal/noop/kfp_ray/v1/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -9,33 +9,64 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import os +import uuid + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler -import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) +import kfp.components as comp +RUN_ID = uuid.uuid4().hex -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. 
Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, + } + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -107,7 +138,7 @@ def noop( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -115,12 +146,19 @@ def noop( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=RUN_ID, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=RUN_ID, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -131,20 +169,10 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=RUN_ID, additional_params=additional_params, # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - 
"noop_sleep_sec": noop_sleep_sec, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -152,8 +180,9 @@ def noop( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 189b8ad4f..75cd0f5ab 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -16,13 +16,8 @@ import kfp.compiler as compiler import kfp.dsl as dsl - -# if os.getenv("KFPv2", "0") == "1": import kfp.components as comp -# else: -# import kfp.components as comp - RUN_ID = uuid.uuid4().hex task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" @@ -31,12 +26,11 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" - # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( @@ -64,7 +58,10 @@ def compute_exec_params_func( "noop_sleep_sec": noop_sleep_sec, } - +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. 
if os.getenv("KFPv2", "0") == "1": compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) else: @@ -87,7 +84,8 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' + '"image": "' + task_image + '" }', ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' '"image_pull_secret": "", "image": "' + task_image + '"}', server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -184,9 +182,7 @@ def noop( # TODO # Configure the pipeline level to one week (in seconds) - - -# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/noop/kfp_ray/v1/Makefile b/transforms/universal/noop/kfp_ray/v1/Makefile deleted file mode 100644 index 1a49cbd49..000000000 --- a/transforms/universal/noop/kfp_ray/v1/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -PYTHON_WF := $(shell find ./ -name '*_wf.py') -YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=noop_wf.yaml - -.PHONY: workflow-upload -workflow-upload: workflow-build - @for file in $(YAML_WF); do \ - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - @for file in $(PYTHON_WF); do \ - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ - done diff --git a/transforms/universal/noop/kfp_ray/v1/noop_wf.py b/transforms/universal/noop/kfp_ray/v1/noop_wf.py deleted file mode 100644 index b30b2a403..000000000 --- a/transforms/universal/noop/kfp_ray/v1/noop_wf.py +++ /dev/null @@ -1,160 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from workflow_support.compile_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.9.0" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "noop" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for noop", -) -def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' - '"image": "' + task_image + '"}', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - 
wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param noop_sleep_sec - noop sleep time - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/noop/kfp_ray/v2/Makefile b/transforms/universal/noop/kfp_ray/v2/Makefile deleted file mode 100644 index 1a49cbd49..000000000 --- a/transforms/universal/noop/kfp_ray/v2/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -PYTHON_WF := $(shell find ./ -name '*_wf.py') -YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=noop_wf.yaml - -.PHONY: workflow-upload -workflow-upload: workflow-build - @for file in $(YAML_WF); do \ - $(MAKE) .transforms_workflows.upload-pipeline 
PIPELINE_FILE=$$file; \ - done - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - @for file in $(PYTHON_WF); do \ - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ - done diff --git a/transforms/universal/noop/kfp_ray/v2/noop_wf.py b/transforms/universal/noop/kfp_ray/v2/noop_wf.py deleted file mode 100644 index e158e8e3b..000000000 --- a/transforms/universal/noop/kfp_ray/v2/noop_wf.py +++ /dev/null @@ -1,186 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from workflow_support.compile_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -import uuid - -# FIXME: create a component to get run id -RUN_ID = uuid.uuid4().hex - -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing_v2:0.2.0-v2" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. -@dsl.component(base_image=base_kfp_image) -def compute_exec_params(worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: str, - noop_sleep_sec: int, - ) -> dict: - from workflow_support.runtime_utils import KFPUtils - return { - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, - actor_options), - "runtime_worker_options": actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - } - - -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "noop" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for noop", -) -def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param noop_sleep_sec - noop sleep time - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params_task = compute_exec_params( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=RUN_ID, - runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, - ) - ComponentUtils.add_settings_to_component(compute_exec_params_task, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=RUN_ID, - 
ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=RUN_ID, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params=compute_exec_params_task.output, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - - -# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) From 6e35f4d959de9d9324760ac0e70faac4cbdda5c1 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 10:50:05 +0300 Subject: [PATCH 32/64] More fixes. Signed-off-by: Revital Sur --- .../test/configmaps.py | 72 +++++++++++++++++++ .../test/ray_remote_jobs_test.py | 2 +- 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 kfp/kfp_support_lib/kfp_v1_workflow_support/test/configmaps.py diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/configmaps.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/configmaps.py new file mode 100644 index 000000000..65e53e828 --- /dev/null +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/configmaps.py @@ -0,0 +1,72 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from kubernetes import client, config + + +CMAP_VALUE = """ +import ray +import os +import requests + +ray.init() + +@ray.remote +class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + +counter = Counter.remote() + +for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + +# Verify that the correct runtime env was used for the job. +assert requests.__version__ == "2.26.0" +""" +CMAP_NAME = "ray-job-code-sample" + + +class ConfigmapsManager: + """ + Simple support class to manage config maps. 
Assumes local access to Kubectl + """ + + def __init__(self): + config.load_kube_config() + self.api_instance = client.CoreV1Api() + + def list_configmaps(self) -> list[str]: + cm_list = self.api_instance.list_namespaced_config_map(namespace="default").items + return [cm.metadata.name for cm in cm_list] + + def create_code_map(self) -> None: + cmap = client.V1ConfigMap() + cmap.metadata = client.V1ObjectMeta(name=CMAP_NAME) + cmap.data = {"sample_code.py": CMAP_VALUE} + self.api_instance.create_namespaced_config_map(namespace="default", body=cmap) + + def delete_code_map(self) -> None: + try: + self.api_instance.delete_namespaced_config_map(name="ray-job-code-sample", namespace="default") + except Exception as e: + print("config map ray-job-code-sample does not exist") diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py index f409550e9..8aec06bdd 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py @@ -14,7 +14,7 @@ from python_apiserver_client.params import ConfigMapVolume from workflow_support.runtime_utils import RayRemoteJobs -server_url = "http:localhost:8080/ray/" +server_url = "http://localhost:8080/ray" def test_ray_remote_jobs(): """ From 76ea8f9918f9aa0d861bf326da7ca027ba24f55e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 11:12:11 +0300 Subject: [PATCH 33/64] Minor fix. Signed-off-by: Revital Sur --- .../noop/kfp_ray/noop_multiple_wf.py | 161 ------------------ 1 file changed, 161 deletions(-) delete mode 100644 transforms/universal/noop/kfp_ray/noop_multiple_wf.py diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py deleted file mode 100644 index 2710c8c30..000000000 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ /dev/null @@ -1,161 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - - -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. 
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "noop" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for multiple noop", -) -def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' - '"image": "' + task_image + '", "image_pull_policy": "Always" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '", "image_pull_policy": "Always" }', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "[{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}]", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration. 
Note that config here should be an array - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param noop_sleep_sec - noop sleep time - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(noop, __file__.replace(".py", ".yaml")) From caa1de32649c428631e630c28f637549a0de5f32 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 11:25:15 +0300 Subject: [PATCH 34/64] fix run job id --- .../noop/kfp_ray/noop_multiple_wf.py | 68 ++++++----- transforms/universal/noop/kfp_ray/noop_wf.py | 107 ++++++++++-------- 2 files changed, 95 insertions(+), 80 deletions(-) diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 90245427f..f7ff9e4a3 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -10,21 +10,21 @@ # limitations under the License. 
################################################################################ import os -import uuid from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler -import kfp.dsl as dsl import kfp.components as comp +import kfp.dsl as dsl -RUN_ID = uuid.uuid4().hex task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" +RUNTIME_JOB_ID = "runtime_job_id" + # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" @@ -34,16 +34,17 @@ # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. def compute_exec_params_func( - worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: str, - noop_sleep_sec: int, + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_code_location: str, + noop_sleep_sec: int, ) -> dict: + import uuid + from workflow_support.runtime_utils import KFPUtils return { @@ -53,17 +54,20 @@ def compute_exec_params_func( "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), "runtime_worker_options": actor_options, "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, + RUNTIME_JOB_ID: uuid.uuid4().hex, "runtime_code_location": runtime_code_location, "noop_sleep_sec": noop_sleep_sec, } + # KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the # `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. 
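# A minimal, purely illustrative sketch (not part of this patch) of exercising the switch below when
# compiling locally: the KFPv2 environment variable has to be set before the workflow module is imported,
# because the os.getenv() check runs at module import time. The local import and the output file name
# are assumptions.
import os
os.environ["KFPv2"] = "1"                  # select the KFP v2 branch before importing the workflow module
import kfp.compiler as compiler
import noop_multiple_wf                    # hypothetical local import of the module shown in this diff
compiler.Compiler().compile(noop_multiple_wf.noop, "noop_multiple_wf.yaml")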
if os.getenv("KFPv2", "0") == "1": - compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) @@ -137,28 +141,32 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ + + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=compute_exec_params.output[RUNTIME_JOB_ID], server_url=server_url + ) + ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=RUN_ID, - runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=RUN_ID, + run_id=compute_exec_params.output[RUNTIME_JOB_ID], ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -169,7 +177,7 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=RUN_ID, + run_id=compute_exec_params.output[RUNTIME_JOB_ID], additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params=compute_exec_params.output, diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 75cd0f5ab..a1c51a2df 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -10,21 +10,21 @@ # limitations under the License. ################################################################################ import os -import uuid from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler -import kfp.dsl as dsl import kfp.components as comp +import kfp.dsl as dsl -RUN_ID = uuid.uuid4().hex task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" +RUNTIME_JOB_ID = "runtime_job_id" + # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" @@ -34,16 +34,17 @@ # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
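# The comment above is why each transform keeps its own copy of this function. A hedged sketch of what a
# hypothetical transform with its own parameter might use instead (the function and "my_param" are
# illustrative assumptions, not part of this patch; KFPUtils is the same workflow_support.runtime_utils
# helper used by the noop version below).
def my_transform_compute_exec_params_func(
    worker_options: str,
    actor_options: str,
    data_s3_config: str,
    runtime_pipeline_id: str,
    runtime_code_location: str,
    my_param: int,
) -> dict:
    import uuid

    from workflow_support.runtime_utils import KFPUtils

    return {
        "data_s3_config": data_s3_config,
        "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options),
        "runtime_worker_options": actor_options,
        "runtime_pipeline_id": runtime_pipeline_id,
        "runtime_job_id": uuid.uuid4().hex,  # generated inside the component, as in the noop version below
        "runtime_code_location": runtime_code_location,
        "my_param": my_param,  # transform-specific parameter, analogous to noop_sleep_sec
    }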
def compute_exec_params_func( - worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: str, - noop_sleep_sec: int, + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_code_location: str, + noop_sleep_sec: int, ) -> dict: + import uuid + from workflow_support.runtime_utils import KFPUtils return { @@ -53,17 +54,20 @@ def compute_exec_params_func( "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), "runtime_worker_options": actor_options, "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, + RUNTIME_JOB_ID: uuid.uuid4().hex, "runtime_code_location": runtime_code_location, "noop_sleep_sec": noop_sleep_sec, } + # KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the # `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) @@ -82,26 +86,25 @@ def compute_exec_params_func( description="Pipeline for noop", ) def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' - '"image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', + # Ray cluster + ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' '"image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + 
data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # noop parameters + noop_sleep_sec: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', ): """ Pipeline to execute NOOP transform @@ -137,28 +140,32 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ + + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=RUN_ID, server_url=server_url) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=compute_exec_params.output[RUNTIME_JOB_ID], server_url=server_url + ) + ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=RUN_ID, - runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=RUN_ID, + run_id=compute_exec_params.output[RUNTIME_JOB_ID], ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -169,7 +176,7 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=RUN_ID, + run_id=compute_exec_params.output[RUNTIME_JOB_ID], additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params=compute_exec_params.output, From 5fa1596233476bbe4170c1189e989fd5c6a833c0 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 11:53:53 +0300 Subject: [PATCH 35/64] More fixes. 
Signed-off-by: Revital Sur --- .github/workflows/test.yml | 2 ++ kfp/kfp_support_lib/python_apiserver_client/Makefile | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 82e8c09da..a864bf74f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -108,6 +108,8 @@ jobs: chmod 777 /tmp/kubectl curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc chmod +x /tmp/mc + export DEPLOY_KUBEFLOW=1 make -C kind setup + make -C kfp/kfp_support_lib build test make -C transforms workflow-build make -C transforms/universal/noop workflow-test diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile index 808cdbc6a..1e33cfe59 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/Makefile +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -53,10 +53,9 @@ venv::pyproject.toml .check-env pip install -e .; \ pip install pytest pytest-cov -test:: venv +test:: venv @# Help: Use the already-built virtual environment to run pytest on the test directory. . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) api_params_test.py; - . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) configmaps.py; ifeq ($(DEPLOY_KUBEFLOW),1) . ${VENV_ACTIVATE}; export PYTHONPATH=../src; cd test; $(PYTEST) kuberay_api_test.py; endif From e416148ed1c35502c639019f349410519528b85f Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 12:12:41 +0300 Subject: [PATCH 36/64] Address code review. Signed-off-by: Revital Sur --- .make.versions | 1 - kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile | 1 - .../kfp_v1_workflow_support/test/ray_remote_jobs_test.py | 2 +- kfp/kfp_support_lib/python_apiserver_client/Makefile | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.make.versions b/.make.versions index bebeb8f1a..e4b38765d 100644 --- a/.make.versions +++ b/.make.versions @@ -8,7 +8,6 @@ DPK_LIB_VERSION=0.2.0 DPK_LIB_KFP_VERSION=0.2.0 DPK_LIB_KFP_VERSION_v2=0.2.0 -DPK_LIB_KUBERAY_CLIENT=0.1.0 # Begin transform versions/tags BLOCKLIST_VERSION=0.4.0 diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 51cbb7396..759e1f8a3 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -28,7 +28,6 @@ update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml - sed -i.back 's/python_apiserver_client==[0-9].*/python_apiserver_client==${DPK_LIB_KUBERAY_CLIENT}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v1}",/' pyproject.toml sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py index 8aec06bdd..ab25573b0 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/test/ray_remote_jobs_test.py @@ -14,7 +14,7 @@ from python_apiserver_client.params import ConfigMapVolume from workflow_support.runtime_utils import RayRemoteJobs -server_url = "http://localhost:8080/ray" +server_url = 
"http://localhost:8080/ray/" def test_ray_remote_jobs(): """ diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile index 1e33cfe59..5f4f24897 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/Makefile +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -26,7 +26,6 @@ clean:: update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml - sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KUBERAY_CLIENT}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml build:: update-toml venv From cc3decac0444fb4c7c68c86bdb45ebd75d39a90a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 12:58:51 +0300 Subject: [PATCH 37/64] Minor fixes. Signed-off-by: Revital Sur --- kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile | 6 +++++- kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 759e1f8a3..ea2b74e7d 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -50,6 +50,9 @@ publish:: .check-env ${PYTHON} -m twine upload --verbose --non-interactive dist/* venv:: pyproject.toml .check-env +ifeq ($(KFPv2), 1) + echo "Skipping test as KFPv2 is defined" +else @# Help: Create the virtual environment using pyproject.toml rm -rf venv $(PYTHON) -m venv venv @@ -58,10 +61,11 @@ venv:: pyproject.toml .check-env pip install -e ../python_apiserver_client; \ pip install -e .; \ pip install pytest pytest-cov +endif test:: venv ifeq ($(KFPv2), 1) - echo "Skipping test as KFPv2 is defined" + echo "Skipping test as KFPv2 is defined" else @# Help: Use the already-built virtual environment to run pytest on the test directory. ifeq ($(DEPLOY_KUBEFLOW),1) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile index b8565bb0c..b3d7ba60d 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -50,6 +50,9 @@ publish:: .check-env ${PYTHON} -m twine upload --verbose --non-interactive dist/* venv:: pyproject.toml .check-env +ifeq ($(KFPv2), 0) + echo "Skipping venv as KFPv2 is not defined" +else @# Help: Create the virtual environment using pyproject.toml rm -rf venv $(PYTHON) -m venv venv @@ -58,6 +61,7 @@ venv:: pyproject.toml .check-env pip install -e ../python_apiserver_client; \ pip install -e .; \ pip install pytest pytest-cov +endif test:: venv ifeq ($(KFPv2), 0) From 8ae25d3e5fa67b0766421feb38319c9628819900 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 14:36:34 +0300 Subject: [PATCH 38/64] Additional fixes. 
Signed-off-by: Revital Sur --- .github/workflows/test.yml | 8 ++++---- kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile | 6 ++++-- kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile | 6 ++++-- kfp/kfp_support_lib/python_apiserver_client/Makefile | 5 +++-- transforms/universal/noop/kfp_ray/Makefile | 6 ++++++ transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- 6 files changed, 22 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a864bf74f..9324ead7e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,7 +62,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - - name: Test KFP lib + - name: Test KFP v1 lib run: | source kind/requirements.env export PATH=$PATH:/tmp/ @@ -93,7 +93,7 @@ jobs: sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true df -h - - name: Test KFP worflow run + - name: Test KFP v1 worflow run timeout-minutes: 120 run: | source kind/requirements.env @@ -110,6 +110,6 @@ jobs: chmod +x /tmp/mc export DEPLOY_KUBEFLOW=1 make -C kind setup - make -C kfp/kfp_support_lib build test - make -C transforms workflow-build + make -C kfp/kfp_support_lib test + make -C transforms/universal/noop/ workflow-build make -C transforms/universal/noop workflow-test diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index ea2b74e7d..34ab9d34f 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -24,14 +24,14 @@ clean:: .check-env:: .check_python_version @echo "Checks passed" -update-toml:: .check-env +set-versions:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v1}",/' pyproject.toml sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml -build:: update-toml venv +build:: set-versions venv ifeq ($(KFPv2), 1) echo "Skipping build as KFPv2 is defined" else @@ -57,7 +57,9 @@ else rm -rf venv $(PYTHON) -m venv venv . 
${VENV_ACTIVATE}; \ + cd ../../../data-processing-lib/python && make set-versions && cd -; \ pip install -e ../../../data-processing-lib/python; \ + cd ../python_apiserver_client && make set-versions && cd -; \ pip install -e ../python_apiserver_client; \ pip install -e .; \ pip install pytest pytest-cov diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile index b3d7ba60d..c7d302707 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -24,14 +24,14 @@ clean:: .check-env:: .check_python_version @echo "Checks passed" -update-toml:: .check-env +set-versions:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION_v2}'"/' pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v2}",/' pyproject.toml sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml -build:: update-toml venv +build:: set-versions venv ifeq ($(KFPv2), 0) echo "Skipping build as KFPv2 is not defined" else @@ -57,7 +57,9 @@ else rm -rf venv $(PYTHON) -m venv venv . ${VENV_ACTIVATE}; \ + cd ../../../data-processing-lib/python && make set-versions && cd -; \ pip install -e ../../../data-processing-lib/python; \ + cd ../python_apiserver_client && make set-versions && cd -; \ pip install -e ../python_apiserver_client; \ pip install -e .; \ pip install pytest pytest-cov diff --git a/kfp/kfp_support_lib/python_apiserver_client/Makefile b/kfp/kfp_support_lib/python_apiserver_client/Makefile index 5f4f24897..642d475d1 100644 --- a/kfp/kfp_support_lib/python_apiserver_client/Makefile +++ b/kfp/kfp_support_lib/python_apiserver_client/Makefile @@ -24,11 +24,11 @@ clean:: .check-env:: .check_python_version @echo "Checks passed" -update-toml:: .check-env +set-versions:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml -build:: update-toml venv +build:: set-versions venv @# Help: Build the distribution for publishing to a pypi rm -r dist || true rm -rf src/*egg-info || true @@ -48,6 +48,7 @@ venv::pyproject.toml .check-env $(PYTHON) -m venv venv . 
${VENV_ACTIVATE}; \ pip install --upgrade pip; \ + cd ../../../data-processing-lib/python && make set-versions && cd -; \ pip install -e ../../../data-processing-lib/python; \ pip install -e .; \ pip install pytest pytest-cov diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile index fa67cdb0a..e1774b7e8 100644 --- a/transforms/universal/noop/kfp_ray/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -9,6 +9,12 @@ YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} +venv:: + +build:: + +test:: + .PHONY: workflow-build workflow-build: workflow-venv $(MAKE) $(YAML_WF) diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index f7ff9e4a3..e3473f425 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -26,7 +26,7 @@ RUNTIME_JOB_ID = "runtime_job_id" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" From 1ea0cf5746df5b1442ecc9c66d98cc1ca101d5b4 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 14:52:50 +0300 Subject: [PATCH 39/64] Fix transform test. Signed-off-by: Revital Sur --- transforms/universal/noop/kfp_ray/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile index e1774b7e8..cb052d1ab 100644 --- a/transforms/universal/noop/kfp_ray/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -15,6 +15,8 @@ build:: test:: +test-src:: + .PHONY: workflow-build workflow-build: workflow-venv $(MAKE) $(YAML_WF) From 40023681efdc113f938b282bd8a5302bb691e8ac Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 21:35:23 +0300 Subject: [PATCH 40/64] noop --- transforms/universal/noop/kfp_ray/noop_wf.py | 119 +++++++++---------- 1 file changed, 57 insertions(+), 62 deletions(-) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index a1c51a2df..9aa0c1856 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -14,17 +14,14 @@ from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler -import kfp.components as comp import kfp.dsl as dsl - +import kfp.components as comp task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" -RUNTIME_JOB_ID = "runtime_job_id" - # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" @@ -34,17 +31,16 @@ # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
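The in-place function below delegates the actual sizing to KFPUtils.default_compute_execution_params, whose implementation is not shown in this patch. As a rough sketch of the arithmetic such a helper performs (the function name and the 0.85 headroom factor are assumptions for illustration, not the library code), it amounts to dividing the usable CPUs of the worker pool by the CPUs requested per actor:

import json

def estimate_num_workers(worker_options: str, actor_options: str) -> int:
    # the option strings use single quotes in the pipeline defaults, e.g.
    #   worker_options = "{'replicas': 2, 'max_replicas': 2, 'min_replicas': 2, 'cpu': 2, 'memory': 4}"
    #   actor_options  = "{'num_cpus': 0.8}"
    workers = json.loads(worker_options.replace("'", '"'))
    actor = json.loads(actor_options.replace("'", '"'))
    usable_cpu = workers["replicas"] * workers["cpu"] * 0.85  # keep headroom for Ray system processes
    return max(1, int(usable_cpu / actor["num_cpus"]))

print(estimate_num_workers(
    "{'replicas': 2, 'max_replicas': 2, 'min_replicas': 2, 'cpu': 2, 'memory': 4}",
    "{'num_cpus': 0.8}",
))  # -> 4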
def compute_exec_params_func( - worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_code_location: str, - noop_sleep_sec: int, + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, ) -> dict: - import uuid - from workflow_support.runtime_utils import KFPUtils return { @@ -54,22 +50,23 @@ def compute_exec_params_func( "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), "runtime_worker_options": actor_options, "runtime_pipeline_id": runtime_pipeline_id, - RUNTIME_JOB_ID: uuid.uuid4().hex, + "runtime_job_id": runtime_job_id, "runtime_code_location": runtime_code_location, "noop_sleep_sec": noop_sleep_sec, } -# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the -# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. -# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use -# this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - compute_exec_params_op = dsl.component_decorator.component( - func=compute_exec_params_func, base_image=base_kfp_image - ) + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
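A practical consequence of this workaround, worth keeping in mind when reading the v2 branch: the uuid is generated while the pipeline is being compiled, so every run launched from the same compiled YAML reuses that run_id until the file is recompiled, whereas the v1 branch resolves dsl.RUN_ID_PLACEHOLDER to a fresh value per run. Which branch is taken is decided by the KFPv2 environment variable at compile time; a minimal check, mirroring the condition used by these files (illustrative only):

import os

# The *_wf.py files in this series branch on the same flag when they are compiled:
#   KFPv2=1 python noop_wf.py   -> v2 path, run_id = uuid4().hex baked into the compiled YAML
#   python noop_wf.py           -> v1 path, run_id left as a per-run placeholder
compiling_for_v2 = os.getenv("KFPv2", "0") == "1"
print("compiling for KFP v2" if compiling_for_v2 else "compiling for KFP v1")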
+ import uuid + compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -86,25 +83,25 @@ def compute_exec_params_func( description="Pipeline for noop", ) def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' '"image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', + # Ray cluster + ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # noop parameters + noop_sleep_sec: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', ): """ Pipeline to execute NOOP transform @@ -140,32 +137,28 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ - - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # create clean_up task - clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=compute_exec_params.output[RUNTIME_JOB_ID], server_url=server_url - ) - + 
clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=compute_exec_params.output[RUNTIME_JOB_ID], + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -176,7 +169,7 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=compute_exec_params.output[RUNTIME_JOB_ID], + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params=compute_exec_params.output, @@ -189,7 +182,9 @@ def noop( # TODO # Configure the pipeline level to one week (in seconds) - # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": From 6f1b417dd8982358d204be94c4dee71df83667b0 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 21:41:47 +0300 Subject: [PATCH 41/64] Fix noop --- .../noop/kfp_ray/noop_multiple_wf.py | 49 ++++++------ transforms/universal/noop/kfp_ray/noop_wf.py | 74 ++++++++++--------- 2 files changed, 66 insertions(+), 57 deletions(-) diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index e3473f425..71098c3f3 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,8 +23,6 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" -RUNTIME_JOB_ID = "runtime_job_id" - # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" @@ -40,6 +38,7 @@ def compute_exec_params_func( data_max_files: int, data_num_samples: int, runtime_pipeline_id: str, + runtime_job_id: str, runtime_code_location: str, noop_sleep_sec: int, ) -> dict: @@ -54,7 +53,7 @@ def compute_exec_params_func( "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), "runtime_worker_options": actor_options, "runtime_pipeline_id": runtime_pipeline_id, - RUNTIME_JOB_ID: uuid.uuid4().hex, + "runtime_job_id": runtime_job_id, "runtime_code_location": runtime_code_location, "noop_sleep_sec": noop_sleep_sec, } @@ -65,11 +64,19 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -141,32 +148,28 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ - - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # create clean_up task - clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=compute_exec_params.output[RUNTIME_JOB_ID], server_url=server_url - ) - + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=compute_exec_params.output[RUNTIME_JOB_ID], + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -177,7 +180,7 @@ def noop( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=compute_exec_params.output[RUNTIME_JOB_ID], + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params=compute_exec_params.output, diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 9aa0c1856..91da68620 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -14,8 +14,9 @@ from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler -import kfp.dsl as dsl import kfp.components as comp +import kfp.dsl as dsl + task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" @@ -23,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v5" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" @@ -31,15 +32,15 @@ # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
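Because the whole parameter set is now published as a single dict (and consumed later via exec_params=compute_exec_params.output), it may help to see the concrete shape that dict takes for noop. The literal below uses the pipeline defaults from this file; runtime_num_workers and runtime_job_id stand in for values that are only known at compile or run time (illustrative, not produced by running this patch):

# Approximate output of compute_exec_params_func for the noop defaults:
example_exec_params = {
    "data_s3_config": "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}",
    "data_max_files": -1,
    "data_num_samples": -1,
    "runtime_num_workers": 4,                # derived from the worker/actor CPU options
    "runtime_worker_options": "{'num_cpus': 0.8}",
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "<run id>",            # compile-time uuid (v2) or run placeholder (v1)
    "runtime_code_location": "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}",
    "noop_sleep_sec": 10,
}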
def compute_exec_params_func( - worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: str, - noop_sleep_sec: int, + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, ) -> dict: from workflow_support.runtime_utils import KFPUtils @@ -56,13 +57,20 @@ def compute_exec_params_func( } +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at # compilation time. import uuid - compute_exec_params_op = dsl.component_decorator.component(func=compute_exec_params_func, base_image=base_kfp_image) + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) @@ -83,25 +91,25 @@ def compute_exec_params_func( description="Pipeline for noop", ) def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # noop parameters - noop_sleep_sec: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', + # Ray cluster + ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/noop/input/', 'output_folder': 'test/noop/output/'}", + data_s3_access_secret: 
str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # noop parameters + noop_sleep_sec: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', ): """ Pipeline to execute NOOP transform @@ -182,9 +190,7 @@ def noop( # TODO # Configure the pipeline level to one week (in seconds) - - -# dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": From 3bdf6a7ae8ac63f855adfa16c8ab08625719ad5a Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 22:06:19 +0300 Subject: [PATCH 42/64] image name fixes, dock_id --- transforms/universal/doc_id/kfp_ray/Makefile | 42 +++++ .../doc_id/kfp_ray/{v1 => }/doc_id_wf.py | 98 +++++++---- .../universal/doc_id/kfp_ray/v1/Makefile | 25 --- .../universal/doc_id/kfp_ray/v2/Makefile | 25 --- .../universal/doc_id/kfp_ray/v2/doc_id_wf.py | 166 ------------------ transforms/universal/noop/kfp_ray/Makefile | 2 + transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- 7 files changed, 110 insertions(+), 250 deletions(-) create mode 100644 transforms/universal/doc_id/kfp_ray/Makefile rename transforms/universal/doc_id/kfp_ray/{v1 => }/doc_id_wf.py (69%) delete mode 100644 transforms/universal/doc_id/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/doc_id/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile new file mode 100644 index 000000000..bb58a8a63 --- /dev/null +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -0,0 +1,42 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + @for file in $(YAML_WF); do \ + $(MAKE) $$file; \ + done + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=doc_id_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py similarity index 69% rename from transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py rename to transforms/universal/doc_id/kfp_ray/doc_id_wf.py index ab8a65b3c..9ea94fd97 100644 --- a/transforms/universal/doc_id/kfp_ray/v1/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -9,40 +9,79 @@ # See the License for the specific language governing permissions and # 
limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.4.0" # the name of the job script -EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" +EXEC_SCRIPT_NAME: str = "doc_id_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + noop_sleep_sec: int, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "noop_sleep_sec": noop_sleep_sec, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
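For orientation while reading the new doc_id_wf.py: the three doc_id_* parameters threaded through this pipeline name the input column to read and the two columns the transform adds, a content hash and a unique integer id. A toy illustration of that annotation, independent of Ray and KFP (the column names match the doc_id pipeline defaults of contents, hash_column and int_id_column, while the sha256 hashing is only an assumption for illustration, not necessarily the transform's exact scheme):

import hashlib

rows = [{"contents": "first document"}, {"contents": "second document"}]
for int_id, row in enumerate(rows):
    row["hash_column"] = hashlib.sha256(row["contents"].encode("utf-8")).hexdigest()
    row["int_id_column"] = int_id
print(rows)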
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") - # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. TASK_NAME: str = "doc_id" @@ -114,7 +153,7 @@ def doc_id( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -122,12 +161,19 @@ def doc_id( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -138,25 +184,10 @@ def doc_id( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "data_checkpointing": data_checkpointing, - "data_data_sets": data_data_sets, - "data_files_to_use": data_files_to_use, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "doc_id_doc_column": doc_id_doc_column, - "doc_id_hash_column": doc_id_hash_column, - "doc_id_int_column": doc_id_int_column, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -164,8 +195,9 @@ def doc_id( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/doc_id/kfp_ray/v1/Makefile b/transforms/universal/doc_id/kfp_ray/v1/Makefile deleted file mode 100644 index e33049af4..000000000 --- a/transforms/universal/doc_id/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ 
-WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=doc_id_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=doc_id_wf.py diff --git a/transforms/universal/doc_id/kfp_ray/v2/Makefile b/transforms/universal/doc_id/kfp_ray/v2/Makefile deleted file mode 100644 index 3cf6c4084..000000000 --- a/transforms/universal/doc_id/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=proglang_select_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=proglang_select_wf.py diff --git a/transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py deleted file mode 100644 index f99a6aaee..000000000 --- a/transforms/universal/doc_id/kfp_ray/v2/doc_id_wf.py +++ /dev/null @@ -1,166 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - - -task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. 
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "doc_id" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for doc_id", -) -def doc_id( - # Ray cluster - ray_name: str = "doc_id-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/doc_id/input/', 'output_folder': 'test/doc_id/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # doc id parameters - doc_id_doc_column: str = "contents", - doc_id_hash_column: str = "hash_column", - doc_id_int_column: str = "int_id_column", - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param doc_id_doc_column - document column - :param doc_id_hash_column - hash id column - :param doc_id_int_column - integer id column - :return: None - """ - # create clean_up task - clean_up_task = 
cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "doc_id_doc_column": doc_id_doc_column, - "doc_id_hash_column": doc_id_hash_column, - "doc_id_int_column": doc_id_int_column, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(doc_id, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile index cb052d1ab..2097300a7 100644 --- a/transforms/universal/noop/kfp_ray/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -17,6 +17,8 @@ test:: test-src:: +test-image:: + .PHONY: workflow-build workflow-build: workflow-venv $(MAKE) $(YAML_WF) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 91da68620..b01831386 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0-v2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" From 2edfd67557307927ffda77c7b9570db9c6968ad1 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 22:40:49 +0300 Subject: [PATCH 43/64] fix tests --- .../workflow_support/pipeline_utils/pipelines_tests_utils.py | 2 +- .../kfp_v2_workflow_support/test/pipeline_utils_test.py | 2 +- .../kfp_v2_workflow_support/test/ray_remote_jobs_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py 
b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 5fd43ca6b..183331a2b 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -3,7 +3,7 @@ from data_processing.utils import get_logger, str2bool -from workflow_support.utils import PipelinesUtils +from workflow_support.pipeline_utils import PipelinesUtils logger = get_logger(__name__) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py index 77cca5635..200bf1676 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/pipeline_utils_test.py @@ -10,7 +10,7 @@ # limitations under the License. ################################################################################ -from workflow_support.utils import PipelinesUtils +from workflow_support.pipeline_utils import PipelinesUtils server_url = "http://localhost:8080/" diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py index 7b9ad2c13..f409550e9 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/test/ray_remote_jobs_test.py @@ -12,7 +12,7 @@ from configmaps import ConfigmapsManager from python_apiserver_client.params import ConfigMapVolume -from workflow_support.utils import RayRemoteJobs +from workflow_support.runtime_utils import RayRemoteJobs server_url = "http:localhost:8080/ray/" From 09ea1ccb47b5ef66dcddcf0a99dd05c0587cef20 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 23:12:22 +0300 Subject: [PATCH 44/64] fix doc_id and tests --- transforms/.make.workflows | 4 ++-- .../universal/doc_id/kfp_ray/doc_id_wf.py | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 7860dddc9..98cacb359 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -50,7 +50,7 @@ ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) load-image endif - . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipelines_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${REPOROOT}/kfp/kfp_support_lib/ rm -rf ${REPOROOT}/transforms/venv @@ -66,5 +66,5 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen @if [ -z ${CLUSTER_EXISTS} ]; then \ cd ${REPOROOT} && make setup; \ fi - . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} + . 
${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipelines_utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 9ea94fd97..4ce041657 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -37,10 +37,15 @@ def compute_exec_params_func( data_s3_config: str, data_max_files: int, data_num_samples: int, + data_checkpointing: bool, + data_data_sets: str, + data_files_to_use: str, runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: str, - noop_sleep_sec: int, + doc_id_doc_column: str, + doc_id_hash_column: str, + doc_id_int_column: str, ) -> dict: from workflow_support.runtime_utils import KFPUtils @@ -48,12 +53,17 @@ def compute_exec_params_func( "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, + "data_checkpointing": data_checkpointing, + "data_data_sets": data_data_sets, + "data_files_to_use": data_files_to_use, "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), "runtime_worker_options": actor_options, "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": runtime_code_location, - "noop_sleep_sec": noop_sleep_sec, + "doc_id_doc_column": doc_id_doc_column, + "doc_id_hash_column": doc_id_hash_column, + "doc_id_int_column": doc_id_int_column, } @@ -164,10 +174,15 @@ def doc_id( data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, + data_data_sets=data_data_sets, + data_files_to_use=data_files_to_use, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - noop_sleep_sec=noop_sleep_sec, + doc_id_doc_column=doc_id_doc_column, + doc_id_hash_column=doc_id_hash_column, + doc_id_int_column=doc_id_int_column, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster From 5e1504000e4030eaf4513b6a9fe0fa12f314d2b9 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Wed, 5 Jun 2024 23:41:32 +0300 Subject: [PATCH 45/64] fix .make.workflows --- transforms/.make.workflows | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 98cacb359..62e6fc457 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -50,7 +50,7 @@ ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) load-image endif - . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipelines_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${REPOROOT}/kfp/kfp_support_lib/ rm -rf ${REPOROOT}/transforms/venv @@ -66,5 +66,5 @@ ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requiremen @if [ -z ${CLUSTER_EXISTS} ]; then \ cd ${REPOROOT} && make setup; \ fi - . 
${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipelines_utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} + . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "upload" -p ${CURDIR}/${PIPELINE_FILE} From 33a3d977d8fe955d1f4011d95906e74ede58e436 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 09:25:17 +0300 Subject: [PATCH 46/64] fix ededup --- .../universal/ededup/kfp_ray/v1/Makefile | 25 --- .../universal/ededup/kfp_ray/v1/ededup_wf.py | 169 ------------------ .../v1/src/ededup_compute_execution_params.py | 102 ----------- .../universal/ededup/kfp_ray/v2/Makefile | 25 --- .../universal/ededup/kfp_ray/v2/ededup_wf.py | 165 ----------------- .../v2/src/ededup_compute_execution_params.py | 98 ---------- transforms/universal/noop/kfp_ray/noop_wf.py | 1 + 7 files changed, 1 insertion(+), 584 deletions(-) delete mode 100644 transforms/universal/ededup/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/ededup/kfp_ray/v1/ededup_wf.py delete mode 100644 transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py delete mode 100644 transforms/universal/ededup/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/ededup/kfp_ray/v2/ededup_wf.py delete mode 100644 transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py diff --git a/transforms/universal/ededup/kfp_ray/v1/Makefile b/transforms/universal/ededup/kfp_ray/v1/Makefile deleted file mode 100644 index 66331ebfb..000000000 --- a/transforms/universal/ededup/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=ededup_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=ededup_wf.py diff --git a/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py b/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py deleted file mode 100644 index bf5c18f36..000000000 --- a/transforms/universal/ededup/kfp_ray/v1/ededup_wf.py +++ /dev/null @@ -1,169 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from src.ededup_compute_execution_params import ededup_compute_execution_params - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" - -task_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.4.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - -# compute execution parameters -compute_exec_params_op = comp.func_to_container_op(func=ededup_compute_execution_params, base_image=base_kfp_image) -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") - -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "ededup" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for ededup", -) -def ededup( - # Ray cluster - ray_name: str = "ededup-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access. 
checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/ededup/input/', 'output_folder': 'test/ededup/output'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # ededup - ededup_hash_cpu: float = 0.5, - ededup_doc_column: str = "contents", - # data sampling - ededup_n_samples: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute EDEDUP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param ededup_hash_cpu - number of CPUs per hash - :param ededup_doc_column - key for accessing data - :param ededup_n_samples - number of samples for parameters computation - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - params={"s3_config": data_s3_config, "hash_cpu": ededup_hash_cpu}, - n_samples=ededup_n_samples, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) - - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - 
additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.outputs["workers"], - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "ededup_doc_column": ededup_doc_column, - "ededup_hash_cpu": ededup_hash_cpu, - "ededup_num_hashes": compute_exec_params.outputs["hashes"], - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(ededup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py deleted file mode 100644 index 8d2fc6180..000000000 --- a/transforms/universal/ededup/kfp_ray/v1/src/ededup_compute_execution_params.py +++ /dev/null @@ -1,102 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -from typing import Any, NamedTuple - - -def ededup_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # actor's resource requirements - params: dict[str, Any], # exact dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> NamedTuple("Output", [("workers", int), ("hashes", int)]): - """ - Compute exact dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements - :param n_samples: number of samples to use - :param params: exact dedup specific parameters containing the following keys: - s3_config - s3 config - hash_cpu - hash cpu requirements - :return: json string, containing computed number of workers and hashes - """ - # required import - import math - import sys - - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from kfp_support.workflow_support.runtime_utils import KFPUtils - - EXECUTION_OF_KB_DOC = 0.00025 - - # Get cluster parameters - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_memory = w_options["replicas"] * w_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 - cluster_memory *= 0.85 - # get actor requirements - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - actor_cpu = a_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. 
Please verify the input path.") - sys.exit(1) - avg_table_size = sampling.get("average table size MB") / KB - # compute number of hashes - n_hashes = math.ceil(number_of_docs * 32 / GB) - print(f"Estimated Required hashes {n_hashes}") - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - hash_cpu: float = float(params.get("hash_cpu")) - required_hash_cpu = n_hashes * hash_cpu - required_hash_mem = n_hashes * 2 - if required_hash_cpu > cluster_cpu or required_hash_mem > cluster_memory: - print( - f"Cluster is too small - hashes required cpus {required_hash_cpu}; " - f"hashes required memory {required_hash_mem}" - ) - sys.exit(1) - # Define number of workers - n_workers = int((0.85 * cluster_cpu - required_hash_cpu) / actor_cpu) - print(f"Number of workers - {n_workers}") - if n_workers < 2: - print(f"Cluster is too small - estimated number of workers {n_workers}") - sys.exit(1) - # Limit amount of workers and processors to prevent S3 saturation - if n_workers > 1000: - n_workers = 1000 - # validate that we have enough memory - r_mem = required_hash_mem * 2 + avg_table_size * 4 * n_workers - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker") - sys.exit(1) - print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") - # return json.dumps({"workers": n_workers, "hashes": n_hashes}) - return (n_workers, n_hashes) - # return (1, 1) diff --git a/transforms/universal/ededup/kfp_ray/v2/Makefile b/transforms/universal/ededup/kfp_ray/v2/Makefile deleted file mode 100644 index 98b0b5332..000000000 --- a/transforms/universal/ededup/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=ededup_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=ededup_wf.py diff --git a/transforms/universal/ededup/kfp_ray/v2/ededup_wf.py b/transforms/universal/ededup/kfp_ray/v2/ededup_wf.py deleted file mode 100644 index 6c8fc4e0d..000000000 --- a/transforms/universal/ededup/kfp_ray/v2/ededup_wf.py +++ /dev/null @@ -1,165 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from src.ededup_compute_execution_params import ededup_compute_execution_params - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "ededup_transform.py" - -task_image = "quay.io/dataprep1/data-prep-kit/ededup:0.3.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" - -# compute execution parameters -compute_exec_params_op = comp.func_to_container_op(func=ededup_compute_execution_params, base_image=base_kfp_image) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "ededup" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for ededup", -) -def ededup( - # Ray cluster - ray_name: str = "ededup-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access. 
checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/ededup/input/', 'output_folder': 'test/ededup/output'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # ededup - ededup_hash_cpu: float = 0.5, - ededup_doc_column: str = "contents", - # data sampling - ededup_n_samples: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute EDEDUP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param ededup_hash_cpu - number of CPUs per hash - :param ededup_doc_column - key for accessing data - :param ededup_n_samples - number of samples for parameters computation - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - params={"s3_config": data_s3_config, "hash_cpu": ededup_hash_cpu}, - n_samples=ededup_n_samples, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) - - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - 
additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.outputs["workers"], - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "ededup_doc_column": ededup_doc_column, - "ededup_hash_cpu": ededup_hash_cpu, - "ededup_num_hashes": compute_exec_params.outputs["hashes"], - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(ededup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py deleted file mode 100644 index 5304def12..000000000 --- a/transforms/universal/ededup/kfp_ray/v2/src/ededup_compute_execution_params.py +++ /dev/null @@ -1,98 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -from typing import Any, NamedTuple - - -def ededup_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # actor's resource requirements - params: dict[str, Any], # exact dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> NamedTuple("Output", [("workers", int), ("hashes", int)]): - """ - Compute exact dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements - :param n_samples: number of samples to use - :param params: exact dedup specific parameters containing the following keys: - s3_config - s3 config - hash_cpu - hash cpu requirements - :return: json string, containing computed number of workers and hashes - """ - # required import - import json - import math - import sys - - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from kfp_support.workflow_support.runtime_utils import KFPUtils - - EXECUTION_OF_KB_DOC = 0.00025 - - # Get cluster parameters - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_memory = w_options["replicas"] * w_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 - cluster_memory *= 0.85 - # get actor requirements - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - actor_cpu = a_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) - - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - # compute number of hashes - n_hashes = math.ceil(number_of_docs * 32 / GB) - print(f"Estimated Required hashes {n_hashes}") - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - hash_cpu: float = float(params.get("hash_cpu")) - required_hash_cpu = n_hashes * hash_cpu - required_hash_mem = n_hashes * 2 - if required_hash_cpu > cluster_cpu or required_hash_mem > cluster_memory: - print( - f"Cluster is too small - hashes required cpus {required_hash_cpu}; " - f"hashes required memory {required_hash_mem}" - ) - sys.exit(1) - # Define number of workers - n_workers = int((0.85 * cluster_cpu - required_hash_cpu) / actor_cpu) - print(f"Number of workers - {n_workers}") - if n_workers < 2: - print(f"Cluster is too small - estimated number of workers {n_workers}") - sys.exit(1) - # Limit amount of workers and processors to prevent S3 saturation - if n_workers > 1000: - n_workers = 1000 - # validate that we have enough memory - r_mem = required_hash_mem * 2 + avg_table_size * 4 * n_workers - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available 
{cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker") - sys.exit(1) - print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") - # return json.dumps({"workers": n_workers, "hashes": n_hashes}) - return (n_workers, n_hashes) - # return (1, 1) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index b01831386..fc296b284 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -82,6 +82,7 @@ def compute_exec_params_func( execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") + # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. TASK_NAME: str = "noop" From b31b04c76e2037aafc44d30ad58342c08b029767 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 09:53:45 +0300 Subject: [PATCH 47/64] fix ededup2 --- .../src/ededup_compute_execution_params.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py diff --git a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py new file mode 100644 index 000000000..f61d7b45e --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py @@ -0,0 +1,101 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from typing import Any + + +def ededup_compute_execution_params( + worker_options: str, # ray worker configuration + actor_options: str, # actor's resource requirements + params: dict[str, Any], # exact dedup specific parameters + n_samples: int = 10, # number of samples to use +) -> (int, int): + """ + Compute exact dedup execution parameters + :param worker_options: cluster parameters + :param actor_options: actor request requirements + :param n_samples: number of samples to use + :param params: exact dedup specific parameters containing the following keys: + s3_config - s3 config + hash_cpu - hash cpu requirements + :return: json string, containing computed number of workers and hashes + """ + # required import + import math + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB, KB + from workflow_support.runtime_utils import KFPUtils + + EXECUTION_OF_KB_DOC = 0.00025 + + # Get cluster parameters + w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) + cluster_cpu = w_options["replicas"] * w_options["cpu"] + cluster_memory = w_options["replicas"] * w_options["memory"] + print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") + cluster_cpu *= 0.85 + cluster_memory *= 0.85 + # get actor requirements + a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) + actor_cpu = a_options["num_cpus"] + print(f"actor required cpu {actor_cpu}") + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + if type(s3_config) is list: + # S3 config is list. take the first element + s3_config = s3_config[0] + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling = data_access.sample_input_data(n_samples=n_samples) + avg_doc_size = sampling.get("average doc size KB") + number_of_docs = sampling.get("estimated number of docs") + if number_of_docs == 0: + print(f"Estimated number of documents and documents size is zero. 
Please verify the input path.") + sys.exit(1) + avg_table_size = sampling.get("average table size MB") / KB + # compute number of hashes + n_hashes = math.ceil(number_of_docs * 32 / GB) + print(f"Estimated Required hashes {n_hashes}") + print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") + hash_cpu: float = float(params.get("hash_cpu")) + required_hash_cpu = n_hashes * hash_cpu + required_hash_mem = n_hashes * 2 + if required_hash_cpu > cluster_cpu or required_hash_mem > cluster_memory: + print( + f"Cluster is too small - hashes required cpus {required_hash_cpu}; " + f"hashes required memory {required_hash_mem}" + ) + sys.exit(1) + # Define number of workers + n_workers = int((0.85 * cluster_cpu - required_hash_cpu) / actor_cpu) + print(f"Number of workers - {n_workers}") + if n_workers < 2: + print(f"Cluster is too small - estimated number of workers {n_workers}") + sys.exit(1) + # Limit amount of workers and processors to prevent S3 saturation + if n_workers > 1000: + n_workers = 1000 + # validate that we have enough memory + r_mem = required_hash_mem * 2 + avg_table_size * 4 * n_workers + print(f"Required execution memory {r_mem} GB") + if r_mem > cluster_memory: + print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") + print(f"Try to increase the size of the cluster or increase size of the cpu per worker") + sys.exit(1) + print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") + # return json.dumps({"workers": n_workers, "hashes": n_hashes}) + return n_workers, n_hashes From 6e76bd828bde9e0958b36b206b3484750a387e9a Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 09:54:34 +0300 Subject: [PATCH 48/64] fix ededup3 --- transforms/universal/ededup/kfp_ray/Makefile | 40 ++++ .../universal/ededup/kfp_ray/ededup_wf.py | 214 ++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 transforms/universal/ededup/kfp_ray/Makefile create mode 100644 transforms/universal/ededup/kfp_ray/ededup_wf.py diff --git a/transforms/universal/ededup/kfp_ray/Makefile b/transforms/universal/ededup/kfp_ray/Makefile new file mode 100644 index 000000000..226dac080 --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/Makefile @@ -0,0 +1,40 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py new file mode 100644 index 000000000..ad3d979cc --- /dev/null +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -0,0 +1,214 @@ +# 
(C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl + + +task_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.4.0$(RELEASE_VERSION_SUFFIX)" + +# the name of the job script +EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" + +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" + +# compute execution parameters +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + ededup_doc_column: str, + ededup_hash_cpu: float, + ededup_n_samples: int, +) -> dict: + from src.ededup_compute_execution_params import ededup_compute_execution_params + workers, hashes = ededup_compute_execution_params(worker_options=worker_options, + actor_options=actor_options, + params={"s3_config": data_s3_config, "hash_cpu": ededup_hash_cpu}, + n_samples=ededup_n_samples) + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": workers, + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "ededup_doc_column": ededup_doc_column, + "ededup_hash_cpu": ededup_hash_cpu, + "ededup_num_hashes": hashes, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
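(Editorial aside, not part of the patch: the if/else that follows is the same dual-SDK pattern used by every *_wf.py file in this series, so a stripped-down sketch may help when reading the later, larger diffs. The placeholder function and op names are illustrative; the base image value is the one defined above, and the KFPv2 environment variable is assumed to match the SDK that is actually installed.)

import os
import uuid

import kfp.components as comp
import kfp.dsl as dsl

base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6"

def my_exec_params_func(actor_options: str) -> dict:   # stand-in for compute_exec_params_func
    return {"runtime_worker_options": actor_options}

if os.getenv("KFPv2", "0") == "1":
    # KFP SDK 2.x: decorator-based component, run id fixed at compile time
    my_op = dsl.component_decorator.component(func=my_exec_params_func, base_image=base_kfp_image)
    run_id = uuid.uuid4().hex
else:
    # KFP SDK 1.x: factory function, run id resolved by the backend per run
    my_op = comp.create_component_from_func(func=my_exec_params_func, base_image=base_kfp_image)
    run_id = dsl.RUN_ID_PLACEHOLDER

One practical consequence of the compile-time uuid is that every run of the same compiled YAML shares that id until the pipeline is recompiled.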
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + +# create Ray cluster +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") + +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. +TASK_NAME: str = "ededup" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for ededup", +) +def ededup( + # Ray cluster + ray_name: str = "ededup-kfp-ray", # name of Ray cluster + ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', + ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' + '"image_pull_secret": "", "image": "' + task_image + '"}', + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access. checkpointing is not supported by dedup + data_s3_config: str = "{'input_folder': 'test/ededup/input/', 'output_folder': 'test/ededup/output'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + # orchestrator + runtime_actor_options: str = "{'num_cpus': 0.8}", + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", + # ededup + ededup_hash_cpu: float = 0.5, + ededup_doc_column: str = "contents", + # data sampling + ededup_n_samples: int = 10, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', +): + """ + Pipeline to execute EDEDUP transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code 
location + :param ededup_hash_cpu - number of CPUs per hash + :param ededup_doc_column - key for accessing data + :param ededup_n_samples - number of samples for parameters computation + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) + ComponentUtils.add_settings_to_component(clean_up_task, 60) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + ededup_doc_column=ededup_doc_column, + ededup_hash_cpu=ededup_hash_cpu, + ededup_n_samples=ededup_n_samples, + ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=run_id, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + # note that the parameters below are specific for NOOP transform + exec_params=compute_exec_params.output, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + # TODO + # Configure the pipeline level to one week (in seconds) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(ededup, __file__.replace(".py", ".yaml")) From 3a07e682a6305a29d18884ab3ccafb9c9be74a84 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 21:38:15 +0300 Subject: [PATCH 49/64] Adjuest code workflows. 
Signed-off-by: Revital Sur --- .../executeRayJobComponent_multi_s3.yaml | 2 +- transforms/code/code_quality/Makefile | 10 +- transforms/code/code_quality/kfp_ray/Makefile | 44 +++++ .../kfp_ray/{v1 => }/code_quality_wf.py | 99 +++++++--- .../code/code_quality/kfp_ray/v1/Makefile | 25 --- .../code/code_quality/kfp_ray/v2/Makefile | 25 --- .../kfp_ray/v2/code_quality_wf.py | 174 ------------------ transforms/code/malware/Makefile | 10 +- transforms/code/malware/kfp_ray/Makefile | 44 +++++ .../malware/kfp_ray/{v1 => }/malware_wf.py | 92 ++++++--- transforms/code/malware/kfp_ray/v1/Makefile | 25 --- transforms/code/malware/kfp_ray/v2/Makefile | 25 --- .../code/malware/kfp_ray/v2/malware_wf.py | 169 ----------------- transforms/code/proglang_select/Makefile | 11 +- .../code/proglang_select/kfp_ray/Makefile | 44 +++++ .../kfp_ray/{v1 => }/proglang_select_wf.py | 90 ++++++--- .../code/proglang_select/kfp_ray/v1/Makefile | 25 --- .../code/proglang_select/kfp_ray/v2/Makefile | 25 --- .../kfp_ray/v2/proglang_select_wf.py | 165 ----------------- 19 files changed, 354 insertions(+), 750 deletions(-) create mode 100644 transforms/code/code_quality/kfp_ray/Makefile rename transforms/code/code_quality/kfp_ray/{v1 => }/code_quality_wf.py (67%) delete mode 100644 transforms/code/code_quality/kfp_ray/v1/Makefile delete mode 100644 transforms/code/code_quality/kfp_ray/v2/Makefile delete mode 100644 transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py create mode 100644 transforms/code/malware/kfp_ray/Makefile rename transforms/code/malware/kfp_ray/{v1 => }/malware_wf.py (67%) delete mode 100644 transforms/code/malware/kfp_ray/v1/Makefile delete mode 100644 transforms/code/malware/kfp_ray/v2/Makefile delete mode 100644 transforms/code/malware/kfp_ray/v2/malware_wf.py create mode 100644 transforms/code/proglang_select/kfp_ray/Makefile rename transforms/code/proglang_select/kfp_ray/{v1 => }/proglang_select_wf.py (69%) delete mode 100644 transforms/code/proglang_select/kfp_ray/v1/Makefile delete mode 100644 transforms/code/proglang_select/kfp_ray/v2/Makefile delete mode 100644 transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 9f17afed4..fe0700b33 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -8,7 +8,7 @@ inputs: - { name: server_url, type: String, default: "", description: "url of api server" } - { name: prefix, type: String, default: "", description: "prefix for extra credentials" } # The component converts the dictionary to json string - - { name: exec_params, type: dict, description: "job parameters" } + - { name: exec_params, type: JsonObject, description: "job parameters" } - { name: additional_params, type: String, description: "additional parameters" } implementation: diff --git a/transforms/code/code_quality/Makefile b/transforms/code/code_quality/Makefile index c13740cc9..5cc85aab2 100644 --- a/transforms/code/code_quality/Makefile +++ b/transforms/code/code_quality/Makefile @@ -47,21 +47,21 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray 
workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/code/code_quality/kfp_ray/Makefile b/transforms/code/code_quality/kfp_ray/Makefile new file mode 100644 index 000000000..d93c668c1 --- /dev/null +++ b/transforms/code/code_quality/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=code_quality_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py similarity index 67% rename from transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py rename to transforms/code/code_quality/kfp_ray/code_quality_wf.py index 23d7f45dd..21cfa1380 100644 --- a/transforms/code/code_quality/kfp_ray/v1/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -9,35 +9,80 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ - -# NOTE: This file is auto generated by Pipeline Generator. +import os import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils # the name of the job script EXEC_SCRIPT_NAME: str = "code_quality_transform_ray.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.4.0$(RELEASE_VERSION_SUFFIX)" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" -component_spec_path = "../../../../../kfp/kfp_ray_components/" +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
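(Editorial aside, not part of the patch: the hunk that follows replaces the shared default parameter component with a per-transform function defined in place next to the pipeline, whose single dict output is later wired into the job-execution op. A minimal, self-contained sketch of that wiring under the KFP v1 SDK -- all names and the base image here are illustrative, not taken from the patch -- looks like this.)

import kfp.compiler as compiler
import kfp.components as comp
import kfp.dsl as dsl

def build_params(actor_options: str) -> dict:
    # per-transform parameter logic lives next to the pipeline that uses it
    return {"runtime_worker_options": actor_options, "runtime_num_workers": 2}

def run_job(exec_params: dict):
    # stand-in for the execute-Ray-job component; just echoes the parameters
    print(exec_params)

build_params_op = comp.create_component_from_func(func=build_params, base_image="python:3.10")
run_job_op = comp.create_component_from_func(func=run_job, base_image="python:3.10")

@dsl.pipeline(name="params-in-place-demo")
def demo(actor_options: str = "{'num_cpus': 0.8}"):
    params = build_params_op(actor_options=actor_options)
    run_job_op(exec_params=params.output)   # the dict travels as a single JsonObject

if __name__ == "__main__":
    compiler.Compiler().compile(demo, "demo.yaml")

Keeping the parameter builder beside each pipeline is what lets code_quality, malware and the other transforms shape their own exec_params dict without touching the shared components.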
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + cq_contents_column_name: str, + cq_language_column_name: str, + cq_tokenizer: str, + cq_hf_token: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "cq_contents_column_name": cq_contents_column_name, + "cq_language_column_name": cq_language_column_name, + "cq_tokenizer": cq_tokenizer, + "cq_hf_token": cq_hf_token, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -120,7 +165,7 @@ def code_quality( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -128,7 +173,18 @@ def code_quality( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + cq_contents_column_name=cq_contents_column_name, + cq_language_column_name=cq_language_column_name, + cq_tokenizer=cq_tokenizer, + cq_hf_token=cq_hf_token, ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( @@ -147,19 +203,8 @@ def code_quality( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "cq_contents_column_name": cq_contents_column_name, - "cq_language_column_name": cq_language_column_name, - "cq_tokenizer": cq_tokenizer, - "cq_hf_token": cq_hf_token, - }, + # note that the parameters below are specific for NOOP transform + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/code/code_quality/kfp_ray/v1/Makefile b/transforms/code/code_quality/kfp_ray/v1/Makefile deleted file mode 100644 index ae484ed12..000000000 --- a/transforms/code/code_quality/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=code_quality_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=code_quality_wf.py diff --git a/transforms/code/code_quality/kfp_ray/v2/Makefile b/transforms/code/code_quality/kfp_ray/v2/Makefile deleted file mode 100644 index bd34d6f3c..000000000 --- 
a/transforms/code/code_quality/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=code_quality_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=code_quality_wf.py \ No newline at end of file diff --git a/transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py deleted file mode 100644 index 9de9a9e55..000000000 --- a/transforms/code/code_quality/kfp_ray/v2/code_quality_wf.py +++ /dev/null @@ -1,174 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -# NOTE: This file is auto generated by Pipeline Generator. - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from kubernetes import client as k8s_client - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "code_quality_transform.py" -PREFIX: str = "" - -task_image = "quay.io/dataprep1/data-prep-kit/code_quality:0.3.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1" -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, insted of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "code_quality" - - -# Pipeline to invoke execution on remote resource -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for code quality task", -) -def code_quality( - # Ray cluster - ray_name: str = "code_quality-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "",\ - "image": "' - + task_image - + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "",\ - "image": "' - + task_image - + '" }', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/code_quality/input/', 'output_folder': 'test/code_quality/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "runtime_pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # code quality parameters - cq_contents_column_name: str = "contents", - cq_language_column_name: str = "language", - cq_tokenizer: str = "codeparrot/codeparrot", - cq_hf_token: str = "None", - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute Code Quality transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param cq_contents_column_name - Name of the column holds the data to process - :param cq_language_column_name - Name of the column holds the programming language details - :param cq_tokenizer - Name or path to the tokenizer - :param cq_hf_token - Huggingface auth token to download and use the tokenizer - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - 
worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "cq_contents_column_name": cq_contents_column_name, - "cq_language_column_name": cq_language_column_name, - "cq_tokenizer": cq_tokenizer, - "cq_hf_token": cq_hf_token, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(code_quality, __file__.replace(".py", ".yaml")) diff --git a/transforms/code/malware/Makefile b/transforms/code/malware/Makefile index 08f3837b7..41413c041 100644 --- a/transforms/code/malware/Makefile +++ b/transforms/code/malware/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/code/malware/kfp_ray/Makefile b/transforms/code/malware/kfp_ray/Makefile new file mode 100644 index 000000000..ce744072d --- /dev/null +++ b/transforms/code/malware/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=malware_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) 
.transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/code/malware/kfp_ray/v1/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py similarity index 67% rename from transforms/code/malware/kfp_ray/v1/malware_wf.py rename to transforms/code/malware/kfp_ray/malware_wf.py index dff8125cd..84e20fd36 100644 --- a/transforms/code/malware/kfp_ray/v1/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -10,32 +10,79 @@ # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) # the name of the job script EXEC_SCRIPT_NAME: str = "malware_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.5.0" +task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:0.5.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + cq_contents_column_name: str, + cq_language_column_name: str, + malware_input_column: str, + malware_output_column: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "cq_contents_column_name": cq_contents_column_name, + "cq_language_column_name": cq_language_column_name, + "malware_input_column": malware_input_column, + "malware_output_column": malware_output_column, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -107,7 +154,7 @@ def malware( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -115,6 +162,14 @@ def malware( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + malware_input_column=malware_input_column, + malware_output_column=malware_output_column, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster @@ -134,18 +189,7 @@ def malware( run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, # note that the parameters below are specific for malware transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "malware_input_column": malware_input_column, - "malware_output_column": malware_output_column, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/code/malware/kfp_ray/v1/Makefile b/transforms/code/malware/kfp_ray/v1/Makefile deleted file mode 100644 index d673ca682..000000000 --- a/transforms/code/malware/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=malware_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=malware_wf.py diff --git a/transforms/code/malware/kfp_ray/v2/Makefile 
b/transforms/code/malware/kfp_ray/v2/Makefile deleted file mode 100644 index 8bf51274c..000000000 --- a/transforms/code/malware/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=malware_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=malware_wf.py diff --git a/transforms/code/malware/kfp_ray/v2/malware_wf.py b/transforms/code/malware/kfp_ray/v2/malware_wf.py deleted file mode 100644 index 9cda17845..000000000 --- a/transforms/code/malware/kfp_ray/v2/malware_wf.py +++ /dev/null @@ -1,169 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - - -task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" - -# path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "malware" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for malware", -) -<<<<<<< HEAD:transforms/code/malware/kfp_ray/v2/malware_wf.py -def malware( - ray_name: str = "malware-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', -======= -def noop( - # Ray cluster - ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", ' '"image": "' + task_image + '"}', ->>>>>>> dev:transforms/universal/noop/kfp_ray/v1/noop_wf.py - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/malware/input', 'output_folder': 'test/malware/output'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # mallware - malware_input_column: str = "contents", - malware_output_column: str = "virus_detection", - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute malware transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - httpt retries for API server calls - :param data_s3_config - s3 configuration - :param data_s3_access_secret - s3 access secret - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param malware_input_column - input column name - :param malware_output_column - output column name - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - 
actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - # note that the parameters below are specific for malware transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "malware_input_column": malware_input_column, - "malware_output_column": malware_output_column, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(malware, __file__.replace(".py", ".yaml")) diff --git a/transforms/code/proglang_select/Makefile b/transforms/code/proglang_select/Makefile index f803fbe3b..e7ad671da 100644 --- a/transforms/code/proglang_select/Makefile +++ b/transforms/code/proglang_select/Makefile @@ -47,20 +47,21 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements + diff --git a/transforms/code/proglang_select/kfp_ray/Makefile b/transforms/code/proglang_select/kfp_ray/Makefile new file mode 100644 index 000000000..6d2d93ed7 --- /dev/null +++ b/transforms/code/proglang_select/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=proglang_select_wf.yaml + +.PHONY: workflow-upload 
+workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py similarity index 69% rename from transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py rename to transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index a32678aa9..b3b26a9db 100644 --- a/transforms/code/proglang_select/kfp_ray/v1/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -10,32 +10,75 @@ # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) # the name of the job script EXEC_SCRIPT_NAME: str = "proglang_select_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.4.0$(RELEASE_VERSION_SUFFIX)" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" + # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + proglang_select_allowed_langs_file: str, + proglang_select_language_column: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "proglang_select_allowed_langs_file": proglang_select_allowed_langs_file, + "proglang_select_language_column": proglang_select_language_column, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. 
+if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -111,7 +154,7 @@ def lang_select( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -119,6 +162,14 @@ def lang_select( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + proglang_select_allowed_langs_file=proglang_select_allowed_langs_file, + proglang_select_language_column=proglang_select_language_column, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster @@ -137,19 +188,8 @@ def lang_select( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "proglang_select_allowed_langs_file": proglang_select_allowed_langs_file, - "proglang_select_language_column": proglang_select_language_column, - }, + # note that the parameters below are specific for this transform + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, prefix=PREFIX, diff --git a/transforms/code/proglang_select/kfp_ray/v1/Makefile b/transforms/code/proglang_select/kfp_ray/v1/Makefile deleted file mode 100644 index e2c8c8b14..000000000 --- a/transforms/code/proglang_select/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=proglang_select_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: 
workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=proglang_select_wf.py diff --git a/transforms/code/proglang_select/kfp_ray/v2/Makefile b/transforms/code/proglang_select/kfp_ray/v2/Makefile deleted file mode 100644 index 3cf6c4084..000000000 --- a/transforms/code/proglang_select/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=proglang_select_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=proglang_select_wf.py diff --git a/transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py deleted file mode 100644 index b9bca1cfc..000000000 --- a/transforms/code/proglang_select/kfp_ray/v2/proglang_select_wf.py +++ /dev/null @@ -1,165 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "proglang_select_transform.py" - -task_image = "quay.io/dataprep1/data-prep-kit/proglang_select:0.3.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.1" - -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, insted of creating a component we are creating it in place here. 
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent_multi_s3.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "proglang_select" -PREFIX: str = "proglang_select" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for select language", -) -def lang_select( - ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/proglang_select/input/', 'output_folder': 'test/proglang_select/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # Proglang match parameters - proglang_select_allowed_langs_file: str = "test/proglang_select/languages/allowed-code-languages.txt", - proglang_select_language_column: str = "language", - proglang_select_s3_access_secret: str = "s3-secret", - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -) -> None: - """ - Pipeline to execute NOOP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - httpt retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param 
proglang_select_allowed_langs_file - file to store allowed languages - :param proglang_select_language_column - name of select language annotation column - :param proglang_select_s3_access_secret - block list access secret - (here we are assuming that select language info is in S3, but potentially in the different bucket) - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - # note that the parameters below are specific for NOOP transform - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "proglang_select_allowed_langs_file": proglang_select_allowed_langs_file, - "proglang_select_language_column": proglang_select_language_column, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - prefix=PREFIX, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component(execute_job, proglang_select_s3_access_secret, prefix=PREFIX) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(lang_select, __file__.replace(".py", ".yaml")) From 5da574beb6dd1693322c2d2f969083bea7ec9026 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 22:39:59 +0300 Subject: [PATCH 50/64] Adjust tokenization and filter workflows. 
Signed-off-by: Revital Sur --- .../code_quality/kfp_ray/code_quality_wf.py | 2 +- .../kfp_ray/proglang_select_wf.py | 2 +- transforms/universal/filter/Makefile | 10 +- transforms/universal/filter/kfp_ray/Makefile | 44 +++++ .../filter/kfp_ray/{v1 => }/filter_wf.py | 92 +++++++--- .../universal/filter/kfp_ray/v1/Makefile | 25 --- .../universal/filter/kfp_ray/v2/Makefile | 25 --- .../universal/filter/kfp_ray/v2/filter_wf.py | 167 ----------------- transforms/universal/tokenization/Makefile | 10 +- .../universal/tokenization/kfp_ray/Makefile | 44 +++++ .../kfp_ray/{v1 => }/tokenization_wf.py | 112 ++++++++---- .../tokenization/kfp_ray/v1/Makefile | 25 --- .../tokenization/kfp_ray/v2/Makefile | 25 --- .../kfp_ray/v2/tokenization_wf.py | 171 ------------------ 14 files changed, 247 insertions(+), 507 deletions(-) create mode 100644 transforms/universal/filter/kfp_ray/Makefile rename transforms/universal/filter/kfp_ray/{v1 => }/filter_wf.py (68%) delete mode 100644 transforms/universal/filter/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/filter/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/filter/kfp_ray/v2/filter_wf.py create mode 100644 transforms/universal/tokenization/kfp_ray/Makefile rename transforms/universal/tokenization/kfp_ray/{v1 => }/tokenization_wf.py (67%) delete mode 100644 transforms/universal/tokenization/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/tokenization/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 21cfa1380..e53880caf 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -21,7 +21,7 @@ EXEC_SCRIPT_NAME: str = "code_quality_transform_ray.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.4.0$(RELEASE_VERSION_SUFFIX)" +task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:0.4.0.dev6" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index b3b26a9db..0ed95ff64 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "proglang_select_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.4.0$(RELEASE_VERSION_SUFFIX)" +task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:0.4.0.dev6" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" diff --git a/transforms/universal/filter/Makefile b/transforms/universal/filter/Makefile index f803fbe3b..6104574ea 100644 --- a/transforms/universal/filter/Makefile +++ b/transforms/universal/filter/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload 
+ $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/filter/kfp_ray/Makefile b/transforms/universal/filter/kfp_ray/Makefile new file mode 100644 index 000000000..95e1914f9 --- /dev/null +++ b/transforms/universal/filter/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=filter_wf.py + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/filter/kfp_ray/v1/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py similarity index 68% rename from transforms/universal/filter/kfp_ray/v1/filter_wf.py rename to transforms/universal/filter/kfp_ray/filter_wf.py index 9780f06c4..17bd22ab1 100644 --- a/transforms/universal/filter/kfp_ray/v1/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -10,35 +10,77 @@ # limitations under the License. ################################################################################ -# NOTE: This file is auto generated by Pipeline Generator. +import os import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils # the name of the job script EXEC_SCRIPT_NAME: str = "filter_transform_ray.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:0.4.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + filter_criteria_list: str, + filter_logical_operator: str, + filter_columns_to_drop: str, +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "filter_criteria_list": filter_criteria_list, + "filter_logical_operator": filter_logical_operator, + "filter_columns_to_drop": filter_columns_to_drop, + } + + +# KFPv1 and KFPv2 use different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, in the meantime we use a unique string created at + # compilation time. 
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -113,7 +155,7 @@ def filtering( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -121,7 +163,17 @@ def filtering( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + filter_criteria_list=filter_criteria_list, + filter_logical_operator=filter_logical_operator, + filter_columns_to_drop=filter_columns_to_drop, ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster ray_cluster = create_ray_op( @@ -140,19 +192,7 @@ def filtering( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "filter_criteria_list": filter_criteria_list, - "filter_logical_operator": filter_logical_operator, - "filter_columns_to_drop": filter_columns_to_drop, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/universal/filter/kfp_ray/v1/Makefile b/transforms/universal/filter/kfp_ray/v1/Makefile deleted file mode 100644 index b7696b246..000000000 --- a/transforms/universal/filter/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=filter_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=filter_wf.py diff --git a/transforms/universal/filter/kfp_ray/v2/Makefile b/transforms/universal/filter/kfp_ray/v2/Makefile deleted file mode 100644 index c64f90af8..000000000 --- a/transforms/universal/filter/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 
+0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=filter_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=filter_wf.py diff --git a/transforms/universal/filter/kfp_ray/v2/filter_wf.py b/transforms/universal/filter/kfp_ray/v2/filter_wf.py deleted file mode 100644 index 11cf20b9b..000000000 --- a/transforms/universal/filter/kfp_ray/v2/filter_wf.py +++ /dev/null @@ -1,167 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -# NOTE: This file is auto generated by Pipeline Generator. - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "filter_transform.py" -PREFIX: str = "" - -task_image = "quay.io/dataprep1/data-prep-kit/filter:0.3.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" -# compute execution parameters. Here different tranforms might need different implementations. As -# a result, insted of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "filter" - - -# Pipeline to invoke execution on remote resource -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for filtering task", -) -def filtering( - # Ray cluster - ray_name: str = "filter-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/filter/input/', 'output_folder': 'test/filter/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # filtering parameters - filter_criteria_list: str = "['docq_total_words > 100 AND docq_total_words < 200', 'ibmkenlm_docq_perplex_score < 230']", - filter_logical_operator: str = "AND", - filter_columns_to_drop: str = "['extra', 'cluster']", - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute Filtering transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param filter_criteria_list - list of filter criteria (in SQL WHERE clause format) - :param filter_logical_operator - logical operator (AND or OR) that joins filter criteria - :param filter_columns_to_drop - list of columns to drop after filtering - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - 
worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "filter_criteria_list": filter_criteria_list, - "filter_logical_operator": filter_logical_operator, - "filter_columns_to_drop": filter_columns_to_drop, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(filtering, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index 08f3837b7..41413c041 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile new file mode 100644 index 000000000..6c1686246 --- /dev/null +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -0,0 +1,44 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +image:: + +load-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} 
PIPELINE_FILE=tokenization_wf.py + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py similarity index 67% rename from transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py rename to transforms/universal/tokenization/kfp_ray/tokenization_wf.py index d02fca5aa..4ab872ce8 100644 --- a/transforms/universal/tokenization/kfp_ray/v1/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -9,32 +9,86 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import os + +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - # the name of the job script EXEC_SCRIPT_NAME: str = "tokenization_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:0.3.0" +task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:0.4.0.dev6" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" + + # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. -compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) +def compute_exec_params_func( + worker_options: str, + actor_options: str, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: str, + tkn_tokenizer: str, + tkn_tokenizer_args: str, + tkn_doc_id_column: str, + tkn_doc_content_column: str, + tkn_text_lang: str, + tkn_chunk_size: int + + +) -> dict: + from workflow_support.runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": KFPUtils.default_compute_execution_params(worker_options, actor_options), + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "tkn_tokenizer": tkn_tokenizer, + "tkn_tokenizer_args": tkn_tokenizer_args, + "tkn_doc_id_column": tkn_doc_id_column, + "tkn_doc_content_column": tkn_doc_content_column, + "tkn_text_lang": tkn_text_lang, + "tkn_chunk_size": tkn_chunk_size, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. 
KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -116,7 +170,7 @@ def tokenization( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -124,10 +178,21 @@ def tokenization( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + tkn_tokenizer=tkn_tokenizer, + tkn_tokenizer_args=tkn_tokenizer_args, + tkn_doc_id_column=tkn_doc_id_column, + tkn_doc_content_column=tkn_doc_content_column, + tkn_text_lang=tkn_text_lang, + tkn_chunk_size=tkn_chunk_size, ) + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) - # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, @@ -144,22 +209,7 @@ def tokenization( ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "tkn_tokenizer": tkn_tokenizer, - "tkn_tokenizer_args": tkn_tokenizer_args, - "tkn_doc_id_column": tkn_doc_id_column, - "tkn_doc_content_column": tkn_doc_content_column, - "tkn_text_lang": tkn_text_lang, - "tkn_chunk_size": tkn_chunk_size, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) diff --git a/transforms/universal/tokenization/kfp_ray/v1/Makefile b/transforms/universal/tokenization/kfp_ray/v1/Makefile deleted file mode 100644 index 5814e2935..000000000 --- a/transforms/universal/tokenization/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ 
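For reference, the dual-SDK pattern that these workflow files now share reduces to the following minimal, self-contained sketch (the stub function and image name are illustrative placeholders, not part of the patch): the KFPv2 path wraps the parameter-computation function with the component decorator and bakes a uuid into the compiled pipeline, while the KFPv1 path keeps create_component_from_func and the runtime RUN_ID_PLACEHOLDER.

import os
import uuid

import kfp.components as comp
import kfp.dsl as dsl

base_kfp_image = "example.registry/kfp-data-processing:latest"  # placeholder image


def compute_exec_params_func(worker_options: str, actor_options: str) -> dict:
    # a transform would assemble its full execution-parameter dict here
    return {"runtime_worker_options": actor_options}


if os.getenv("KFPv2", "0") == "1":
    # KFP SDK v2: build the component with the decorator
    compute_exec_params_op = dsl.component_decorator.component(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    # a unique string fixed at compilation time, since RUN_ID_PLACEHOLDER is deprecated
    run_id = uuid.uuid4().hex
else:
    # KFP SDK v1: classic function-to-container-component conversion
    compute_exec_params_op = comp.create_component_from_func(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = dsl.RUN_ID_PLACEHOLDER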
-WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=tokenization_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=tokenization_wf.py diff --git a/transforms/universal/tokenization/kfp_ray/v2/Makefile b/transforms/universal/tokenization/kfp_ray/v2/Makefile deleted file mode 100644 index 232c8b44a..000000000 --- a/transforms/universal/tokenization/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=tokenization_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=tokenization_wf.py diff --git a/transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py deleted file mode 100644 index bed08e80a..000000000 --- a/transforms/universal/tokenization/kfp_ray/v2/tokenization_wf.py +++ /dev/null @@ -1,171 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "tokenization_transform.py" - -task_image = "quay.io/dataprep1/data-prep-kit/tokenization:0.2.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" -# compute execution parameters. Use default one for now. 
-compute_exec_params_op = comp.func_to_container_op( - func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image -) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "tokenization" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for tokenization", -) -def tokenization( - # Ray cluster - ray_name: str = "tkn-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/tokenization/ds01/input/', 'output_folder': 'test/tokenization/ds01/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # tokenizer parameters - tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", - tkn_doc_id_column: str = "document_id", - tkn_doc_content_column: str = "contents", - tkn_text_lang: str = "en", - tkn_tokenizer_args: str = "cache_dir=/tmp/hf", - tkn_chunk_size: int = 0, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute tokenization transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param tkn_tokenizer - 
Tokenizer used for tokenization - :param tkn_tokenizer_args - Arguments for tokenizer. - :param tkn_doc_id_column - Column contains document id which values should be unique across dataset - :param tkn_doc_content_column - Column contains document content - :param tkn_text_lang - Specify language used in the text content for better text splitting if needed - :param tkn_chunk_size - Specify >0 value to tokenize each row/text in chunks of characters (rounded in words) - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) - - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.output, - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "tkn_tokenizer": tkn_tokenizer, - "tkn_tokenizer_args": tkn_tokenizer_args, - "tkn_doc_id_column": tkn_doc_id_column, - "tkn_doc_content_column": tkn_doc_content_column, - "tkn_text_lang": tkn_text_lang, - "tkn_chunk_size": tkn_chunk_size, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(tokenization, __file__.replace(".py", ".yaml")) From d4fb2d5d81af2daee9eaa9f0628250d27957d3e0 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 22:48:27 +0300 Subject: [PATCH 51/64] Fixes after testing. 
Signed-off-by: Revital Sur --- transforms/code/malware/kfp_ray/malware_wf.py | 4 ---- transforms/code/proglang_select/kfp_ray/Makefile | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 84e20fd36..0fb03884d 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -41,8 +41,6 @@ def compute_exec_params_func( runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: str, - cq_contents_column_name: str, - cq_language_column_name: str, malware_input_column: str, malware_output_column: str, ) -> dict: @@ -57,8 +55,6 @@ def compute_exec_params_func( "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": runtime_code_location, - "cq_contents_column_name": cq_contents_column_name, - "cq_language_column_name": cq_language_column_name, "malware_input_column": malware_input_column, "malware_output_column": malware_output_column, } diff --git a/transforms/code/proglang_select/kfp_ray/Makefile b/transforms/code/proglang_select/kfp_ray/Makefile index 6d2d93ed7..2bdfb2d1d 100644 --- a/transforms/code/proglang_select/kfp_ray/Makefile +++ b/transforms/code/proglang_select/kfp_ray/Makefile @@ -29,7 +29,7 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE= proglang_select_wf.yaml + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=proglang_select_wf.yaml .PHONY: workflow-upload workflow-upload: workflow-build From c3930e7385a28f5d1b659cf3eb57683887420470 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 23:06:01 +0300 Subject: [PATCH 52/64] More fixes. 
Signed-off-by: Revital Sur --- transforms/.make.workflows | 15 +-------------- transforms/universal/doc_id/Makefile | 10 +++++----- transforms/universal/doc_id/kfp_ray/Makefile | 8 +++++--- transforms/universal/ededup/kfp_ray/Makefile | 4 ++++ transforms/universal/fdedup/Makefile | 10 +++++----- transforms/universal/noop/kfp_ray/Makefile | 4 ++++ transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- 7 files changed, 25 insertions(+), 28 deletions(-) diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 62e6fc457..4a9d0d0a8 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -6,27 +6,14 @@ include ${REPOROOT}/kfp/requirements.env include ${REPOROOT}/.make.defaults USE_DEV_IMAGES ?= 1 -TRANSFORM_RUNTIME = ray define set_env_var $(eval export $(1)=$(2)) endef +# FIXME .PHONY: .transforms_workflows.reconcile-requirements .transforms_workflows.reconcile-requirements: - cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) reconcile-requirements - @while IFS= read -r line; do \ - [ -z "$$line" ] && continue; \ - [[ $$line == *#* ]] && continue; \ - export DOCKER_IMAGE_NAME=$$(echo $$line |cut -d "=" -f 1 |sed "s/_VERSION//" |tr '[:upper:]' '[:lower:]'); \ - export DOCKER_IMAGE_VERSION=$$(echo $$line |cut -d "=" -f 2); \ - sed -i.back "s/data-prep-kit\/$$DOCKER_IMAGE_NAME\-${TRANSFORM_RUNTIME}:.*/data-prep-kit\/$$DOCKER_IMAGE_NAME\-${TRANSFORM_RUNTIME}:$$DOCKER_IMAGE_VERSION\"/" $$PIPELINE_FILE ;\ - done < ${REPOROOT}/.make.versions -ifeq ($(KFPv2), 1) - @sed -i.back "s/kfp-data-processing_v2:.*/kfp-data-processing_v2:${KFP_DOCKER_VERSION_v2}\"/" ${PIPELINE_FILE} -else - @sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION}\"/" ${PIPELINE_FILE} -endif .PHONY: .transforms_workflows.compile-pipeline diff --git a/transforms/universal/doc_id/Makefile b/transforms/universal/doc_id/Makefile index 2f55ab35a..da86986db 100644 --- a/transforms/universal/doc_id/Makefile +++ b/transforms/universal/doc_id/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile index bb58a8a63..54b7e3781 100644 --- a/transforms/universal/doc_id/kfp_ray/Makefile +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -19,11 +19,13 @@ test-src:: test-image:: +image:: + +load-image:: + .PHONY: workflow-build workflow-build: workflow-venv - @for file in $(YAML_WF); do \ - $(MAKE) $$file; \ - done + $(MAKE) $(YAML_WF) .PHONY: workflow-test workflow-test: workflow-build diff --git a/transforms/universal/ededup/kfp_ray/Makefile b/transforms/universal/ededup/kfp_ray/Makefile index 226dac080..a834f7e50 100644 --- a/transforms/universal/ededup/kfp_ray/Makefile +++ b/transforms/universal/ededup/kfp_ray/Makefile @@ -19,6 +19,10 @@ test-src:: test-image:: +image:: + +load-image:: + .PHONY: workflow-build workflow-build: workflow-venv 
$(MAKE) $(YAML_WF) diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index 08f3837b7..41413c041 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -47,20 +47,20 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements workflow-reconcile-requirements: - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile index 2097300a7..4f1d5ee7c 100644 --- a/transforms/universal/noop/kfp_ray/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -19,6 +19,10 @@ test-src:: test-image:: +image:: + +load-image:: + .PHONY: workflow-build workflow-build: workflow-venv $(MAKE) $(YAML_WF) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index fc296b284..3c2c8cf9c 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -18,7 +18,7 @@ import kfp.dsl as dsl -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" +task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" From d0d2f594507ba7a4534402977425e1b05a36c98a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 23:16:30 +0300 Subject: [PATCH 53/64] Minor fixes. Signed-off-by: Revital Sur --- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/tokenization/kfp_ray/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 3c2c8cf9c..2076ea05e 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -21,7 +21,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" # the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" +EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile index 6c1686246..7d5aa6687 100644 --- a/transforms/universal/tokenization/kfp_ray/Makefile +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -29,7 +29,7 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=tokenization_wf.py + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=tokenization_wf.yaml .PHONY: workflow-upload workflow-upload: workflow-build From 5db777a58c1591a75c7d25d2375ab32cac5e1e49 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 5 Jun 2024 23:53:47 +0300 Subject: [PATCH 54/64] Fixes after testing. 
Signed-off-by: Revital Sur --- transforms/universal/ededup/Makefile | 10 +++++----- transforms/universal/filter/kfp_ray/Makefile | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/transforms/universal/ededup/Makefile b/transforms/universal/ededup/Makefile index 3cf3ac2ac..a766f453e 100644 --- a/transforms/universal/ededup/Makefile +++ b/transforms/universal/ededup/Makefile @@ -47,19 +47,19 @@ load-image:: .PHONY: workflow-venv workflow-venv: - $(MAKE) -C $(PIPELINE_PATH) workflow-venv + $(MAKE) -C kfp_ray workflow-venv .PHONY: workflow-build workflow-build: - $(MAKE) -C $(PIPELINE_PATH) workflow-build + $(MAKE) -C kfp_ray workflow-build .PHONY: workflow-test workflow-test: - $(MAKE) -C $(PIPELINE_PATH) workflow-test + $(MAKE) -C kfp_ray workflow-test .PHONY: workflow-upload workflow-upload: - $(MAKE) -C $(PIPELINE_PATH) workflow-upload + $(MAKE) -C kfp_ray workflow-upload .PHONY: workflow-reconcile-requirements - $(MAKE) -C $(PIPELINE_PATH) workflow-reconcile-requirements + $(MAKE) -C kfp_ray workflow-reconcile-requirements diff --git a/transforms/universal/filter/kfp_ray/Makefile b/transforms/universal/filter/kfp_ray/Makefile index 95e1914f9..4d8779a25 100644 --- a/transforms/universal/filter/kfp_ray/Makefile +++ b/transforms/universal/filter/kfp_ray/Makefile @@ -29,7 +29,7 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=filter_wf.py + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=filter_wf.yaml .PHONY: workflow-upload workflow-upload: workflow-build From 09ba6bcf468e2d7f0b695d7a0cc71c09936a490a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 6 Jun 2024 13:38:41 +0300 Subject: [PATCH 55/64] Fixes after testing. Signed-off-by: Revital Sur --- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/ededup/kfp_ray/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 4ce041657..79f545b7a 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -18,7 +18,7 @@ import kfp.dsl as dsl -task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.4.0" +task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.4.0.dev6" # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform.py" diff --git a/transforms/universal/ededup/kfp_ray/Makefile b/transforms/universal/ededup/kfp_ray/Makefile index a834f7e50..235258fd6 100644 --- a/transforms/universal/ededup/kfp_ray/Makefile +++ b/transforms/universal/ededup/kfp_ray/Makefile @@ -29,7 +29,7 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=ededup_wf.yaml .PHONY: workflow-upload workflow-upload: workflow-build From 5a17ebf1002a4ed38d2bd3b62578fa2593be1544 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Thu, 6 Jun 2024 14:34:42 +0300 Subject: [PATCH 56/64] More fixes. 
Signed-off-by: Revital Sur --- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 3 +-- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 79f545b7a..9bc58e9df 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -21,8 +21,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:0.4.0.dev6" # the name of the job script -EXEC_SCRIPT_NAME: str = "doc_id_transform.py" - +EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index ad3d979cc..85463b963 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -18,7 +18,7 @@ import kfp.dsl as dsl -task_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.4.0$(RELEASE_VERSION_SUFFIX)" +task_image = "quay.io/dataprep1/data-prep-kit/ededup-ray:0.4.0.dev6" # the name of the job script EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 71098c3f3..a36ff5ca5 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -18,10 +18,10 @@ import kfp.dsl as dsl -task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0" +task_image = "quay.io/dataprep1/data-prep-kit/noop-ray:0.9.0.dev6" # the name of the job script -EXEC_SCRIPT_NAME: str = "noop_transform.py" +EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" From a1f759fce251fe6280e107ece2111c747d266c25 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 21:14:12 +0300 Subject: [PATCH 57/64] fix ededup Signed-off-by: Alexey Roytman --- .../universal/ededup/kfp_ray/ededup_wf.py | 39 ++-------------- .../src/ededup_compute_execution_params.py | 45 ++++++++++++++----- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 85463b963..c341cbb6a 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -11,6 +11,7 @@ ################################################################################ import os +from src.ededup_compute_execution_params import ededup_compute_execution_params from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler @@ -29,40 +30,6 @@ # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" -# compute execution parameters -def compute_exec_params_func( - worker_options: str, - actor_options: str, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: str, - ededup_doc_column: str, - ededup_hash_cpu: float, - ededup_n_samples: int, -) -> dict: - from src.ededup_compute_execution_params import ededup_compute_execution_params - workers, hashes = 
ededup_compute_execution_params(worker_options=worker_options, - actor_options=actor_options, - params={"s3_config": data_s3_config, "hash_cpu": ededup_hash_cpu}, - n_samples=ededup_n_samples) - return { - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": workers, - "runtime_worker_options": actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, - "runtime_code_location": runtime_code_location, - "ededup_doc_column": ededup_doc_column, - "ededup_hash_cpu": ededup_hash_cpu, - "ededup_num_hashes": hashes, - } - - # KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the # `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use @@ -79,7 +46,9 @@ def compute_exec_params_func( ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + compute_exec_params_op = comp.create_component_from_func( + func=ededup_compute_execution_params, base_image=base_kfp_image + ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster diff --git a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py index f61d7b45e..16a5a0c28 100644 --- a/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py +++ b/transforms/universal/ededup/kfp_ray/src/ededup_compute_execution_params.py @@ -16,18 +16,31 @@ def ededup_compute_execution_params( worker_options: str, # ray worker configuration actor_options: str, # actor's resource requirements - params: dict[str, Any], # exact dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> (int, int): + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: str, # code location + doc_column: str, # key for accessing data + hash_cpu: float, # number of CPUs per hash + n_samples: int, # number of samples for parameters computation +) -> dict: """ Compute exact dedup execution parameters :param worker_options: cluster parameters :param actor_options: actor request requirements :param n_samples: number of samples to use - :param params: exact dedup specific parameters containing the following keys: - s3_config - s3 config - hash_cpu - hash cpu requirements - :return: json string, containing computed number of workers and hashes + :param data_s3_config - s3 config + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_pipeline_id - pipeline id + :param runtime_job_id - job id, or just a unique string + :param runtime_code_location - code location + :param doc_column - key for accessing data + :param hash_cpu - number of CPUs per hash + :param n_samples - umber of samples for parameters computation + :return: a dictionary with a Ray Job execution parameters """ # required import import math @@ -53,7 +66,7 @@ def ededup_compute_execution_params( # get credentials s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = 
KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) if type(s3_config) is list: # S3 config is list. take the first element s3_config = s3_config[0] @@ -71,7 +84,6 @@ def ededup_compute_execution_params( n_hashes = math.ceil(number_of_docs * 32 / GB) print(f"Estimated Required hashes {n_hashes}") print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - hash_cpu: float = float(params.get("hash_cpu")) required_hash_cpu = n_hashes * hash_cpu required_hash_mem = n_hashes * 2 if required_hash_cpu > cluster_cpu or required_hash_mem > cluster_memory: @@ -97,5 +109,16 @@ def ededup_compute_execution_params( print(f"Try to increase the size of the cluster or increase size of the cpu per worker") sys.exit(1) print(f"Projected execution time {EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60} min") - # return json.dumps({"workers": n_workers, "hashes": n_hashes}) - return n_workers, n_hashes + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": n_workers, + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "ededup_doc_column": doc_column, + "ededup_hash_cpu": hash_cpu, + "ededup_num_hashes": n_hashes, + } From 36f6d9263d955dbc4543cacfdcf82837facfcb11 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 21:38:42 +0300 Subject: [PATCH 58/64] fix ededup2 Signed-off-by: Alexey Roytman --- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index c341cbb6a..43e7d915a 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -42,7 +42,7 @@ import uuid compute_exec_params_op = dsl.component_decorator.component( - func=compute_exec_params_func, base_image=base_kfp_image + func=ededup_compute_execution_params, base_image=base_kfp_image ) run_id = uuid.uuid4().hex else: From fc21bb832fbd66a38544ea143bea55414a539d10 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 21:44:52 +0300 Subject: [PATCH 59/64] fix ededup3 Signed-off-by: Alexey Roytman --- transforms/universal/ededup/kfp_ray/ededup_wf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 43e7d915a..5e72caa63 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -141,9 +141,9 @@ def ededup( runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - ededup_doc_column=ededup_doc_column, - ededup_hash_cpu=ededup_hash_cpu, - ededup_n_samples=ededup_n_samples, + doc_column=ededup_doc_column, + hash_cpu=ededup_hash_cpu, + n_samples=ededup_n_samples, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) From 3892937bcd3c02d7800aeb04f3eb50d3189eccd3 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 23:42:14 +0300 Subject: [PATCH 60/64] fix fdedup Signed-off-by: Alexey Roytman --- 
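Note: the reworked compute-execution-params components size the Ray topology from the data itself, and their full return dict is handed to the execute-Ray-job component as a single output (exec_params=compute_exec_params.output). The ededup variant above estimates hash actors as ceil(number_of_docs * 32 / GB); the fdedup variant below additionally derives the MinHash-LSH banding from fdedup_threshold and fdedup_num_permutations by minimizing a weighted sum of false-positive and false-negative probabilities. A minimal, self-contained sketch of that banding search, with illustrative names (the patch's own helper is fuzzy_optimal_param, which uses the same scipy quad integrals):

from scipy.integrate import quad


def false_positive_area(threshold: float, b: int, r: int) -> float:
    # pairs below the similarity threshold that still collide in at least one band
    return quad(lambda s: 1 - (1 - s**r) ** b, 0.0, threshold)[0]


def false_negative_area(threshold: float, b: int, r: int) -> float:
    # pairs above the similarity threshold that never collide in any band
    return quad(lambda s: 1 - (1 - (1 - s**r) ** b), threshold, 1.0)[0]


def optimal_banding(threshold: float = 0.8, num_perm: int = 64,
                    fp_weight: float = 0.5, fn_weight: float = 0.5) -> tuple[int, int]:
    best_error, best = float("inf"), (1, 1)
    for b in range(1, num_perm + 1):            # candidate number of buckets (bands)
        for r in range(1, num_perm // b + 1):   # minhashes per bucket
            error = (fp_weight * false_positive_area(threshold, b, r)
                     + fn_weight * false_negative_area(threshold, b, r))
            if error < best_error:
                best_error, best = error, (b, r)
    return best  # (num_buckets, length_bucket)


if __name__ == "__main__":
    print(optimal_banding())  # banding chosen for threshold=0.8 with 64 permutations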
.../universal/ededup/kfp_ray/ededup_wf.py | 1 - transforms/universal/fdedup/kfp_ray/Makefile | 40 ++++ .../fdedup/kfp_ray/{v1 => }/fdedup_wf.py | 107 ++++----- .../src/fdedup_compute_execution_params.py | 106 ++++++--- .../universal/fdedup/kfp_ray/v1/Makefile | 25 -- .../universal/fdedup/kfp_ray/v2/Makefile | 25 -- .../universal/fdedup/kfp_ray/v2/fdedup_wf.py | 216 ------------------ .../v2/src/fdedup_compute_execution_params.py | 178 --------------- 8 files changed, 174 insertions(+), 524 deletions(-) create mode 100644 transforms/universal/fdedup/kfp_ray/Makefile rename transforms/universal/fdedup/kfp_ray/{v1 => }/fdedup_wf.py (74%) rename transforms/universal/fdedup/kfp_ray/{v1 => }/src/fdedup_compute_execution_params.py (63%) delete mode 100644 transforms/universal/fdedup/kfp_ray/v1/Makefile delete mode 100644 transforms/universal/fdedup/kfp_ray/v2/Makefile delete mode 100644 transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py delete mode 100644 transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 5e72caa63..fdee012c0 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -164,7 +164,6 @@ def ededup( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - # note that the parameters below are specific for NOOP transform exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile new file mode 100644 index 000000000..f741801bc --- /dev/null +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -0,0 +1,40 @@ +REPOROOT=${CURDIR}/../../../../ +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +venv:: + +build:: + +test:: + +test-src:: + +test-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=fdedup_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-reconcile-requirements +workflow-reconcile-requirements: + @for file in $(PYTHON_WF); do \ + $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=$$file; \ + done diff --git a/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py similarity index 74% rename from transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py rename to transforms/universal/fdedup/kfp_ray/fdedup_wf.py index fc242c38e..ea35975ed 100644 --- a/transforms/universal/fdedup/kfp_ray/v1/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -9,31 +9,48 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ +import os + +from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.4.0.dev6" + # the name of the job script EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:0.4.0" - # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.0.dev6" # path to kfp component specifications files -component_spec_path = "../../../../../kfp/kfp_ray_components/" +component_spec_path = "../../../../kfp/kfp_ray_components/" + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. 
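    # (Because that unique string is generated below with uuid.uuid4() at compilation time, every
    # run of the compiled pipeline YAML reuses the same run_id; recompiling yields a new one.)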
+ import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=fdedup_compute_execution_params, base_image=base_kfp_image + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func( + func=fdedup_compute_execution_params, base_image=base_kfp_image + ) + run_id = dsl.RUN_ID_PLACEHOLDER -# compute execution parameters -compute_exec_params_op = comp.func_to_container_op(func=fdedup_compute_execution_params, base_image=base_kfp_image) # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job @@ -139,7 +156,7 @@ def fdedup( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url) ComponentUtils.add_settings_to_component(clean_up_task, 60) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -147,14 +164,26 @@ def fdedup( compute_exec_params = compute_exec_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, - params={ - "threshold": fdedup_threshold, - "num_permutations": fdedup_num_permutations, - "s3_config": data_s3_config, - "bucket_cpu": fdedup_bucket_cpu, - "doc_cpu": fdedup_doc_cpu, - "minhash_cpu": fdedup_mhash_cpu, - }, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_column=fdedup_doc_column, + id_column=fdedup_id_column, + cluster_column=fdedup_cluster_column, + bucket_cpu=fdedup_bucket_cpu, + doc_cpu=fdedup_doc_cpu, + mhash_cpu=fdedup_mhash_cpu, + num_permutations=fdedup_num_permutations, + threshold=fdedup_threshold, + shingles_size=fdedup_shingles_size, + delimiters=fdedup_delimiters, + random_delay_limit=fdedup_random_delay_limit, + snapshot_delay=fdedup_snapshot_delay, + use_doc_snapshot=fdedup_use_doc_snapshot, + use_bucket_snapshot=fdedup_use_bucket_snapshot, n_samples=fdedup_n_samples, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) @@ -163,7 +192,7 @@ def fdedup( # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -174,36 +203,9 @@ def fdedup( # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.outputs["workers"], - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "fdedup_doc_column": fdedup_doc_column, - "fdedup_id_column": fdedup_id_column, - "fdedup_cluster_column": fdedup_cluster_column, - "fdedup_bucket_cpu": fdedup_bucket_cpu, - "fdedup_doc_cpu": fdedup_doc_cpu, - "fdedup_mhash_cpu": fdedup_mhash_cpu, - "fdedup_num_doc_actors": compute_exec_params.outputs["docs"], - "fdedup_num_bucket_actors": compute_exec_params.outputs["buckets"], - "fdedup_num_minhash_actors": compute_exec_params.outputs["min_hashes"], - "fdedup_num_preprocessors": 
compute_exec_params.outputs["preprocessors"], - "fdedup_num_permutations": fdedup_num_permutations, - "fdedup_threshold": fdedup_threshold, - "fdedup_shingles_size": fdedup_shingles_size, - "fdedup_delimiters": fdedup_delimiters, - "fdedup_random_delay_limit": fdedup_random_delay_limit, - "fdedup_snapshot_delay": fdedup_snapshot_delay, - "fdedup_use_doc_snapshot": fdedup_use_doc_snapshot, - "fdedup_use_bucket_snapshot": fdedup_use_bucket_snapshot, - }, + exec_params=compute_exec_params.output, exec_script_name=EXEC_SCRIPT_NAME, server_url=server_url, ) @@ -211,8 +213,9 @@ def fdedup( ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) execute_job.after(ray_cluster) + # TODO # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) + # dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) if __name__ == "__main__": diff --git a/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py similarity index 63% rename from transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py rename to transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 3f332432d..656caaa72 100644 --- a/transforms/universal/fdedup/kfp_ray/v1/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -16,37 +16,65 @@ def fdedup_compute_execution_params( worker_options: str, # ray worker configuration actor_options: str, # actor's resource requirements - params: dict[str, Any], # fuzzy dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> NamedTuple( - "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] -): + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: str, # code location + doc_column: str, # document column name + id_column: str, # integer document id column name + cluster_column: str, # cluster column name + bucket_cpu: float, # number of CPUs per bucket hash + doc_cpu: float, # number of CPUs per doc hash + mhash_cpu: float, # number of CPUs per minhash hash + num_permutations: int, # number of permutations + threshold: float, # threshold, + shingles_size: int, # number of words in shingle + delimiters: str, # delimiter for splitting document + random_delay_limit: int, # delay between reads to reduce S3 load. 
+ # A random number between 0 and random_delay_limit is used + snapshot_delay: int, # delay between restoring individual actors + use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots + use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots + n_samples: int, # number of samples to use +) -> dict: # NamedTuple( + # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + """ Compute fuzzy dedup execution parameters :param worker_options: cluster parameters :param actor_options: actor request requirements + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param doc_column: document column name + :param id_column: integer document id column name + :param cluster_column: cluster column name + :param bucket_cpu: number of CPUs per bucket hash + :param doc_cpu: number of CPUs per doc hash + :param mhash_cpu: number of CPUs per minhash hash + :param num_permutations: number of permutations + :param threshold: threshold, + :param shingles_size: number of words in shingle + :param delimiters: delimiter for splitting document + :param random_delay_limit: # delay between reads to reduce S3 load. A random number between 0 and random_delay_limit is used + :param snapshot_delay: delay between restoring individual actors + :param use_doc_snapshot: flag to skip documents building and start from existing snapshots + :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots :param n_samples: number of samples to use - :param params: fuzzy dedup specific parameters containing the following keys: - threshold - threshold for fuzzy computations - num_permutations - number of permutation - s3_config - s3 config - bucket_cpu - bucket actor cpu requirements - minhash_cpu - minhash actor cpu requirements - doc_cpu - doc actor cpu requirements - :return: json string, containing - workers - number of workers - preprocessors - number of preprocessors - docs - number of doc actors - buckets - number of bucket actors - min_hashes - number of minhash actors + :return: a dictionary with a Ray Job execution parameters """ import math import sys from data_processing.data_access import DataAccessS3 from data_processing.utils import GB, KB - from kfp_support.workflow_support.runtime_utils import KFPUtils from scipy.integrate import quad as integrate + from workflow_support.runtime_utils import KFPUtils EXECUTION_OF_KB_DOC = 0.003 @@ -104,8 +132,8 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: # fuzzy parameters num_buckets, length_bucket = fuzzy_optimal_param( - threshold=float(params.get("threshold")), - num_perm=int(params.get("num_permutations")), + threshold=threshold, + num_perm=num_permutations, false_positive_weight=0.5, false_negative_weight=0.5, ) @@ -124,7 +152,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: # get credentials s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) if type(s3_config) is list: # S3 config is list. 
take the first element s3_config = s3_config[0] @@ -143,13 +171,10 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) # compute cpu requirements - bucket_cpu = float(params.get("bucket_cpu")) - min_hash_cpu = float(params.get("minhash_cpu")) - doc_cpu = float(params.get("doc_cpu")) # Define number of preprocessors. We are assuming that preprocessors and workers are using the same amount # of CPUs n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * min_hash_cpu - d_actors * doc_cpu) / actor_cpu + (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu ) if n_preprocessors < 0: print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") @@ -181,4 +206,31 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 print(f"Projected execution time {projected_execution} min") - return (n_workers, n_preprocessors, d_actors, b_actors, m_actors) + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": n_workers, + "runtime_worker_options": actor_options, + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": runtime_code_location, + "fdedup_doc_column": doc_column, + "fdedup_id_column": id_column, + "fdedup_cluster_column": cluster_column, + "fdedup_bucket_cpu": bucket_cpu, + "fdedup_doc_cpu": doc_cpu, + "fdedup_mhash_cpu": mhash_cpu, + "fdedup_num_doc_actors": d_actors, + "fdedup_num_bucket_actors": b_actors, + "fdedup_num_minhash_actors": m_actors, + "fdedup_num_preprocessors": n_preprocessors, + "fdedup_num_permutations": num_permutations, + "fdedup_threshold": threshold, + "fdedup_shingles_size": shingles_size, + "fdedup_delimiters": delimiters, + "fdedup_random_delay_limit": random_delay_limit, + "fdedup_snapshot_delay": snapshot_delay, + "fdedup_use_doc_snapshot": use_doc_snapshot, + "fdedup_use_bucket_snapshot": use_bucket_snapshot, + } diff --git a/transforms/universal/fdedup/kfp_ray/v1/Makefile b/transforms/universal/fdedup/kfp_ray/v1/Makefile deleted file mode 100644 index 8a82e5d18..000000000 --- a/transforms/universal/fdedup/kfp_ray/v1/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=fdedup_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=fdedup_wf.py diff --git a/transforms/universal/fdedup/kfp_ray/v2/Makefile b/transforms/universal/fdedup/kfp_ray/v2/Makefile deleted file mode 100644 index 22ef10ef6..000000000 --- a/transforms/universal/fdedup/kfp_ray/v2/Makefile +++ /dev/null @@ -1,25 +0,0 
@@ -REPOROOT=${CURDIR}/../../../../../ -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.transforms_workflows - -SRC_DIR=${CURDIR}/../../ray/ - -YAML_FILE=fdedup_wf.yaml - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) ${YAML_FILE} - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .transforms_workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-upload -workflow-upload: workflow-build - $(MAKE) .transforms_workflows.upload-pipeline PIPELINE_FILE=${YAML_FILE} - -.PHONY: workflow-reconcile-requirements -workflow-reconcile-requirements: - $(MAKE) .transforms_workflows.reconcile-requirements PIPELINE_FILE=fdedup_wf.py diff --git a/transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py deleted file mode 100644 index d27d5b2ea..000000000 --- a/transforms/universal/fdedup/kfp_ray/v2/fdedup_wf.py +++ /dev/null @@ -1,216 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from kfp_support.workflow_support.runtime_utils import ( - ONE_HOUR_SEC, - ONE_WEEK_SEC, - ComponentUtils, -) -from src.fdedup_compute_execution_params import fdedup_compute_execution_params - - -# the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform.py" - -task_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.3.0" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0" - -# compute execution parameters -compute_exec_params_op = comp.func_to_container_op(func=fdedup_compute_execution_params, base_image=base_kfp_image) -# create Ray cluster -create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "fdedup" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", -) -def fdedup( - # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster - ray_head_options: str = '{"cpu": 1, "memory": 4, "image_pull_secret": "", "image": "' + task_image + '" }', - ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, ' - '"image_pull_secret": "", "image": "' + task_image + '"}', - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access. 
checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - # orchestrator - runtime_actor_options: str = "{'num_cpus': 0.8}", - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: str = "{'github': 'github', 'commit_hash': '12345', 'path': 'path'}", - # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, - # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, - # data sampling - fdedup_n_samples: int = 10, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5}', -): - """ - Pipeline to execute FDEDUP transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash - :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. 
- A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots - :param fdedup_n_samples - number of samples for parameters computation - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url) - ComponentUtils.add_settings_to_component(clean_up_task, 60) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - params={ - "threshold": fdedup_threshold, - "num_permutations": fdedup_num_permutations, - "s3_config": data_s3_config, - "bucket_cpu": fdedup_bucket_cpu, - "doc_cpu": fdedup_doc_cpu, - "minhash_cpu": fdedup_mhash_cpu, - }, - n_samples=fdedup_n_samples, - ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) - - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, - additional_params=additional_params, - exec_params={ - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "runtime_num_workers": compute_exec_params.outputs["workers"], - "runtime_worker_options": runtime_actor_options, - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": dsl.RUN_ID_PLACEHOLDER, - "runtime_code_location": runtime_code_location, - "fdedup_doc_column": fdedup_doc_column, - "fdedup_id_column": fdedup_id_column, - "fdedup_cluster_column": fdedup_cluster_column, - "fdedup_bucket_cpu": fdedup_bucket_cpu, - "fdedup_doc_cpu": fdedup_doc_cpu, - "fdedup_mhash_cpu": fdedup_mhash_cpu, - "fdedup_num_doc_actors": compute_exec_params.outputs["docs"], - "fdedup_num_bucket_actors": compute_exec_params.outputs["buckets"], - "fdedup_num_minhash_actors": compute_exec_params.outputs["min_hashes"], - "fdedup_num_preprocessors": compute_exec_params.outputs["preprocessors"], - "fdedup_num_permutations": fdedup_num_permutations, - "fdedup_threshold": fdedup_threshold, - "fdedup_shingles_size": fdedup_shingles_size, - "fdedup_delimiters": fdedup_delimiters, - "fdedup_random_delay_limit": fdedup_random_delay_limit, - "fdedup_snapshot_delay": fdedup_snapshot_delay, - "fdedup_use_doc_snapshot": fdedup_use_doc_snapshot, - "fdedup_use_bucket_snapshot": fdedup_use_bucket_snapshot, - }, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - # Configure the pipeline level to one week (in seconds) - dsl.get_pipeline_conf().set_timeout(ONE_WEEK_SEC) - - -if __name__ == "__main__": - # Compiling the pipeline - 
compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py deleted file mode 100644 index f511784c4..000000000 --- a/transforms/universal/fdedup/kfp_ray/v2/src/fdedup_compute_execution_params.py +++ /dev/null @@ -1,178 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -from typing import Any, NamedTuple - - -def fdedup_compute_execution_params( - worker_options: str, # ray worker configuration - actor_options: str, # actor's resource requirements - params: dict[str, Any], # fuzzy dedup specific parameters - n_samples: int = 10, # number of samples to use -) -> NamedTuple( - "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] -): - """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements - :param n_samples: number of samples to use - :param params: fuzzy dedup specific parameters containing the following keys: - threshold - threshold for fuzzy computations - num_permutations - number of permutation - s3_config - s3 config - bucket_cpu - bucket actor cpu requirements - minhash_cpu - minhash actor cpu requirements - doc_cpu - doc actor cpu requirements - :return: json string, containing - workers - number of workers - preprocessors - number of preprocessors - docs - number of doc actors - buckets - number of bucket actors - min_hashes - number of minhash actors - """ - import math - import sys - - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from kfp_support.workflow_support.runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - 
(1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=float(params.get("threshold")), - num_perm=int(params.get("num_permutations")), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") - # Get cluster parameters - w_options = KFPUtils.load_from_json(worker_options.replace("'", '"')) - cluster_cpu = w_options["replicas"] * w_options["cpu"] - cluster_memory = w_options["replicas"] * w_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu *= 0.85 - cluster_memory *= 0.85 - # get actor requirements - a_options = KFPUtils.load_from_json(actor_options.replace("'", '"')) - actor_cpu = a_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(params.get("s3_config", {}).replace("'", '"')) - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - bucket_cpu = float(params.get("bucket_cpu")) - min_hash_cpu = float(params.get("minhash_cpu")) - doc_cpu = float(params.get("doc_cpu")) - # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * min_hash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors < 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) - - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) - - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * min_hash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) - - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") - return (n_workers, n_preprocessors, d_actors, b_actors, m_actors) From bb834ecb6e5635ee53b5f911ed0ab243463c8334 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Thu, 6 Jun 2024 23:56:43 +0300 Subject: [PATCH 61/64] fix fdedup2 Signed-off-by: Alexey Roytman --- .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 656caaa72..9d07940c1 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -201,7 +201,7 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float: print( f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * min_hash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" + f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" ) projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 From f8ead56c78fa8428e561fe2d8d174617b83e2482 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Fri, 7 Jun 2024 19:24:01 +0300 Subject: [PATCH 62/64] fix comments Signed-off-by: Alexey Roytman --- kfp/kfp_ray_components/Dockerfile | 20 +++++--- .../kfp_v1_workflow_support/Makefile | 6 ++- .../kfp_v1_workflow_support/pyproject.toml | 2 +- .../kfp_v2_workflow_support/Makefile | 10 ++-- .../kfp_v2_workflow_support/pyproject.toml | 2 +- .../universal/filter/src/local_pipeline.py | 51 ------------------- 6 files changed, 25 insertions(+), 66 deletions(-) delete mode 100644 transforms/universal/filter/src/local_pipeline.py diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 90bd04549..a012640ec 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ 
b/kfp/kfp_ray_components/Dockerfile @@ -1,11 +1,5 @@ FROM docker.io/rayproject/ray:2.9.3-py310 -ARG BUILD_DATE -ARG GIT_COMMIT - -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT - # install libraries COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt @@ -22,8 +16,20 @@ RUN cd python_apiserver_client && pip install --no-cache-dir -e . COPY --chown=ray:users workflow_support_lib workflow_support_lib/ RUN cd workflow_support_lib && pip install --no-cache-dir -e . -ENV KFP_v2=$KFP_v2 + +# overwriting the installation of old versions of pydantic +RUN pip install --no-cache-dir pydantic==2.6.3 + # remove credentials-containing file RUN rm requirements.txt # components COPY ./src /pipelines/component/src + +# Set environment +ENV KFP_v2=$KFP_v2 + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 34ab9d34f..1b29e0cf7 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -27,7 +27,7 @@ clean:: set-versions:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml - sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/data_prep_toolkit_ray==[0-9].*/data_prep_toolkit_ray==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v1}",/' pyproject.toml sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml @@ -51,7 +51,7 @@ publish:: .check-env venv:: pyproject.toml .check-env ifeq ($(KFPv2), 1) - echo "Skipping test as KFPv2 is defined" + echo "Skipping as KFPv2 is defined" else @# Help: Create the virtual environment using pyproject.toml rm -rf venv @@ -59,6 +59,8 @@ else . 
${VENV_ACTIVATE}; \ cd ../../../data-processing-lib/python && make set-versions && cd -; \ pip install -e ../../../data-processing-lib/python; \ + cd ../../../data-processing-lib/ray && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/ray; \ cd ../python_apiserver_client && make set-versions && cd -; \ pip install -e ../python_apiserver_client; \ pip install -e .; \ diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 930d3df5d..679f7ed08 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "kfp==1.8.22", "ray==2.9.3", "requests", - "data-prep-toolkit==0.2.0.dev6", + "data_prep_toolkit_ray==0.2.0.dev6", "python_apiserver_client==0.1.0", ] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile index c7d302707..9d51ecb99 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -27,12 +27,12 @@ clean:: set-versions:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION_v2}'"/' pyproject.toml - sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/data_prep_toolkit_ray==[0-9].*/data_prep_toolkit_ray==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP_v2}",/' pyproject.toml sed -i.back 's/ray=[0-9].*/ray==${RAY}",/' pyproject.toml build:: set-versions venv -ifeq ($(KFPv2), 0) +ifneq ($(KFPv2), 1) echo "Skipping build as KFPv2 is not defined" else @# Help: Build the distribution for publishing to a pypi @@ -50,7 +50,7 @@ publish:: .check-env ${PYTHON} -m twine upload --verbose --non-interactive dist/* venv:: pyproject.toml .check-env -ifeq ($(KFPv2), 0) +ifneq ($(KFPv2), 1) echo "Skipping venv as KFPv2 is not defined" else @# Help: Create the virtual environment using pyproject.toml @@ -59,6 +59,8 @@ else . ${VENV_ACTIVATE}; \ cd ../../../data-processing-lib/python && make set-versions && cd -; \ pip install -e ../../../data-processing-lib/python; \ + cd ../../../data-processing-lib/ray && make set-versions && cd -; \ + pip install -e ../../../data-processing-lib/ray; \ cd ../python_apiserver_client && make set-versions && cd -; \ pip install -e ../python_apiserver_client; \ pip install -e .; \ @@ -66,7 +68,7 @@ else endif test:: venv -ifeq ($(KFPv2), 0) +ifneq ($(KFPv2), 1) echo "Skipping test as KFPv2 is not defined" else @# Help: Use the already-built virtual environment to run pytest on the test directory.
diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 5e8a2aec9..3e1607ee6 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ "kfp-kubernetes==1.2.0", "ray==2.9.3", "requests", - "data-prep-toolkit==0.2.0.dev6", + "data_prep_toolkit_ray==0.2.0.dev6", "python_apiserver_client", ] diff --git a/transforms/universal/filter/src/local_pipeline.py b/transforms/universal/filter/src/local_pipeline.py deleted file mode 100644 index 9a77e780b..000000000 --- a/transforms/universal/filter/src/local_pipeline.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import sys - -from data_processing.data_access import DataAccessLocal -sys.path.append('../../noop/src') -sys.path.append(os.path.dirname(os.path.abspath(__file__), '../..')) -from noop_transform import NOOPTransform - -from filter_transform import ( - FilterTransform, - filter_columns_to_drop_key, - filter_criteria_key, - filter_logical_operator_key, -) - -# create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) -local_conf = { - "input_folder": input_folder, - "output_folder": output_folder, -} - - -filter_criteria = [ - "docq_total_words > 100 AND docq_total_words < 200", - "ibmkenlm_docq_perplex_score < 230", -] -filter_logical_operator = "AND" -filter_columns_to_drop = ["extra", "cluster"] - -filter_params = { - filter_criteria_key: filter_criteria, - filter_columns_to_drop_key: filter_columns_to_drop, - filter_logical_operator_key: filter_logical_operator, -} - -if __name__ == "__main__": - # Here we show how to run outside of ray - # Filter transform needs a DataAccess to ready the domain list. - data_access = DataAccessLocal(local_conf) - # Create and configure the transform. - transform = FilterTransform(filter_params) - # Use the local data access to read a parquet table. 
- table = data_access.get_table(os.path.join(input_folder, "test1.parquet")) - print(f"input table has {table.num_rows} rows") - # Transform the table - table_list, metadata = transform.transform(table) - print(f"\noutput table has {table_list[0].num_rows} rows") - print(f"output metadata : {metadata}") - From 53f3d3b9bb561b0e86f4222c89577d58c12d1aa3 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Fri, 7 Jun 2024 19:42:43 +0300 Subject: [PATCH 63/64] add comments in Makefiles Signed-off-by: Alexey Roytman --- kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile | 3 +++ kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile | 3 +++ 2 files changed, 6 insertions(+) diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile index 1b29e0cf7..9cebae629 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/Makefile @@ -33,6 +33,7 @@ set-versions:: .check-env build:: set-versions venv ifeq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make build` in upper directories and KFPv2==1 echo "Skipping build as KFPv2 is defined" else @# Help: Build the distribution for publishing to a pypi @@ -51,6 +52,7 @@ publish:: .check-env venv:: pyproject.toml .check-env ifeq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make venv` in upper directories and KFPv2==1 echo "Skipping as KFPv2 is defined" else @# Help: Create the virtual environment using pyproject.toml @@ -69,6 +71,7 @@ endif test:: venv ifeq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make test` in upper directories and KFPv2==1 echo "Skipping test as KFPv2 is defined" else @# Help: Use the already-built virtual environment to run pytest on the test directory. diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile index 9d51ecb99..30921f37f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/Makefile @@ -33,6 +33,7 @@ set-versions:: .check-env build:: set-versions venv ifneq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make build` in upper directories and KFPv2 is not set echo "Skipping build as KFPv2 is not defined" else @# Help: Build the distribution for publishing to a pypi @@ -51,6 +52,7 @@ publish:: .check-env venv:: pyproject.toml .check-env ifneq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make venv` in upper directories and KFPv2 is not set echo "Skipping venv as KFPv2 is not defined" else @# Help: Create the virtual environment using pyproject.toml @@ -69,6 +71,7 @@ endif test:: venv ifneq ($(KFPv2), 1) + # we want to prevent execution of the rule, when we run `make test` in upper directories and KFPv2 is not set echo "Skipping test as KFPv2 is not defined" else @# Help: Use the already-built virtual environment to run pytest on the test directory. 
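[Editor's note on the fuzzy-dedup parameter computation touched by the two fdedup patches earlier in this series: that component picks the LSH banding parameters by minimizing a weighted sum of false-positive and false-negative probabilities. The sketch below is a standalone, illustrative rendering of that search, not the pipeline file itself; it assumes scipy is installed, and the threshold, permutation count, and 0.5/0.5 weights mirror the defaults visible in the deleted v2 workflow.]

# Minimal sketch of the LSH parameter search used by the fdedup
# compute-execution-params step: choose (num_buckets, bucket_length)
# minimizing weighted false-positive / false-negative probabilities.
from scipy.integrate import quad as integrate


def fuzzy_optimal_param(threshold: float, num_perm: int,
                        fp_weight: float, fn_weight: float) -> tuple[int, int]:
    def _false_positive(ths: float, b: int, r: int) -> float:
        # probability that a pair below the similarity threshold still collides
        prob = lambda s: 1 - (1 - s ** float(r)) ** float(b)
        area, _ = integrate(prob, 0.0, ths)
        return area

    def _false_negative(ths: float, b: int, r: int) -> float:
        # probability that a pair above the similarity threshold is missed
        prob = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b))
        area, _ = integrate(prob, ths, 1.0)
        return area

    min_error = float("inf")
    opt = (0, 0)
    for b in range(1, num_perm + 1):           # candidate number of buckets (bands)
        for r in range(1, num_perm // b + 1):  # candidate rows per bucket
            err = (fp_weight * _false_positive(threshold, b, r)
                   + fn_weight * _false_negative(threshold, b, r))
            if err < min_error:
                min_error, opt = err, (b, r)
    return opt


if __name__ == "__main__":
    # pipeline defaults: threshold 0.8, 64 permutations, equal weights
    print(fuzzy_optimal_param(0.8, 64, 0.5, 0.5))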
From 74647f83328b22662c35859fc0c8570612daa0a0 Mon Sep 17 00:00:00 2001 From: Alexey Roytman Date: Fri, 7 Jun 2024 19:57:05 +0300 Subject: [PATCH 64/64] add the non-unique ray cluster ID warning for KFPv2 Signed-off-by: Alexey Roytman --- .../code_quality/kfp_ray/code_quality_wf.py | 3 ++ transforms/code/malware/kfp_ray/malware_wf.py | 3 ++ .../kfp_ray/proglang_select_wf.py | 3 ++ .../universal/doc_id/kfp_ray/doc_id_wf.py | 3 ++ .../universal/ededup/kfp_ray/ededup_wf.py | 3 ++ .../universal/fdedup/kfp_ray/fdedup_wf.py | 3 ++ .../universal/filter/kfp_ray/filter_wf.py | 3 ++ .../noop/kfp_ray/noop_multiple_wf.py | 3 ++ transforms/universal/noop/kfp_ray/noop_wf.py | 2 ++ .../tokenization/kfp_ray/tokenization_wf.py | 31 +++++++++++++++++++ 10 files changed, 57 insertions(+) diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index e53880caf..b89f74083 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -77,6 +77,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 0fb03884d..d0e22643b 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -74,6 +74,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 0ed95ff64..ad256903f 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -75,6 +75,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 9bc58e9df..5cbb3e974 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -80,6 +80,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, 
please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index fdee012c0..6297470e9 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -49,6 +49,9 @@ compute_exec_params_op = comp.create_component_from_func( func=ededup_compute_execution_params, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index ea35975ed..c3e21a85b 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -44,6 +44,9 @@ compute_exec_params_op = dsl.component_decorator.component( func=fdedup_compute_execution_params, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func( diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 17bd22ab1..90d2b197b 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -76,6 +76,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index a36ff5ca5..67b4aead0 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -73,6 +73,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 2076ea05e..8748a60ca 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -71,6 +71,8 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version 
of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 4ab872ce8..f74d0a331 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -84,6 +84,9 @@ def compute_exec_params_func( compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!") run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
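[Editor's note on the patch above: the WARNING is printed because, in the KFP v2 path, the run id is generated with uuid when the pipeline is compiled rather than substituted per run as with KFP v1's dsl.RUN_ID_PLACEHOLDER, so the Ray cluster name derived from it is not guaranteed to be unique across Runs. Below is a minimal sketch of that pattern; the environment-variable guard, the trivial component function, and the image tag are illustrative assumptions, not the exact code in these files.]

import os
import uuid

import kfp.dsl as dsl

# Illustrative values -- the real pipelines define their own image and component.
base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"  # placeholder tag


def compute_exec_params_func(runtime_num_workers: int) -> dict:
    # stand-in for the real parameter-computation component
    return {"runtime_num_workers": runtime_num_workers}


# Assumed build-time switch; the repository uses its own KFPv2 flag.
if os.getenv("KFP_v2", "0") == "1":
    # KFP v2: the id is fixed at compilation time, so simultaneous Runs of the
    # same compiled pipeline share it -- hence the warning print added here.
    compute_exec_params_op = dsl.component_decorator.component(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = uuid.uuid4().hex
else:
    # KFP v1: the placeholder is replaced with the unique run id at execution
    # time, so each run gets its own Ray cluster name and no warning is needed.
    import kfp.components as comp

    compute_exec_params_op = comp.create_component_from_func(
        func=compute_exec_params_func, base_image=base_kfp_image
    )
    run_id = dsl.RUN_ID_PLACEHOLDER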