-
Notifications
You must be signed in to change notification settings - Fork 30
Support for build step-enabled pipelines #213
Changes from 13 commits
2f3b595
ad13b19
b4af4d9
935a513
898f2f3
764a741
b82e3fe
013acb1
97c85c7
0a7ed62
57e8d47
e783f22
27c0baa
bc18ca5
4c00a6f
e0cad24
d6b5e52
e4e38c7
2d36168
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
matplotlib>=3.1.2 | ||
opencv-python>=4.1.2.30 | ||
opencv-python==4.3.0.36 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,8 @@ | ||
import os | ||
import json | ||
import base64 | ||
import warnings | ||
from pathlib import Path | ||
|
||
from python_pachyderm.proto.pps import pps_pb2 as pps_proto | ||
from python_pachyderm.service import Service | ||
|
@@ -201,6 +204,87 @@ def create_pipeline(self, pipeline_name, transform, parallelism_spec=None, hasht | |
* `sidecar_resource_limits`: An optional `ResourceSpec` setting | ||
resource limits for the pipeline sidecar. | ||
""" | ||
|
||
# Support for build step-enabled pipelines. This is a python port of | ||
# the equivalent functionality in pachyderm core's | ||
# 'src/server/pps/cmds/cmds.go', and any changes made here likely have | ||
# to be reflected there as well. | ||
if transform.build.image or transform.build.language or transform.build.path: | ||
if spout: | ||
raise Exception("build step-enabled pipelines do not work with spouts") | ||
if not input: | ||
raise Exception("no `input` specified") | ||
if (not transform.build.language) and (not transform.build.image): | ||
raise Exception("must specify either a build `language` or `image`") | ||
if transform.build.language and transform.build.image: | ||
raise Exception("cannot specify both a build `language` and `image`") | ||
if any(pipeline_input_name(i) in ("build", "source") for i in pipeline_inputs(input)): | ||
raise Exception( | ||
"build step-enabled pipelines cannot have inputs with the name " | ||
+ "'build' or 'source', as they are reserved for build assets" | ||
) | ||
|
||
build_path = Path(transform.build.path or ".") | ||
if not build_path.exists(): | ||
raise Exception("build path {} does not exist".format(build_path)) | ||
if (build_path / ".pachignore").exists(): | ||
warnings.warn( | ||
"detected a '.pachignore' file, but it's unsupported by python_pachyderm -- use `pachctl` instead", | ||
RuntimeWarning | ||
) | ||
|
||
build_pipeline_name = "{}_build".format(pipeline_name) | ||
|
||
image = transform.build.image | ||
if not image: | ||
version = self.get_remote_version() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the idea that the standard build pipeline images follow pachd's versions? If so, we'll need to make it part of our release process to update the image tags. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that's correct. Builder images are already automatically tagged and pushed as part of the release process. |
||
version_str = "{}.{}.{}{}".format(version.major, version.minor, version.micro, version.additional) | ||
image = "pachyderm/{}-build:{}".format(transform.build.language, version_str) | ||
if not transform.image: | ||
transform.image = image | ||
|
||
def create_build_pipeline_input(name): | ||
return pps_proto.Input( | ||
pfs=pps_proto.PFSInput( | ||
name=name, | ||
glob="/", | ||
repo=build_pipeline_name, | ||
branch=name, | ||
) | ||
) | ||
|
||
self.create_repo(build_pipeline_name, update=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm probably just blind but is the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, this is just a general question about the feature, but why do we need to create the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think I just didn't quite understand how this feature worked (that the source and build output were two branches in the same repo), but this clarifies it — thanks! |
||
|
||
self._req( | ||
Service.PPS, "CreatePipeline", | ||
pipeline=pps_proto.Pipeline(name=build_pipeline_name), | ||
transform=pps_proto.Transform(image=image, cmd=["sh", "./build.sh"]), | ||
parallelism_spec=pps_proto.ParallelismSpec(constant=1), | ||
input=create_build_pipeline_input("source"), | ||
output_branch="build", | ||
update=update, | ||
) | ||
|
||
with self.put_file_client() as pfc: | ||
if update: | ||
pfc.delete_file((build_pipeline_name, "source"), "/") | ||
for root, _, filenames in os.walk(str(build_path)): | ||
for filename in filenames: | ||
source_filepath = os.path.join(root, filename) | ||
dest_filepath = os.path.join("/", os.path.relpath(source_filepath, start=str(build_path))) | ||
pfc.put_file_from_filepath((build_pipeline_name, "source"), dest_filepath, source_filepath) | ||
|
||
input = pps_proto.Input( | ||
cross=[ | ||
create_build_pipeline_input("source"), | ||
create_build_pipeline_input("build"), | ||
input, | ||
] | ||
) | ||
|
||
if not transform.cmd: | ||
transform.cmd[:] = ["sh", "/pfs/build/run.sh"] | ||
|
||
return self._req( | ||
Service.PPS, "CreatePipeline", | ||
pipeline=pps_proto.Pipeline(name=pipeline_name), | ||
|
@@ -602,3 +686,35 @@ def garbage_collect(self, memory_bytes=None): | |
precise garbage collection (at the cost of more memory usage). | ||
""" | ||
return self._req(Service.PPS, "GarbageCollect", memory_bytes=memory_bytes) | ||
|
||
|
||
def pipeline_input_name(i):
    """
    Returns the name of a pipeline input, or `None` if no name can be
    determined.

    For a PFS input this is the PFS input's own name. For a composite input
    (cross, join or union) it is the name of the first child input, resolved
    recursively.

    Params:

    * `i`: An `Input` protobuf message, any object exposing the same `pfs`,
      `cross`, `join` and `union` attributes, or `None`.
    """
    if i is None:
        return None

    # Reading an unset sub-message field on a protobuf message yields an empty
    # default message, never `None`. A plain `i.pfs is not None` check would
    # therefore succeed for every real `Input` and return an empty name for
    # composite inputs. Ask the message itself when it supports `HasField`,
    # and fall back to the `None` check for plain objects.
    has_field = getattr(i, "HasField", None)
    if has_field is not None:
        has_pfs = has_field("pfs")
    else:
        has_pfs = i.pfs is not None
    if has_pfs:
        return i.pfs.name

    # The first non-empty group of children decides the name; unset or empty
    # groups (`None`, `[]`, empty repeated fields) are skipped.
    for children in (i.cross, i.join, i.union):
        if children:
            return pipeline_input_name(children[0])
    return None
|
||
|
||
def pipeline_inputs(root):
    """
    Generator that walks an input tree and yields every input in it.

    Children are yielded before their parent, so `root` itself is always the
    last item produced. Nothing is yielded when `root` is `None`.

    Params:

    * `root`: An `Input` protobuf message, any object exposing the same
      `cross`, `join` and `union` attributes, or `None`.
    """
    if root is None:
        return

    # Visit every group of children. Protobuf repeated fields are never
    # `None` (an unset one is just empty), so an `is not None` / `elif` chain
    # would always stop at `cross` and never reach `join` or `union`.
    # `or ()` also tolerates plain objects that use `None` for "unset".
    for children in (root.cross, root.join, root.union):
        for child in children or ():
            yield from pipeline_inputs(child)
    yield root
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For posterity: #221