diff --git a/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/From_BigQuery/component.py b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/From_BigQuery/component.py
new file mode 100644
index 0000000..ff4cbce
--- /dev/null
+++ b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/From_BigQuery/component.py
@@ -0,0 +1,80 @@
+from typing import NamedTuple
+
+from kfp.components import create_component_from_func
+
+def create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(
+    data_uri: 'GoogleCloudBigQueryUri',
+    display_name: str = None,
+    encryption_spec_key_name: str = None,
+    project: str = None,
+    location: str = 'us-central1',
+) -> NamedTuple('Outputs', [
+    ('dataset_name', 'GoogleCloudVertexAiTabularDatasetName'),
+    ('dataset_dict', dict),
+]):
+    '''Creates Google Cloud Vertex AI Tabular Dataset from data stored in Google BigQuery.
+
+    Annotations:
+        author: Alexey Volkov
+
+    Args:
+        data_uri: Google Cloud BigQuery URI pointing to the data that should be imported into the dataset.
+            For example: "bq://project.dataset.table_name".
+            Only a single BigQuery URI is supported.
+        display_name: Display name for the AutoML Dataset.
+            Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
+        encryption_spec_key_name (Optional[str]):
+            Optional. The Cloud KMS resource identifier of the customer
+            managed encryption key used to protect a resource. Has the
+            form:
+            ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
+            The key needs to be in the same region as where the compute
+            resource is created.
+        project: Google Cloud project ID. If not set, the default one will be used.
+        location: Google Cloud region. AutoML Tables only supports us-central1.
+    Returns:
+        dataset_name: Dataset name (fully-qualified)
+        dataset_dict: Dataset object in JSON format
+    '''
+
+    import datetime
+    import json
+    import logging
+
+    from google.cloud import aiplatform
+    from google.protobuf import json_format
+
+    logging.getLogger().setLevel(logging.INFO)
+
+    if not display_name:
+        display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
+
+    aiplatform.init(
+        project=project,
+        location=location,
+        encryption_spec_key_name=encryption_spec_key_name,
+    )
+    dataset = aiplatform.TabularDataset.create(
+        display_name=display_name,
+        bq_source=data_uri,
+    )
+    (_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
+    dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
+    logging.info(f'Created dataset {dataset.name}.')
+    logging.info(f'Link: {dataset_web_url}')
+    dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
+    print(dataset_json)
+    return (dataset.resource_name, dataset_json)
+
+
+if __name__ == '__main__':
+    create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI_op = create_component_from_func(
+        create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI,
+        base_image='python:3.9',
+        packages_to_install=['google-cloud-aiplatform==1.1.1'],
+        output_component_file='component.yaml',
+        annotations={
+            "author": "Alexey Volkov",
+            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml",
+        },
+    )
diff --git a/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/From_BigQuery/component.yaml b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/From_BigQuery/component.yaml
new file mode 100644
index 0000000..3f511d0
--- /dev/null
+++ b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/From_BigQuery/component.yaml
@@ -0,0 +1,177 @@
+name: Create tabular dataset from BigQuery for Google Cloud Vertex AI
+description: Creates Google Cloud Vertex AI Tabular Dataset from data stored in Google
+  BigQuery.
+metadata:
+  annotations: {author: Alexey Volkov, canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_BigQuery/component.yaml'}
+inputs:
+- name: data_uri
+  type: GoogleCloudBigQueryUri
+  description: |-
+    Google Cloud BigQuery URI pointing to the data that should be imported into the dataset.
+    For example: "bq://project.dataset.table_name".
+    Only a single BigQuery URI is supported.
+- name: display_name
+  type: String
+  description: |-
+    Display name for the AutoML Dataset.
+    Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
+  optional: true
+- name: encryption_spec_key_name
+  type: String
+  description: |-
+    Optional. The Cloud KMS resource identifier of the customer
+    managed encryption key used to protect a resource. Has the
+    form:
+    ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
+    The key needs to be in the same region as where the compute
+    resource is created.
+  optional: true
+- {name: project, type: String, description: 'Google Cloud project ID. If not set,
+    the default one will be used.', optional: true}
+- {name: location, type: String, description: Google Cloud region.
+    AutoML Tables only supports us-central1., default: us-central1, optional: true}
+outputs:
+- {name: dataset_name, type: GoogleCloudVertexAiTabularDatasetName}
+- {name: dataset_dict, type: JsonObject}
+implementation:
+  container:
+    image: python:3.9
+    command:
+    - sh
+    - -c
+    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
+      'google-cloud-aiplatform==1.1.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3
+      -m pip install --quiet --no-warn-script-location 'google-cloud-aiplatform==1.1.1'
+      --user) && "$0" "$@"
+    - sh
+    - -ec
+    - |
+      program_path=$(mktemp)
+      printf "%s" "$0" > "$program_path"
+      python3 -u "$program_path" "$@"
+    - |
+      def create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(
+          data_uri,
+          display_name = None,
+          encryption_spec_key_name = None,
+          project = None,
+          location = 'us-central1',
+      ):
+          '''Creates Google Cloud Vertex AI Tabular Dataset from data stored in Google BigQuery.
+
+          Annotations:
+              author: Alexey Volkov
+
+          Args:
+              data_uri: Google Cloud BigQuery URI pointing to the data that should be imported into the dataset.
+                  For example: "bq://project.dataset.table_name".
+                  Only a single BigQuery URI is supported.
+              display_name: Display name for the AutoML Dataset.
+                  Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
+              encryption_spec_key_name (Optional[str]):
+                  Optional. The Cloud KMS resource identifier of the customer
+                  managed encryption key used to protect a resource. Has the
+                  form:
+                  ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
+                  The key needs to be in the same region as where the compute
+                  resource is created.
+              project: Google Cloud project ID. If not set, the default one will be used.
+              location: Google Cloud region. AutoML Tables only supports us-central1.
+          Returns:
+              dataset_name: Dataset name (fully-qualified)
+              dataset_dict: Dataset object in JSON format
+          '''
+
+          import datetime
+          import json
+          import logging
+
+          from google.cloud import aiplatform
+          from google.protobuf import json_format
+
+          logging.getLogger().setLevel(logging.INFO)
+
+          if not display_name:
+              display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
+
+          aiplatform.init(
+              project=project,
+              location=location,
+              encryption_spec_key_name=encryption_spec_key_name,
+          )
+          dataset = aiplatform.TabularDataset.create(
+              display_name=display_name,
+              bq_source=data_uri,
+          )
+          (_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
+          dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
+          logging.info(f'Created dataset {dataset.name}.')
+          logging.info(f'Link: {dataset_web_url}')
+          dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
+          print(dataset_json)
+          return (dataset.resource_name, dataset_json)
+
+      def _serialize_json(obj) -> str:
+          if isinstance(obj, str):
+              return obj
+          import json
+          def default_serializer(obj):
+              if hasattr(obj, 'to_struct'):
+                  return obj.to_struct()
+              else:
+                  raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
+          return json.dumps(obj, default=default_serializer, sort_keys=True)
+
+      import argparse
+      _parser = argparse.ArgumentParser(prog='Create tabular dataset from BigQuery for Google Cloud Vertex AI', description='Creates Google Cloud Vertex AI Tabular Dataset from data stored in Google BigQuery.')
+      _parser.add_argument("--data-uri", dest="data_uri", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--display-name", dest="display_name", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--encryption-spec-key-name", dest="encryption_spec_key_name", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--project", dest="project", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--location", dest="location", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
+      _parsed_args = vars(_parser.parse_args())
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = create_tabular_dataset_from_BigQuery_for_Google_Cloud_Vertex_AI(**_parsed_args)
+
+      _output_serializers = [
+          str,
+          _serialize_json,
+
+      ]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(_output_serializers[idx](_outputs[idx]))
+    args:
+    - --data-uri
+    - {inputValue: data_uri}
+    - if:
+        cond: {isPresent: display_name}
+        then:
+        - --display-name
+        - {inputValue: display_name}
+    - if:
+        cond: {isPresent: encryption_spec_key_name}
+        then:
+        - --encryption-spec-key-name
+        - {inputValue: encryption_spec_key_name}
+    - if:
+        cond: {isPresent: project}
+        then:
+        - --project
+        - {inputValue: project}
+    - if:
+        cond: {isPresent: location}
+        then:
+        - --location
+        - {inputValue: location}
+    - '----output-paths'
+    - {outputPath: dataset_name}
+    - {outputPath: dataset_dict}
diff --git a/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.py b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.py
new file mode 100644
index 0000000..7618a90
--- /dev/null
+++ b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.py
@@ -0,0 +1,88 @@
+from typing import NamedTuple
+
+from kfp.components import create_component_from_func
+
+def create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI(
+    data_uri: 'GoogleCloudStorageUri',  # data_type: "CSV"
+    display_name: str = None,
+    encryption_spec_key_name: str = None,
+    project: str = None,
+    location: str = 'us-central1',
+) -> NamedTuple('Outputs', [
+    ('dataset_name', 'GoogleCloudVertexAiTabularDatasetName'),
+    ('dataset_dict', dict),
+]):
+    '''Creates Google Cloud Vertex AI Tabular Dataset from CSV data stored in GCS.
+
+    Annotations:
+        author: Alexey Volkov
+
+    Args:
+        data_uri: Google Cloud Storage URI pointing to the data in CSV format that should be imported into the dataset.
+            The bucket must be a regional bucket in the us-central1 region.
+            The file name must have a (case-insensitive) '.CSV' file extension.
+        display_name: Display name for the AutoML Dataset.
+            Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
+        encryption_spec_key_name (Optional[str]):
+            Optional. The Cloud KMS resource identifier of the customer
+            managed encryption key used to protect a resource.
+            Has the form:
+            ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
+            The key needs to be in the same region as where the compute
+            resource is created.
+        project: Google Cloud project ID. If not set, the default one will be used.
+        location: Google Cloud region. AutoML Tables only supports us-central1.
+    Returns:
+        dataset_name: Dataset name (fully-qualified)
+        dataset_dict: Dataset object in JSON format
+    '''
+
+    import datetime
+    import json
+    import logging
+
+    from google.cloud import aiplatform
+    from google.protobuf import json_format
+
+    logging.getLogger().setLevel(logging.INFO)
+
+    if not display_name:
+        display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
+
+    # Hack to enable passing multiple URIs
+    # I could have created another component or added another input, but it seems to be too much hassle for now.
+    # An alternative would have been to accept comma-delimited or semicolon-delimited URLs.
+    if data_uri.startswith("["):
+        data_uris = json.loads(data_uri)
+    else:
+        data_uris = [data_uri]
+
+    aiplatform.init(
+        project=project,
+        location=location,
+        encryption_spec_key_name=encryption_spec_key_name,
+    )
+    dataset = aiplatform.TabularDataset.create(
+        display_name=display_name,
+        gcs_source=data_uris,
+    )
+    (_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
+    dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
+    logging.info(f'Created dataset {dataset.name}.')
+    logging.info(f'Link: {dataset_web_url}')
+    dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
+    print(dataset_json)
+    return (dataset.resource_name, dataset_json)
+
+
+if __name__ == '__main__':
+    create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI_op = create_component_from_func(
+        create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI,
+        base_image='python:3.9',
+        packages_to_install=['google-cloud-aiplatform==1.1.1'],
+        output_component_file='component.yaml',
+        annotations={
+            "author": "Alexey Volkov",
+            "canonical_location": "https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml",
+        },
+    )
diff --git a/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml
new file mode 100644
index 0000000..1f852ba
--- /dev/null
+++ b/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml
@@ -0,0 +1,185 @@
+name: Create tabular dataset from GCS for Google Cloud Vertex AI
+description: Creates Google Cloud Vertex AI Tabular Dataset from CSV data stored in
+  GCS.
+metadata:
+  annotations: {author: Alexey Volkov, canonical_location: 'https://raw.githubusercontent.com/Ark-kun/pipeline_components/master/components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml'}
+inputs:
+- name: data_uri
+  type: GoogleCloudStorageUri
+  description: |-
+    Google Cloud Storage URI pointing to the data in CSV format that should be imported into the dataset.
+    The bucket must be a regional bucket in the us-central1 region.
+    The file name must have a (case-insensitive) '.CSV' file extension.
+- name: display_name
+  type: String
+  description: |-
+    Display name for the AutoML Dataset.
+    Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
+  optional: true
+- name: encryption_spec_key_name
+  type: String
+  description: |-
+    Optional. The Cloud KMS resource identifier of the customer
+    managed encryption key used to protect a resource. Has the
+    form:
+    ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
+    The key needs to be in the same region as where the compute
+    resource is created.
+  optional: true
+- {name: project, type: String, description: 'Google Cloud project ID. If not set,
+    the default one will be used.', optional: true}
+- {name: location, type: String, description: Google Cloud region. AutoML Tables only
+    supports us-central1., default: us-central1, optional: true}
+outputs:
+- {name: dataset_name, type: GoogleCloudVertexAiTabularDatasetName}
+- {name: dataset_dict, type: JsonObject}
+implementation:
+  container:
+    image: python:3.9
+    command:
+    - sh
+    - -c
+    - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location
+      'google-cloud-aiplatform==1.1.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3
+      -m pip install --quiet --no-warn-script-location 'google-cloud-aiplatform==1.1.1'
+      --user) && "$0" "$@"
+    - sh
+    - -ec
+    - |
+      program_path=$(mktemp)
+      printf "%s" "$0" > "$program_path"
+      python3 -u "$program_path" "$@"
+    - |
+      def create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI(
+          data_uri,  # data_type: "CSV"
+          display_name = None,
+          encryption_spec_key_name = None,
+          project = None,
+          location = 'us-central1',
+      ):
+          '''Creates Google Cloud Vertex AI Tabular Dataset from CSV data stored in GCS.
+
+          Annotations:
+              author: Alexey Volkov
+
+          Args:
+              data_uri: Google Cloud Storage URI pointing to the data in CSV format that should be imported into the dataset.
+                  The bucket must be a regional bucket in the us-central1 region.
+                  The file name must have a (case-insensitive) '.CSV' file extension.
+              display_name: Display name for the AutoML Dataset.
+                  Allowed characters are ASCII Latin letters A-Z and a-z, an underscore (_), and ASCII digits 0-9.
+              encryption_spec_key_name (Optional[str]):
+                  Optional. The Cloud KMS resource identifier of the customer
+                  managed encryption key used to protect a resource. Has the
+                  form:
+                  ``projects/my-project/locations/my-region/keyRings/my-kr/cryptoKeys/my-key``.
+                  The key needs to be in the same region as where the compute
+                  resource is created.
+              project: Google Cloud project ID. If not set, the default one will be used.
+              location: Google Cloud region. AutoML Tables only supports us-central1.
+          Returns:
+              dataset_name: Dataset name (fully-qualified)
+              dataset_dict: Dataset object in JSON format
+          '''
+
+          import datetime
+          import json
+          import logging
+
+          from google.cloud import aiplatform
+          from google.protobuf import json_format
+
+          logging.getLogger().setLevel(logging.INFO)
+
+          if not display_name:
+              display_name = 'Dataset_' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S")
+
+          # Hack to enable passing multiple URIs
+          # I could have created another component or added another input, but it seems to be too much hassle for now.
+          # An alternative would have been to accept comma-delimited or semicolon-delimited URLs.
+          if data_uri.startswith("["):
+              data_uris = json.loads(data_uri)
+          else:
+              data_uris = [data_uri]
+
+          aiplatform.init(
+              project=project,
+              location=location,
+              encryption_spec_key_name=encryption_spec_key_name,
+          )
+          dataset = aiplatform.TabularDataset.create(
+              display_name=display_name,
+              gcs_source=data_uris,
+          )
+          (_, dataset_project, _, dataset_location, _, dataset_id) = dataset.resource_name.split('/')
+          dataset_web_url = f'https://console.cloud.google.com/vertex-ai/locations/{dataset_location}/datasets/{dataset_id}/analyze?project={dataset_project}'
+          logging.info(f'Created dataset {dataset.name}.')
+          logging.info(f'Link: {dataset_web_url}')
+          dataset_json = json_format.MessageToJson(dataset._gca_resource._pb)
+          print(dataset_json)
+          return (dataset.resource_name, dataset_json)
+
+      def _serialize_json(obj) -> str:
+          if isinstance(obj, str):
+              return obj
+          import json
+          def default_serializer(obj):
+              if hasattr(obj, 'to_struct'):
+                  return obj.to_struct()
+              else:
+                  raise TypeError("Object of type '%s' is not JSON serializable and does not have .to_struct() method." % obj.__class__.__name__)
+          return json.dumps(obj, default=default_serializer, sort_keys=True)
+
+      import argparse
+      _parser = argparse.ArgumentParser(prog='Create tabular dataset from GCS for Google Cloud Vertex AI', description='Creates Google Cloud Vertex AI Tabular Dataset from CSV data stored in GCS.')
+      _parser.add_argument("--data-uri", dest="data_uri", type=str, required=True, default=argparse.SUPPRESS)
+      _parser.add_argument("--display-name", dest="display_name", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--encryption-spec-key-name", dest="encryption_spec_key_name", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--project", dest="project", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("--location", dest="location", type=str, required=False, default=argparse.SUPPRESS)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
+      _parsed_args = vars(_parser.parse_args())
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = create_tabular_dataset_from_GCS_for_Google_Cloud_Vertex_AI(**_parsed_args)
+
+      _output_serializers = [
+          str,
+          _serialize_json,
+
+      ]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(_output_serializers[idx](_outputs[idx]))
+    args:
+    - --data-uri
+    - {inputValue: data_uri}
+    - if:
+        cond: {isPresent: display_name}
+        then:
+        - --display-name
+        - {inputValue: display_name}
+    - if:
+        cond: {isPresent: encryption_spec_key_name}
+        then:
+        - --encryption-spec-key-name
+        - {inputValue: encryption_spec_key_name}
+    - if:
+        cond: {isPresent: project}
+        then:
+        - --project
+        - {inputValue: project}
+    - if:
+        cond: {isPresent: location}
+        then:
+        - --location
+        - {inputValue: location}
+    - '----output-paths'
+    - {outputPath: dataset_name}
+    - {outputPath: dataset_dict}
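
For reference, a minimal usage sketch (not part of the diff) of how the generated component.yaml files might be wired into a KFP v1 pipeline. The local file path, GCS URIs, and project ID are placeholders; the JSON-encoded list form of data_uri relies on the multiple-URI handling implemented in the GCS component above.

```python
# Minimal sketch, assuming the KFP v1 SDK and the component.yaml files from this diff.
# All URIs, file paths, and the project ID below are placeholders.
import json

import kfp
from kfp import components

# Load the GCS variant of the dataset-creation component from its generated YAML.
create_dataset_from_gcs_op = components.load_component_from_file(
    'components/google-cloud/Vertex_AI/AutoML/Tables/Create_dataset/from_GCS/component.yaml'
)

@kfp.dsl.pipeline(name='create-vertex-tabular-dataset')
def create_dataset_pipeline():
    # Single CSV file in GCS.
    single_uri_task = create_dataset_from_gcs_op(
        data_uri='gs://my-bucket/datasets/training.csv',  # placeholder URI
        project='my-project',                             # placeholder project ID
    )
    # Multiple CSV files, passed as a JSON-encoded list of URIs
    # (the component parses strings that start with '[' via json.loads).
    multi_uri_task = create_dataset_from_gcs_op(
        data_uri=json.dumps([
            'gs://my-bucket/datasets/part-1.csv',  # placeholder URI
            'gs://my-bucket/datasets/part-2.csv',  # placeholder URI
        ]),
        project='my-project',
    )
    # Each task exposes the created dataset's resource name as .outputs['dataset_name'].

if __name__ == '__main__':
    kfp.compiler.Compiler().compile(create_dataset_pipeline, 'create_dataset_pipeline.yaml')
```

The BigQuery variant would be loaded the same way from its component.yaml, passing a single BigQuery URI (for example "bq://project.dataset.table_name") as data_uri.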