diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.terminal_v1__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.terminal_v1__0 new file mode 100644 index 0000000000..35445fd692 --- /dev/null +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.terminal_v1__0 @@ -0,0 +1,213 @@ +{ + "metaData": { + "name": "gcp.sample_staging_query.terminal_v1__0", + "team": "gcp", + "version": "0", + "outputNamespace": "sample_namespace", + "tableProperties": { + "sample_config_json": "{\"sample_key\": \"sample value\"}" + }, + "sourceFile": "staging_queries/gcp/sample_staging_query.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_sample_namespace_gcp_sample_staging_query_v1__0_with_offset_1\", \"spec\": \"sample_namespace.gcp_sample_staging_query_v1__0/ds={{ macros.ds_add(ds, 1) }}\"}, {\"name\": \"wf_sample_namespace_gcp_sample_staging_query_v2__0_with_offset_1\", \"spec\": \"sample_namespace.gcp_sample_staging_query_v2__0/ds={{ macros.ds_add(ds, 1) }}\"}, {\"name\": \"wf_sample_namespace_gcp_sample_staging_query_v3__0_with_offset_1\", \"spec\": \"sample_namespace.gcp_sample_staging_query_v3__0/ds={{ macros.ds_add(ds, 1) }}\"}, {\"name\": \"wf_sample_namespace_gcp_sample_staging_query_v4__0_with_offset_1\", \"spec\": \"sample_namespace.gcp_sample_staging_query_v4__0/ds={{ macros.ds_add(ds, 1) }}\"}, {\"name\": \"wf_sample_namespace_gcp_sample_staging_query_v5__0_with_offset_1\", \"spec\": \"sample_namespace.gcp_sample_staging_query_v5__0/ds={{ macros.ds_add(ds, 1) }}\"}, {\"name\": \"wf_sample_namespace_gcp_sample_staging_query_v6__0_with_offset_1\", \"spec\": \"sample_namespace.gcp_sample_staging_query_v6__0/ds={{ macros.ds_add(ds, 1) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "query": "\nSELECT\n *\nFROM data.gcp_training_set_v1_test__0\nWHERE ds BETWEEN '{ start_date }' AND '{ end_date }'\n", + "startPartition": "2020-03-01", + "tableDependencies": [ + { + "tableInfo": { + "table": "sample_namespace.gcp_sample_staging_query_v1__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + }, + { + "tableInfo": { + "table": "sample_namespace.gcp_sample_staging_query_v2__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + }, + { + "tableInfo": { + "table": "sample_namespace.gcp_sample_staging_query_v3__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + }, + { + "tableInfo": { + "table": "sample_namespace.gcp_sample_staging_query_v4__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + }, + { + "tableInfo": { + "table": "sample_namespace.gcp_sample_staging_query_v5__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + }, + { + "tableInfo": { + "table": "sample_namespace.gcp_sample_staging_query_v6__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v1__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v1__0 index ce8f38759b..1c06fb0c71 100644 --- a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v1__0 +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v1__0 @@ -3,7 +3,7 @@ "name": "gcp.sample_staging_query.v1__0", "team": "gcp", "version": "0", - "outputNamespace": "data", + "outputNamespace": "sample_namespace", "tableProperties": { "sample_config_json": "{\"sample_key\": \"sample value\"}" }, diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v2__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v2__0 new file mode 100644 index 0000000000..008c7df635 --- /dev/null +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v2__0 @@ -0,0 +1,143 @@ +{ + "metaData": { + "name": "gcp.sample_staging_query.v2__0", + "team": "gcp", + "version": "0", + "outputNamespace": "sample_namespace", + "tableProperties": { + "sample_config_json": "{\"sample_key\": \"sample value\"}" + }, + "sourceFile": "staging_queries/gcp/sample_staging_query.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_gcp_training_set_v1_test__0_with_offset_1\", \"spec\": \"data.gcp_training_set_v1_test__0/ds={{ macros.ds_add(ds, 1) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "query": "\nSELECT\n *\nFROM data.gcp_training_set_v1_test__0\nWHERE ds BETWEEN '{ start_date }' AND '{ end_date }'\n", + "startPartition": "2020-03-01", + "tableDependencies": [ + { + "tableInfo": { + "table": "data.gcp_training_set_v1_test__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v3__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v3__0 new file mode 100644 index 0000000000..3d418bc359 --- /dev/null +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v3__0 @@ -0,0 +1,143 @@ +{ + "metaData": { + "name": "gcp.sample_staging_query.v3__0", + "team": "gcp", + "version": "0", + "outputNamespace": "sample_namespace", + "tableProperties": { + "sample_config_json": "{\"sample_key\": \"sample value\"}" + }, + "sourceFile": "staging_queries/gcp/sample_staging_query.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_gcp_training_set_v1_test__0_with_offset_1\", \"spec\": \"data.gcp_training_set_v1_test__0/ds={{ macros.ds_add(ds, 1) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "query": "\nSELECT\n *\nFROM data.gcp_training_set_v1_test__0\nWHERE ds BETWEEN '{ start_date }' AND '{ end_date }'\n", + "startPartition": "2020-03-01", + "tableDependencies": [ + { + "tableInfo": { + "table": "data.gcp_training_set_v1_test__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v4__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v4__0 new file mode 100644 index 0000000000..52794f912c --- /dev/null +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v4__0 @@ -0,0 +1,143 @@ +{ + "metaData": { + "name": "gcp.sample_staging_query.v4__0", + "team": "gcp", + "version": "0", + "outputNamespace": "sample_namespace", + "tableProperties": { + "sample_config_json": "{\"sample_key\": \"sample value\"}" + }, + "sourceFile": "staging_queries/gcp/sample_staging_query.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_gcp_training_set_v1_test__0_with_offset_1\", \"spec\": \"data.gcp_training_set_v1_test__0/ds={{ macros.ds_add(ds, 1) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "query": "\nSELECT\n *\nFROM data.gcp_training_set_v1_test__0\nWHERE ds BETWEEN '{ start_date }' AND '{ end_date }'\n", + "startPartition": "2020-03-01", + "tableDependencies": [ + { + "tableInfo": { + "table": "data.gcp_training_set_v1_test__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v5__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v5__0 new file mode 100644 index 0000000000..f075f53ca0 --- /dev/null +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v5__0 @@ -0,0 +1,143 @@ +{ + "metaData": { + "name": "gcp.sample_staging_query.v5__0", + "team": "gcp", + "version": "0", + "outputNamespace": "sample_namespace", + "tableProperties": { + "sample_config_json": "{\"sample_key\": \"sample value\"}" + }, + "sourceFile": "staging_queries/gcp/sample_staging_query.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_gcp_training_set_v1_test__0_with_offset_1\", \"spec\": \"data.gcp_training_set_v1_test__0/ds={{ macros.ds_add(ds, 1) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "query": "\nSELECT\n *\nFROM data.gcp_training_set_v1_test__0\nWHERE ds BETWEEN '{ start_date }' AND '{ end_date }'\n", + "startPartition": "2020-03-01", + "tableDependencies": [ + { + "tableInfo": { + "table": "data.gcp_training_set_v1_test__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v6__0 b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v6__0 new file mode 100644 index 0000000000..b3fd934b4d --- /dev/null +++ b/api/python/test/canary/compiled/staging_queries/gcp/sample_staging_query.v6__0 @@ -0,0 +1,143 @@ +{ + "metaData": { + "name": "gcp.sample_staging_query.v6__0", + "team": "gcp", + "version": "0", + "outputNamespace": "sample_namespace", + "tableProperties": { + "sample_config_json": "{\"sample_key\": \"sample value\"}" + }, + "sourceFile": "staging_queries/gcp/sample_staging_query.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_gcp_training_set_v1_test__0_with_offset_1\", \"spec\": \"data.gcp_training_set_v1_test__0/ds={{ macros.ds_add(ds, 1) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": " -Ztasks=4", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true", + "ARTIFACT_PREFIX": "gs://zipline-artifacts-dev" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "query": "\nSELECT\n *\nFROM data.gcp_training_set_v1_test__0\nWHERE ds BETWEEN '{ start_date }' AND '{ end_date }'\n", + "startPartition": "2020-03-01", + "tableDependencies": [ + { + "tableInfo": { + "table": "data.gcp_training_set_v1_test__0", + "partitionColumn": "ds" + }, + "startOffset": { + "length": 1, + "timeUnit": 1 + }, + "endOffset": { + "length": 1, + "timeUnit": 1 + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/staging_queries/gcp/sample_staging_query.py b/api/python/test/canary/staging_queries/gcp/sample_staging_query.py index 9566b33d6c..74ca3ae4b6 100644 --- a/api/python/test/canary/staging_queries/gcp/sample_staging_query.py +++ b/api/python/test/canary/staging_queries/gcp/sample_staging_query.py @@ -1,7 +1,7 @@ from joins.gcp import training_set from ai.chronon.staging_query import StagingQuery, TableDependency -from ai.chronon.utils import get_join_output_table_name +from ai.chronon.utils import get_join_output_table_name, get_staging_query_output_table_name query = f""" SELECT @@ -10,14 +10,39 @@ WHERE ds BETWEEN '{{ start_date }}' AND '{{ end_date }}' """ -v1 = StagingQuery( +def get_staging_query(): + return StagingQuery( + query=query, + start_partition="2020-03-01", + name="sample_staging_query", + output_namespace="sample_namespace", + table_properties={"sample_config_json": """{"sample_key": "sample value"}"""}, + dependencies=[ + TableDependency(table=get_join_output_table_name(training_set.v1_test, True), partition_column="ds", offset=1) + ], + version=0, + ) + +v1 = get_staging_query() +v2 = get_staging_query() +v3 = get_staging_query() +v4 = get_staging_query() +v5 = get_staging_query() +v6 = get_staging_query() + +terminal_v1 = StagingQuery( query=query, start_partition="2020-03-01", - name="sample_staging_query", - output_namespace="data", table_properties={"sample_config_json": """{"sample_key": "sample value"}"""}, + name="terminal_staging_query", + output_namespace="sample_namespace", dependencies=[ - TableDependency(table=get_join_output_table_name(training_set.v1_test, True), partition_column="ds", offset=1) + TableDependency(table=get_staging_query_output_table_name(v1, True), partition_column="ds", offset=1), + TableDependency(table=get_staging_query_output_table_name(v2, True), partition_column="ds", offset=1), + TableDependency(table=get_staging_query_output_table_name(v3, True), partition_column="ds", offset=1), + TableDependency(table=get_staging_query_output_table_name(v4, True), partition_column="ds", offset=1), + TableDependency(table=get_staging_query_output_table_name(v5, True), partition_column="ds", offset=1), + TableDependency(table=get_staging_query_output_table_name(v6, True), partition_column="ds", offset=1), ], version=0, )