diff --git a/.gitignore b/.gitignore index 0c1eb2f7ff..7acdd8b8c4 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,7 @@ **/.vscode/ **/__pycache__/ **/.DS_Store -api/python/test/canary/compiled/ -api/python/test/canary/production/ +api/python/ai/chronon/resources/gcp/compiled/ api/python/test/sample/compiled/ api/python/test/sample/production/ api/python/ai/chronon/api/ diff --git a/api/BUILD.bazel b/api/BUILD.bazel index 93df3bc3cd..7d38a654a7 100644 --- a/api/BUILD.bazel +++ b/api/BUILD.bazel @@ -61,6 +61,7 @@ scala_library( scala_test_suite( name = "tests", srcs = glob(["src/test/**/*.scala"]), + data = ["//spark/src/test/resources:test-resources"], visibility = ["//visibility:public"], deps = test_deps + [":test-lib"], ) diff --git a/api/python/test/canary/compiled/group_bys/aws/purchases.v1_dev b/api/python/test/canary/compiled/group_bys/aws/purchases.v1_dev new file mode 100644 index 0000000000..734b3099a7 --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/aws/purchases.v1_dev @@ -0,0 +1,123 @@ +{ + "metaData": { + "name": "aws.purchases.v1_dev", + "team": "aws", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/aws/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_with_offset_0\", \"spec\": \"data.purchases/ds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "aws" + }, + "modeEnvironments": {} + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds" + }, + "modeConfigs": {} + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/aws/purchases.v1_test b/api/python/test/canary/compiled/group_bys/aws/purchases.v1_test new file mode 100644 index 0000000000..09719737af --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/aws/purchases.v1_test @@ -0,0 +1,123 @@ +{ + "metaData": { + "name": "aws.purchases.v1_test", + "team": "aws", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/aws/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_with_offset_0\", \"spec\": \"data.purchases/ds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "aws" + }, + "modeEnvironments": {} + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds" + }, + "modeConfigs": {} + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/item_event_canary.actions_pubsub_v2 b/api/python/test/canary/compiled/group_bys/gcp/item_event_canary.actions_pubsub_v2 new file mode 100644 index 0000000000..8b0df09bd0 --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/item_event_canary.actions_pubsub_v2 @@ -0,0 +1,191 @@ +{ + "metaData": { + "name": "gcp.item_event_canary.actions_pubsub_v2", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/item_event_canary.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_item_events_parquet_compat_with_offset_0\", \"spec\": \"data.item_events_parquet_compat/_DATE={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "_DATE", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "_DATE", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.item_events_parquet_compat", + "topic": "pubsub://test-item-event-data/serde=pubsub_schema/project=canary-443022/schemaId=item-event/tasks=4/subscription=test-item-event-data-sub", + "query": { + "selects": { + "listing_id": "EXPLODE(TRANSFORM(SPLIT(COALESCE(attributes['sold_listing_ids'], attributes['listing_id']), ','), e -> CAST(e AS LONG)))", + "add_cart": "IF(event_type = 'backend_add_to_cart', 1, 0)", + "view": "IF(event_type = 'view_listing', 1, 0)", + "purchase": "IF(event_type = 'backend_cart_payment', 1, 0)", + "favorite": "IF(event_type = 'backend_favorite_item2', 1, 0)" + }, + "wheres": [ + "event_type in ('backend_add_to_cart', 'view_listing', 'backend_cart_payment', 'backend_favorite_item2')" + ], + "timeColumn": "timestamp" + } + } + } + ], + "keyColumns": [ + "listing_id" + ], + "aggregations": [ + { + "inputColumn": "add_cart", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 1, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "view", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 1, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 7, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "favorite", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 1, + "timeUnit": 1 + } + ] + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/item_event_canary.actions_v1 b/api/python/test/canary/compiled/group_bys/gcp/item_event_canary.actions_v1 new file mode 100644 index 0000000000..b3f644e3f7 --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/item_event_canary.actions_v1 @@ -0,0 +1,191 @@ +{ + "metaData": { + "name": "gcp.item_event_canary.actions_v1", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/item_event_canary.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_item_events_parquet_compat_with_offset_0\", \"spec\": \"data.item_events_parquet_compat/_DATE={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "_DATE", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "_DATE", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.item_events_parquet_compat", + "topic": "kafka://test-item-event-data/provider_class=ai.chronon.flink.deser.MockCustomSchemaProvider/schema_name=item_event/security.protocol=SASL_SSL/sasl.mechanism=OAUTHBEARER/sasl.login.callback.handler.class=com.google.cloud.hosted.kafka.auth.GcpLoginCallbackHandler/sasl.jaas.config=org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule required;", + "query": { + "selects": { + "listing_id": "EXPLODE(TRANSFORM(SPLIT(COALESCE(attributes['sold_listing_ids'], attributes['listing_id']), ','), e -> CAST(e AS LONG)))", + "add_cart": "IF(event_type = 'backend_add_to_cart', 1, 0)", + "view": "IF(event_type = 'view_listing', 1, 0)", + "purchase": "IF(event_type = 'backend_cart_payment', 1, 0)", + "favorite": "IF(event_type = 'backend_favorite_item2', 1, 0)" + }, + "wheres": [ + "event_type in ('backend_add_to_cart', 'view_listing', 'backend_cart_payment', 'backend_favorite_item2')" + ], + "timeColumn": "timestamp" + } + } + } + ], + "keyColumns": [ + "listing_id" + ], + "aggregations": [ + { + "inputColumn": "add_cart", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 1, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "view", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 1, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 7, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "favorite", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 1, + "timeUnit": 1 + } + ] + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_dev b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_dev new file mode 100644 index 0000000000..df036c707e --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_dev @@ -0,0 +1,205 @@ +{ + "metaData": { + "name": "gcp.purchases.v1_dev", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_with_offset_0\", \"spec\": \"data.purchases/ds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_dev_notds b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_dev_notds new file mode 100644 index 0000000000..8ec6f9d9ff --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_dev_notds @@ -0,0 +1,206 @@ +{ + "metaData": { + "name": "gcp.purchases.v1_dev_notds", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_notds_with_offset_0\", \"spec\": \"data.purchases_notds/notds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases_notds", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts", + "partitionColumn": "notds" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_test b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_test new file mode 100644 index 0000000000..d15cc1996a --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_test @@ -0,0 +1,205 @@ +{ + "metaData": { + "name": "gcp.purchases.v1_test", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_with_offset_0\", \"spec\": \"data.purchases/ds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_test_notds b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_test_notds new file mode 100644 index 0000000000..93d8c888f1 --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_test_notds @@ -0,0 +1,206 @@ +{ + "metaData": { + "name": "gcp.purchases.v1_test_notds", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_notds_with_offset_0\", \"spec\": \"data.purchases_notds/notds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases_notds", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts", + "partitionColumn": "notds" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_view_dev b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_view_dev new file mode 100644 index 0000000000..7a620b4390 --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_view_dev @@ -0,0 +1,205 @@ +{ + "metaData": { + "name": "gcp.purchases.v1_view_dev", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_native_view_with_offset_0\", \"spec\": \"data.purchases_native_view/ds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases_native_view", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_view_test b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_view_test new file mode 100644 index 0000000000..e9ae11b853 --- /dev/null +++ b/api/python/test/canary/compiled/group_bys/gcp/purchases.v1_view_test @@ -0,0 +1,205 @@ +{ + "metaData": { + "name": "gcp.purchases.v1_view_test", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "sourceFile": "group_bys/gcp/purchases.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_purchases_native_view_with_offset_0\", \"spec\": \"data.purchases_native_view/ds={{ macros.ds_add(ds, 0) }}\"}]}", + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases_native_view", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/joins/gcp/training_set.v1_dev b/api/python/test/canary/compiled/joins/gcp/training_set.v1_dev new file mode 100644 index 0000000000..d351b2c85d --- /dev/null +++ b/api/python/test/canary/compiled/joins/gcp/training_set.v1_dev @@ -0,0 +1,234 @@ +{ + "metaData": { + "name": "gcp.training_set.v1_dev", + "team": "gcp", + "outputNamespace": "data", + "online": 0, + "production": 0, + "sourceFile": "joins/gcp/training_set.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_checkouts_with_offset_0\", \"spec\": \"data.checkouts/ds={{ macros.ds_add(ds, 0) }}\"}, {\"name\": \"wf_data_purchases_with_offset_0\", \"spec\": \"data.purchases/ds={{ macros.ds_add(ds, 0) }}\"}], \"label_join\": false}", + "samplePercent": 100.0, + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "left": { + "events": { + "table": "data.checkouts", + "query": { + "selects": { + "user_id": "user_id", + "ts": "ts" + }, + "timeColumn": "ts" + } + } + }, + "joinParts": [ + { + "groupBy": { + "metaData": { + "name": "gcp.purchases.v1_dev", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "executionInfo": { + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/joins/gcp/training_set.v1_dev_notds b/api/python/test/canary/compiled/joins/gcp/training_set.v1_dev_notds new file mode 100644 index 0000000000..479df679db --- /dev/null +++ b/api/python/test/canary/compiled/joins/gcp/training_set.v1_dev_notds @@ -0,0 +1,236 @@ +{ + "metaData": { + "name": "gcp.training_set.v1_dev_notds", + "team": "gcp", + "outputNamespace": "data", + "online": 0, + "production": 0, + "sourceFile": "joins/gcp/training_set.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_checkouts_notds_with_offset_0\", \"spec\": \"data.checkouts_notds/notds={{ macros.ds_add(ds, 0) }}\"}, {\"name\": \"wf_data_purchases_notds_with_offset_0\", \"spec\": \"data.purchases_notds/notds={{ macros.ds_add(ds, 0) }}\"}], \"label_join\": false}", + "samplePercent": 100.0, + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "left": { + "events": { + "table": "data.checkouts_notds", + "query": { + "selects": { + "user_id": "user_id", + "ts": "ts" + }, + "timeColumn": "ts", + "partitionColumn": "notds" + } + } + }, + "joinParts": [ + { + "groupBy": { + "metaData": { + "name": "gcp.purchases.v1_dev_notds", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "executionInfo": { + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases_notds", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts", + "partitionColumn": "notds" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/joins/gcp/training_set.v1_test b/api/python/test/canary/compiled/joins/gcp/training_set.v1_test new file mode 100644 index 0000000000..a6cd9901f8 --- /dev/null +++ b/api/python/test/canary/compiled/joins/gcp/training_set.v1_test @@ -0,0 +1,234 @@ +{ + "metaData": { + "name": "gcp.training_set.v1_test", + "team": "gcp", + "outputNamespace": "data", + "online": 0, + "production": 0, + "sourceFile": "joins/gcp/training_set.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_checkouts_with_offset_0\", \"spec\": \"data.checkouts/ds={{ macros.ds_add(ds, 0) }}\"}, {\"name\": \"wf_data_purchases_with_offset_0\", \"spec\": \"data.purchases/ds={{ macros.ds_add(ds, 0) }}\"}], \"label_join\": false}", + "samplePercent": 100.0, + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "left": { + "events": { + "table": "data.checkouts", + "query": { + "selects": { + "user_id": "user_id", + "ts": "ts" + }, + "timeColumn": "ts" + } + } + }, + "joinParts": [ + { + "groupBy": { + "metaData": { + "name": "gcp.purchases.v1_test", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "executionInfo": { + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/joins/gcp/training_set.v1_test_notds b/api/python/test/canary/compiled/joins/gcp/training_set.v1_test_notds new file mode 100644 index 0000000000..07df2a3d44 --- /dev/null +++ b/api/python/test/canary/compiled/joins/gcp/training_set.v1_test_notds @@ -0,0 +1,236 @@ +{ + "metaData": { + "name": "gcp.training_set.v1_test_notds", + "team": "gcp", + "outputNamespace": "data", + "online": 0, + "production": 0, + "sourceFile": "joins/gcp/training_set.py", + "customJson": "{\"airflowDependencies\": [{\"name\": \"wf_data_checkouts_notds_with_offset_0\", \"spec\": \"data.checkouts_notds/notds={{ macros.ds_add(ds, 0) }}\"}, {\"name\": \"wf_data_purchases_notds_with_offset_0\", \"spec\": \"data.purchases_notds/notds={{ macros.ds_add(ds, 0) }}\"}], \"label_join\": false}", + "samplePercent": 100.0, + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + }, + "scheduleCron": "@daily" + } + }, + "left": { + "events": { + "table": "data.checkouts_notds", + "query": { + "selects": { + "user_id": "user_id", + "ts": "ts" + }, + "timeColumn": "ts", + "partitionColumn": "notds" + } + } + }, + "joinParts": [ + { + "groupBy": { + "metaData": { + "name": "gcp.purchases.v1_test_notds", + "team": "gcp", + "outputNamespace": "data", + "online": 1, + "executionInfo": { + "scheduleCron": "@daily", + "historicalBackfill": 0 + } + }, + "sources": [ + { + "events": { + "table": "data.purchases_notds", + "query": { + "selects": { + "user_id": "user_id", + "purchase_price": "purchase_price" + }, + "timeColumn": "ts", + "partitionColumn": "notds" + } + } + } + ], + "keyColumns": [ + "user_id" + ], + "aggregations": [ + { + "inputColumn": "purchase_price", + "operation": 7, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 6, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 8, + "argMap": {}, + "windows": [ + { + "length": 3, + "timeUnit": 1 + }, + { + "length": 14, + "timeUnit": 1 + }, + { + "length": 30, + "timeUnit": 1 + } + ] + }, + { + "inputColumn": "purchase_price", + "operation": 13, + "argMap": { + "k": "10" + } + } + ], + "backfillStartDate": "2023-11-01" + } + } + ] +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/teams_metadata/aws/aws_team_metadata b/api/python/test/canary/compiled/teams_metadata/aws/aws_team_metadata new file mode 100644 index 0000000000..42726f7eca --- /dev/null +++ b/api/python/test/canary/compiled/teams_metadata/aws/aws_team_metadata @@ -0,0 +1,29 @@ +{ + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "aws" + }, + "modeEnvironments": {} + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds" + }, + "modeConfigs": {} + } + } +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/teams_metadata/default/default_team_metadata b/api/python/test/canary/compiled/teams_metadata/default/default_team_metadata new file mode 100644 index 0000000000..96ad3c01a4 --- /dev/null +++ b/api/python/test/canary/compiled/teams_metadata/default/default_team_metadata @@ -0,0 +1,28 @@ +{ + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state" + }, + "modeEnvironments": {} + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds" + }, + "modeConfigs": {} + } + } +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/teams_metadata/gcp/gcp_team_metadata b/api/python/test/canary/compiled/teams_metadata/gcp/gcp_team_metadata new file mode 100644 index 0000000000..8f6ff66603 --- /dev/null +++ b/api/python/test/canary/compiled/teams_metadata/gcp/gcp_team_metadata @@ -0,0 +1,111 @@ +{ + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-transient-upload-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "zipline-canary-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "CLOUD_PROVIDER": "gcp", + "ENABLE_PUBSUB": "true" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10" + }, + "modeConfigs": { + "backfill": { + "spark.chronon.partition.column": "ds", + "spark.chronon.cloud_provider": "gcp", + "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider", + "spark.chronon.partition.format": "yyyy-MM-dd", + "spark.chronon.table.gcs.temporary_gcs_bucket": "zipline-warehouse-canary", + "spark.chronon.table.gcs.connector_output_dataset": "data", + "spark.chronon.table.gcs.connector_output_project": "canary-443022", + "spark.chronon.table_write.prefix": "gs://zipline-warehouse-canary/data/tables/", + "spark.chronon.table_write.format": "iceberg", + "spark.sql.catalog.spark_catalog.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.spark_catalog.gcp_location": "us-central1", + "spark.sql.catalog.spark_catalog.gcp_project": "canary-443022", + "spark.sql.catalog.spark_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.spark_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.catalog.default_iceberg.warehouse": "gs://zipline-warehouse-canary/data/tables/", + "spark.sql.catalog.default_iceberg.gcp_location": "us-central1", + "spark.sql.catalog.default_iceberg.gcp_project": "canary-443022", + "spark.sql.catalog.default_iceberg.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog", + "spark.sql.catalog.default_iceberg.io-impl": "org.apache.iceberg.io.ResolvingFileIO", + "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false", + "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator", + "spark.chronon.coalesce.factor": "10", + "spark.default.parallelism": "10", + "spark.sql.shuffle.partitions": "10", + "spark.chronon.backfill_cloud_provider": "gcp" + } + } + }, + "clusterConf": { + "common": {}, + "modeClusterConfigs": { + "upload": { + "dataproc.config": "{\"gceClusterConfig\": {\"subnetworkUri\": \"default\", \"serviceAccount\": \"dataproc@canary-443022.iam.gserviceaccount.com\", \"serviceAccountScopes\": [\"https://www.googleapis.com/auth/cloud-platform\", \"https://www.googleapis.com/auth/cloud.useraccounts.readonly\", \"https://www.googleapis.com/auth/devstorage.read_write\", \"https://www.googleapis.com/auth/logging.write\"], \"metadata\": {\"hive-version\": \"3.1.2\", \"SPARK_BQ_CONNECTOR_URL\": \"gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar\", \"artifact_prefix\": \"gs://zipline-artifacts-canary\"}, \"tags\": []}, \"masterConfig\": {\"numInstances\": 1, \"machineTypeUri\": \"n2-highmem-8\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 1024}}, \"workerConfig\": {\"numInstances\": 2, \"machineTypeUri\": \"n2-highmem-4\", \"diskConfig\": {\"bootDiskType\": \"pd-standard\", \"bootDiskSizeGb\": 64, \"numLocalSsds\": 2}}, \"softwareConfig\": {\"imageVersion\": \"2.2.50-debian12\", \"optionalComponents\": [\"FLINK\", \"JUPYTER\"], \"properties\": {}}, \"initializationActions\": [{\"executable_file\": \"gs://zipline-artifacts-canary/scripts/copy_java_security.sh\"}], \"endpointConfig\": {\"enableHttpPortAccess\": true}, \"lifecycleConfig\": {\"idleDeleteTtl\": \"7200s\"}}" + } + } + } + } +} \ No newline at end of file diff --git a/api/python/test/canary/compiled/teams_metadata/test/test_team_metadata b/api/python/test/canary/compiled/teams_metadata/test/test_team_metadata new file mode 100644 index 0000000000..dcae8e37ab --- /dev/null +++ b/api/python/test/canary/compiled/teams_metadata/test/test_team_metadata @@ -0,0 +1,66 @@ +{ + "executionInfo": { + "env": { + "common": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "test-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state" + }, + "modeEnvironments": { + "upload": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "test-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "PARALLELISM": "2", + "MAX_EXECUTORS": "4" + }, + "backfill": { + "VERSION": "latest", + "JOB_MODE": "local[*]", + "HADOOP_DIR": "[STREAMING-TODO]/path/to/folder/containing", + "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class", + "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host= -Zkv-port=", + "PARTITION_COLUMN": "ds", + "PARTITION_FORMAT": "yyyy-MM-dd", + "CUSTOMER_ID": "dev", + "GCP_PROJECT_ID": "canary-443022", + "GCP_REGION": "us-central1", + "GCP_DATAPROC_CLUSTER_NAME": "zipline-canary-cluster", + "GCP_BIGTABLE_INSTANCE_ID": "test-instance", + "FLINK_STATE_URI": "gs://zipline-warehouse-canary/flink-state", + "EXECUTOR_CORES": "2", + "DRIVER_MEMORY": "15G", + "EXECUTOR_MEMORY": "4G", + "PARALLELISM": "4", + "MAX_EXECUTORS": "4" + } + } + }, + "conf": { + "common": { + "spark.chronon.partition.column": "ds" + }, + "modeConfigs": {} + } + } +} \ No newline at end of file diff --git a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala index a0fed6eeed..07c57a50de 100644 --- a/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala +++ b/api/src/main/scala/ai/chronon/api/ThriftJsonCodec.scala @@ -105,7 +105,7 @@ object ThriftJsonCodec { if (check) { val inputNode: JsonNode = mapper.readTree(jsonStr) val reSerializedInput: JsonNode = mapper.readTree(toJsonStr(obj)) - assert( + require( inputNode.equals(reSerializedInput), message = s"""Parsed Json object isn't reversible. Original JSON String: $jsonStr diff --git a/api/src/main/scala/ai/chronon/api/planner/MonolithJoinPlanner.scala b/api/src/main/scala/ai/chronon/api/planner/MonolithJoinPlanner.scala index 29af590fdb..5b031ff2ae 100644 --- a/api/src/main/scala/ai/chronon/api/planner/MonolithJoinPlanner.scala +++ b/api/src/main/scala/ai/chronon/api/planner/MonolithJoinPlanner.scala @@ -5,7 +5,7 @@ import ai.chronon.planner import scala.collection.JavaConverters._ -class MonolithJoinPlanner(join: Join)(implicit outputPartitionSpec: PartitionSpec) +case class MonolithJoinPlanner(join: Join)(implicit outputPartitionSpec: PartitionSpec) extends Planner[Join](join)(outputPartitionSpec) { private def effectiveStepDays: Int = { @@ -26,9 +26,13 @@ class MonolithJoinPlanner(join: Join)(implicit outputPartitionSpec: PartitionSpe val node = new planner.MonolithJoinNode().setJoin(join) val finalNode = toNode(metaData, _.setMonolithJoin(node), join) - val terminalNodeNames = Map( - planner.Mode.BACKFILL -> finalNode.metaData.name - ).asJava + val terminalNodeNames: java.util.Map[planner.Mode, String] = ( + for { + fin <- Option(finalNode) + metaData <- Option(fin.metaData) + name <- Option(metaData.name) + } yield Map(planner.Mode.BACKFILL -> name) + ).getOrElse(Map.empty).asJava confPlan.setNodes(List(finalNode).asJava).setTerminalNodeNames(terminalNodeNames) } } diff --git a/api/src/test/scala/ai/chronon/api/test/planner/PlannerTest.scala b/api/src/test/scala/ai/chronon/api/test/planner/PlannerTest.scala new file mode 100644 index 0000000000..c7b5420206 --- /dev/null +++ b/api/src/test/scala/ai/chronon/api/test/planner/PlannerTest.scala @@ -0,0 +1,34 @@ +package ai.chronon.api.test.planner + +import ai.chronon.api.{Join, PartitionSpec} +import ai.chronon.api.planner.LocalRunner +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import ai.chronon.api.planner.MonolithJoinPlanner +import scala.jdk.CollectionConverters._ + +import java.nio.file.Paths + +class PlannerTest extends AnyFlatSpec with Matchers { + + private implicit val testPartitionSpec = PartitionSpec.daily + + it should "monolith join planner plans valid confs without exceptions" in { + + val runfilesDir = System.getenv("RUNFILES_DIR") + val rootDir = Paths.get(runfilesDir, "chronon/spark/src/test/resources/canary/compiled/joins") + + val joinConfs = LocalRunner.parseConfs[Join](rootDir.toString) + + val joinPlanners = joinConfs.map(MonolithJoinPlanner(_)) + + joinPlanners + .foreach { planner => + noException should be thrownBy { + val plan = planner.buildPlan + plan.terminalNodeNames.asScala.size should be > 0 + } + } + } + +} diff --git a/spark/src/test/resources/canary/compiled b/spark/src/test/resources/canary/compiled new file mode 120000 index 0000000000..1babc6cb53 --- /dev/null +++ b/spark/src/test/resources/canary/compiled @@ -0,0 +1 @@ +../../../../../api/python/test/canary/compiled \ No newline at end of file