Stop using cloud_gcp_submitter jar and just use cloud_gcp jar, and add --gcp for fetch (#466)
Changes from all commits:
```diff
@@ -27,10 +27,10 @@
 import xml.etree.ElementTree as ET
 from datetime import datetime, timedelta

-from .gcp import generate_dataproc_submitter_args, get_gcp_project_id, get_gcp_bigtable_instance_id, \
+from ai.chronon.repo.gcp import generate_dataproc_submitter_args, get_gcp_project_id, get_gcp_bigtable_instance_id, \
     get_gcp_region_id, download_zipline_dataproc_jar, ZIPLINE_GCP_ONLINE_CLASS_DEFAULT, DATAPROC_ENTRY, \
-    ZIPLINE_GCP_DATAPROC_SUBMITTER_JAR, ZIPLINE_GCP_SERVICE_JAR, ZIPLINE_GCP_ONLINE_JAR_DEFAULT
-from .utils import DataprocJobType, extract_filename_from_path, retry_decorator, get_customer_id
+    ZIPLINE_GCP_SERVICE_JAR, ZIPLINE_GCP_JAR_DEFAULT
+from ai.chronon.repo.utils import DataprocJobType, extract_filename_from_path, retry_decorator, get_customer_id

 ONLINE_ARGS = "--online-jar={online_jar} --online-class={online_class} "
 OFFLINE_ARGS = "--conf-path={conf_path} --end-date={ds} "
```
```diff
@@ -342,7 +342,8 @@ def set_runtime_env(params):
         environment["cli_args"]["APP_NAME"] = "_".join(
             [
                 k
-                for k in [
+                for k in
+                [
                     "chronon",
                     conf_type,
                     params["mode"].replace("-", "_") if params["mode"] else None,
```
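This hunk only reflows the comprehension; the resulting app name is unchanged. For context, a minimal sketch of the composition with hypothetical values, assuming the portion of the comprehension cut off by the hunk ends with an `if k is not None` filter before the join:

```python
# Hypothetical values; the `if k is not None` filter is assumed from
# context, since the hunk ends before the comprehension closes.
conf_type = "group_bys"
params = {"mode": "backfill"}

app_name = "_".join(
    k
    for k in
    [
        "chronon",
        conf_type,
        params["mode"].replace("-", "_") if params["mode"] else None,
    ]
    if k is not None
)
print(app_name)  # chronon_group_bys_backfill
```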
```diff
@@ -780,6 +781,7 @@ def set_defaults(ctx):
 @click.option("--env", required=False, default="dev", help="Running environment - default to be dev")
 @click.option("--mode", type=click.Choice(MODE_ARGS.keys()))
 @click.option("--dataproc", is_flag=True, help="Run on Dataproc in GCP")
+@click.option("--gcp", is_flag=True, help="Use GCP settings")
 @click.option("--ds", help="the end partition to backfill the data")
 @click.option("--app-name", help="app name. Default to {}".format(APP_NAME_TEMPLATE))
 @click.option("--start-ds", help="override the original start partition for a range backfill. "
```
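The new `--gcp` flag is a plain boolean switch alongside the existing `--dataproc` one. A self-contained toy sketch (not the PR's code) of how either flag gates the GCP jar path:

```python
# Toy click command: either --dataproc or --gcp routes jar resolution
# to the GCP download path; otherwise a locally supplied jar is used.
import click

@click.command()
@click.option("--dataproc", is_flag=True, help="Run on Dataproc in GCP")
@click.option("--gcp", is_flag=True, help="Use GCP settings")
@click.option("--chronon-jar", help="Path to a locally built jar")
def run(dataproc, gcp, chronon_jar):
    if dataproc or gcp:
        click.echo("would download the cloud_gcp jar")
    else:
        click.echo(f"would use the local jar: {chronon_jar}")

if __name__ == "__main__":
    run()
```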
```diff
@@ -789,7 +791,7 @@ def set_defaults(ctx):
 @click.option("--parallelism", help="break down the backfill range into this number of tasks in parallel. "
                                     "Please use it along with --start-ds and --end-ds and only in manual mode")
 @click.option("--repo", help="Path to chronon repo", default=".")
-@click.option("--online-jar", default=ZIPLINE_GCP_ONLINE_JAR_DEFAULT,
+@click.option("--online-jar", default=ZIPLINE_GCP_JAR_DEFAULT,
               help="Jar containing Online KvStore & Deserializer Impl. "
                    "Used for streaming and metadata-upload mode.")
 @click.option("--online-class", default=ZIPLINE_GCP_ONLINE_CLASS_DEFAULT,
```
```diff
@@ -814,7 +816,7 @@ def set_defaults(ctx):
               help="Use a mocked data source instead of a real source for groupby-streaming Flink.")
 @click.option("--savepoint-uri", help="Savepoint URI for Flink streaming job")
 @click.pass_context
-def main(ctx, conf, env, mode, dataproc, ds, app_name, start_ds, end_ds, parallelism, repo, online_jar,
+def main(ctx, conf, env, mode, dataproc, gcp, ds, app_name, start_ds, end_ds, parallelism, repo, online_jar,
          online_class,
          version, spark_version, spark_submit_path, spark_streaming_submit_path, online_jar_fetch, sub_help, conf_type,
          online_args, chronon_jar, release_tag, list_apps, render_info, groupby_name, kafka_bootstrap, mock_source,
```
```diff
@@ -827,16 +829,16 @@ def main(ctx, conf, env, mode, dataproc, ds, app_name, start_ds, end_ds, paralle
     ctx.params["args"] = " ".join(unknown_args) + extra_args
     os.makedirs(ZIPLINE_DIRECTORY, exist_ok=True)

-    if dataproc:
-        jar_path = download_zipline_dataproc_jar(ZIPLINE_DIRECTORY, get_customer_id(),
-                                                 ZIPLINE_GCP_DATAPROC_SUBMITTER_JAR)
-    elif chronon_jar:
-        jar_path = chronon_jar
-    else:
+    if dataproc or gcp:
```
Inline review thread on `if dataproc or gcp:`:

**Contributor (author):** I regret not naming this … Currently it looks like this: … but once we onboard Plaid, who uses AWS, we do need to include in the command which cloud provider is being used. So moving forward, the new fetch command will look like this: …

**Collaborator:** Is this something that we could set as part of …?

**Contributor (author):** Yeah, actually we could set something like …

**Contributor (author):** @tchow-zlai, mind if I do this in a follow-up PR?

**Collaborator:** Sounds good.
```diff
+        gcp_jar_path = download_zipline_dataproc_jar(ZIPLINE_DIRECTORY, get_customer_id(),
+                                                     ZIPLINE_GCP_JAR_DEFAULT)
         service_jar_path = download_zipline_dataproc_jar(ZIPLINE_DIRECTORY, get_customer_id(), ZIPLINE_GCP_SERVICE_JAR)
-        chronon_gcp_jar_path = download_zipline_dataproc_jar(ZIPLINE_DIRECTORY, get_customer_id(),
-                                                             ZIPLINE_GCP_ONLINE_JAR_DEFAULT)
-        jar_path = f"{service_jar_path}:{chronon_gcp_jar_path}"
+        jar_path = f"{service_jar_path}:{gcp_jar_path}" if mode == 'fetch' else gcp_jar_path
+    else:
+        jar_path = chronon_jar

     if not jar_path:
         raise ValueError("Jar path is not set.")

     Runner(ctx.params, os.path.expanduser(jar_path)).run()
```
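The net effect of the hunk above, extracted into a pure function for clarity (a sketch, assuming `mode`, `dataproc`, `gcp`, and `chronon_jar` are the parsed CLI values; the jar file names are placeholders): only fetch mode needs the service jar on the classpath, so the two jars are joined with `:` there, every other GCP mode runs the cloud_gcp jar alone, and without either flag the user-supplied `--chronon-jar` is used.

```python
# Sketch of the new jar resolution; jar paths are placeholders.
def resolve_jar_path(mode, dataproc, gcp, chronon_jar,
                     gcp_jar_path="cloud_gcp.jar",
                     service_jar_path="service.jar"):
    if dataproc or gcp:
        # fetch also needs the service jar, so build a ':'-joined classpath
        return f"{service_jar_path}:{gcp_jar_path}" if mode == "fetch" else gcp_jar_path
    return chronon_jar

assert resolve_jar_path("fetch", dataproc=False, gcp=True, chronon_jar=None) == "service.jar:cloud_gcp.jar"
assert resolve_jar_path("backfill", dataproc=True, gcp=False, chronon_jar=None) == "cloud_gcp.jar"
assert resolve_jar_path("backfill", dataproc=False, gcp=False, chronon_jar="my.jar") == "my.jar"
```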
Review thread on the removed submitter-jar download:

**Reviewer:** What do we invoke instead? Just the default jar?

**Author:** Yeah.

**Reviewer:** Sounds good. Wondering if it's going to be pretty heavy to download the large jar every time, though.

**Author:** It should be doing the hash check, to only download if the hashes don't match.
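A hypothetical sketch of that hash check, assuming the caller can obtain an expected hash for the remote jar (the helper names here are illustrative, not the repo's actual API):

```python
# Hypothetical helpers: skip the download when the cached jar's hash
# already matches the expected one.
import hashlib
import os

def file_md5(path):
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

def download_if_stale(local_path, expected_md5, download):
    """`download` is a callable that fetches the jar to `local_path`."""
    if os.path.exists(local_path) and file_md5(local_path) == expected_md5:
        return local_path  # hash matches: reuse the cached jar
    download(local_path)   # missing or stale: re-download
    return local_path
```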