Allow metadata upload to submit to dataproc #246
Changes from all commits: 8b6ccc9, 9689672, 6cbb9eb, cb98f6a, 97450d8, 46698ea
Changes to `MetadataDirWalker`:
```diff
@@ -15,7 +15,12 @@ import java.nio.file.Paths
 import scala.reflect.ClassTag
 import scala.util.Try
 
-class MetadataDirWalker(dirPath: String, metadataEndPointNames: List[String]) {
+class MetadataDirWalker(dirPath: String, metadataEndPointNames: List[String], maybeConfType: Option[String] = None) {
+
+  val JoinKeyword = "joins"
+  val GroupByKeyword = "group_bys"
+  val StagingQueryKeyword = "staging_queries"
+  val ModelKeyword = "models"
 
   @transient implicit lazy val logger: Logger = LoggerFactory.getLogger(getClass)
   private def loadJsonToConf[T <: TBase[_, _]: Manifest: ClassTag](file: String): Option[T] = {
```
```diff
@@ -77,10 +82,14 @@ class MetadataDirWalker(dirPath: String, metadataEndPointNames: List[String]) {
     val optConf =
       try {
         filePath match {
-          case value if value.contains("joins/") => loadJsonToConf[api.Join](filePath)
-          case value if value.contains("group_bys/") => loadJsonToConf[api.GroupBy](filePath)
-          case value if value.contains("staging_queries/") => loadJsonToConf[api.StagingQuery](filePath)
-          case value if value.contains("models/") => loadJsonToConf[api.Model](filePath)
+          case value if value.contains(s"$JoinKeyword/") || maybeConfType.contains(JoinKeyword) =>
+            loadJsonToConf[api.Join](filePath)
+          case value if value.contains(s"$GroupByKeyword/") || maybeConfType.contains(GroupByKeyword) =>
+            loadJsonToConf[api.GroupBy](filePath)
+          case value if value.contains(s"$StagingQueryKeyword/") || maybeConfType.contains(StagingQueryKeyword) =>
+            loadJsonToConf[api.StagingQuery](filePath)
+          case value if value.contains(s"$ModelKeyword/") || maybeConfType.contains(ModelKeyword) =>
+            loadJsonToConf[api.Model](filePath)
         }
       } catch {
         case e: Throwable =>
```

Comments on lines +85 to +92 (Contributor, Author):

- doing this because the …
- this is where we strip the filepath down to just the filename in run.py. https://github.com/zipline-ai/chronon/blob/main/api/py/ai/chronon/repo/run.py#L575-L580 has some docs.
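The path-keyword checks above only work when a conf is submitted with its full repo path. As the author's comment notes, run.py strips the path down to the bare file name, so the new `maybeConfType` argument acts as a fallback. A minimal, self-contained sketch of that guard logic; only the keyword constant and the `contains` checks mirror the diff, everything else is invented for illustration:

```scala
// Illustrative sketch of the conf-type fallback; not the repo's code.
object ConfTypeFallbackSketch {
  val GroupByKeyword = "group_bys"

  def looksLikeGroupBy(filePath: String, maybeConfType: Option[String]): Boolean =
    filePath.contains(s"$GroupByKeyword/") || maybeConfType.contains(GroupByKeyword)

  def main(args: Array[String]): Unit = {
    // Full repo path: the directory keyword is enough, no explicit conf type needed.
    println(looksLikeGroupBy("compiled/group_bys/team/example.v1", None))   // true
    // Bare file name, as submitted from run.py: only the explicit conf type matches.
    println(looksLikeGroupBy("example.v1", Some("group_bys")))              // true
    // Bare file name without a conf type: nothing matches, so the conf would be skipped.
    println(looksLikeGroupBy("example.v1", None))                           // false
  }
}
```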
```diff
@@ -94,22 +103,22 @@ class MetadataDirWalker(dirPath: String, metadataEndPointNames: List[String]) {
           val conf = optConf.get
 
           val kVPair = filePath match {
-            case value if value.contains("joins/") =>
+            case value if value.contains(s"$JoinKeyword/") || maybeConfType.contains(JoinKeyword) =>
               MetadataEndPoint
                 .getEndPoint[api.Join](endPointName)
                 .extractFn(filePath, conf.asInstanceOf[api.Join])
 
-            case value if value.contains("group_bys/") =>
+            case value if value.contains(s"$GroupByKeyword/") || maybeConfType.contains(GroupByKeyword) =>
               MetadataEndPoint
                 .getEndPoint[api.GroupBy](endPointName)
                 .extractFn(filePath, conf.asInstanceOf[api.GroupBy])
 
-            case value if value.contains("staging_queries/") =>
+            case value if value.contains(s"$StagingQueryKeyword/") || maybeConfType.contains(StagingQueryKeyword) =>
               MetadataEndPoint
                 .getEndPoint[api.StagingQuery](endPointName)
                 .extractFn(filePath, conf.asInstanceOf[api.StagingQuery])
 
-            case value if value.contains("models/") =>
+            case value if value.contains(s"$ModelKeyword/") || maybeConfType.contains(ModelKeyword) =>
               MetadataEndPoint
                 .getEndPoint[api.Model](endPointName)
                 .extractFn(filePath, conf.asInstanceOf[api.Model])
```
Changes to `MetadataStore`:

```diff
@@ -232,7 +232,10 @@ class MetadataStore(kvStore: KVStore, val dataset: String = MetadataDataset, tim
   def create(dataset: String): Unit = {
     try {
       logger.info(s"Creating dataset: $dataset")
+      // TODO: this is actually just an async task. it doesn't block and thus we don't actually
+      // know if it successfully created the dataset
       kvStore.create(dataset)
+
       logger.info(s"Successfully created dataset: $dataset")
     } catch {
       case e: Exception =>
```

Comment on lines +235 to +236 (Contributor, Author): @piyush-zlai - was thinking I could poll? the issue is this doesn't return a future.
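One possible shape of the polling idea floated in the comment above. The existence check (`listDatasets`) is hypothetical; the real `KVStore` interface may not expose anything like it, so this is only a sketch of the retry loop, not the project's API:

```scala
import scala.annotation.tailrec

// Hypothetical slice of a KV store: `create` is fire-and-forget as described in the TODO,
// and `listDatasets` is an assumed existence check that may not exist on the real KVStore.
trait KVStoreLike {
  def create(dataset: String): Unit
  def listDatasets(): Seq[String]
}

object DatasetCreatePoller {
  // Poll until the dataset shows up or we run out of attempts.
  @tailrec
  def awaitCreation(store: KVStoreLike, dataset: String, attemptsLeft: Int, sleepMs: Long = 500L): Boolean =
    if (store.listDatasets().contains(dataset)) true
    else if (attemptsLeft <= 0) false
    else {
      Thread.sleep(sleepMs)
      awaitCreation(store, dataset, attemptsLeft - 1, sleepMs)
    }
}
```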
Changes to `Driver`:

```diff
@@ -86,17 +86,20 @@ object Driver {
   def parseConf[T <: TBase[_, _]: Manifest: ClassTag](confPath: String): T =
     ThriftJsonCodec.fromJsonFile[T](confPath, check = true)
 
-  trait AddGcpSubCommandArgs {
+  trait SharedSubCommandArgs {
     this: ScallopConf =>
     val isGcp: ScallopOption[Boolean] =
       opt[Boolean](required = false, default = Some(false), descr = "Whether to use GCP")
     val gcpProjectId: ScallopOption[String] =
       opt[String](required = false, descr = "GCP project id")
     val gcpBigtableInstanceId: ScallopOption[String] =
       opt[String](required = false, descr = "GCP BigTable instance id")
+
+    val confType: ScallopOption[String] =
+      opt[String](required = false, descr = "Type of the conf to run. ex: join, group-by, etc")
   }
 
-  trait OfflineSubcommand extends AddGcpSubCommandArgs {
+  trait OfflineSubcommand extends SharedSubCommandArgs {
     this: ScallopConf =>
     val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to conf")
```

Comment on lines +98 to +99 (Contributor, Author): mentioned earlier, but this is needed because there's logic in Scala that depends on the conf file path having keywords like …
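For context, a rough sketch of how the shared `confType` option surfaces once the trait is mixed into a subcommand. The subcommand class and argument values below are invented for illustration; only the option declarations mirror the diff:

```scala
import org.rogach.scallop.{ScallopConf, ScallopOption}

object ConfTypeArgsSketch {
  // Same shape as the SharedSubCommandArgs trait in the diff, trimmed to two options.
  trait SharedSubCommandArgs { this: ScallopConf =>
    val confPath: ScallopOption[String] = opt[String](required = true, descr = "Path to conf")
    val confType: ScallopOption[String] =
      opt[String](required = false, descr = "Type of the conf to run. ex: join, group-by, etc")
  }

  // Hypothetical subcommand that mixes the trait in.
  class UploadArgs(arguments: Seq[String]) extends ScallopConf(arguments) with SharedSubCommandArgs {
    verify()
  }

  def main(args: Array[String]): Unit = {
    // Scallop derives kebab-case flags from the val names: --conf-path and --conf-type.
    val parsed = new UploadArgs(Seq("--conf-path", "example.v1", "--conf-type", "group_bys"))
    println(parsed.confType.toOption) // Some(group_bys)
  }
}
```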
```diff
@@ -584,7 +587,7 @@
   }
 
   // common arguments to all online commands
-  trait OnlineSubcommand extends AddGcpSubCommandArgs { s: ScallopConf =>
+  trait OnlineSubcommand extends SharedSubCommandArgs { s: ScallopConf =>
     // this is `-Z` and not `-D` because sbt-pack plugin uses that for JAVA_OPTS
     val propsInner: Map[String, String] = props[String]('Z')
     val onlineJar: ScallopOption[String] =
@@ -615,7 +618,7 @@
   }
 
   def metaDataStore =
-    new MetadataStore(impl(serializableProps).genKvStore, MetadataDataset, timeoutMillis = 10000)
+    new MetadataStore(api.genKvStore, MetadataDataset, timeoutMillis = 10000)
 
   def impl(props: Map[String, String]): Api = {
     val urls = Array(new File(onlineJar()).toURI.toURL)
@@ -748,7 +751,7 @@ object Driver {
     def run(args: Args): Unit = {
       val acceptedEndPoints = List(MetadataEndPoint.ConfByKeyEndPointName, MetadataEndPoint.NameByTeamEndPointName)
-      val dirWalker = new MetadataDirWalker(args.confPath(), acceptedEndPoints)
+      val dirWalker = new MetadataDirWalker(args.confPath(), acceptedEndPoints, maybeConfType = args.confType.toOption)
       val kvMap: Map[String, Map[String, List[String]]] = dirWalker.run
 
       // trigger creates of the datasets before we proceed with writes
```
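Based only on the types visible above, a hedged sketch of how the walked key-value map might be pushed out, with dataset creation happening before any writes. The outer-key-is-a-dataset assumption and the `createDataset`/`put` callbacks are guesses, not the repo's actual API:

```scala
object MetadataUploadSketch {
  // kvMap mirrors the Map[String, Map[String, List[String]]] returned by dirWalker.run;
  // createDataset and put stand in for whatever the real KVStore calls are.
  def upload(kvMap: Map[String, Map[String, List[String]]],
             createDataset: String => Unit,
             put: (String, String, List[String]) => Unit): Unit = {
    // Trigger creates of the datasets before we proceed with writes (as in the diff above),
    // assuming the outer key names the target dataset.
    kvMap.keys.foreach(createDataset)

    for {
      (dataset, entries) <- kvMap
      (key, values) <- entries
    } put(dataset, key, values)
  }
}
```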
Review comments:

- feels like we might want a flag that's not just for dataproc but whether this command is an offline batch run? Not critically blocking though.
- +1
- yeah, it's getting pretty crowded in the code here. the sub help part is one we probably could catch much earlier tbh.