Merged
Changes from 4 commits
@@ -304,7 +304,6 @@ class BigTableKVStoreImpl(dataClient: BigtableDataClient,
s"""
|EXPORT DATA OPTIONS (
| format='CLOUD_BIGTABLE',
-  | auto_create_column_families=true,
| overwrite=true,
| uri="https://bigtable.googleapis.com/projects/${adminClient.getProjectId}/instances/${adminClient.getInstanceId}/appProfiles/GROUPBY_INGEST/tables/$batchTable",
| bigtable_options='''{
@@ -313,16 +312,15 @@ class BigTableKVStoreImpl(dataClient: BigtableDataClient,
| "familyId": "cf",
| "encoding": "BINARY",
| "columns": [
| {"qualifierString": "value", "fieldName": "value_bytes"}
|
| {"qualifierString": "value", "fieldName": ""}
| ]
| }
| ]
|}'''
|) AS
|SELECT
| CONCAT(CAST(CONCAT('$datasetName', '#') AS BYTES), key_bytes) as rowkey,
-  | value_bytes as value,
+  | value_bytes as cf,
| TIMESTAMP_MILLIS($endDsPlusOne) as _CHANGE_TIMESTAMP
|FROM $sourceOfflineTable
|WHERE ds = '$partition'
@@ -334,8 +332,10 @@ class BigTableKVStoreImpl(dataClient: BigtableDataClient,
.newBuilder(exportQuery)
.build()

-val jobId = JobId.of(adminClient.getProjectId, s"export_${sourceOfflineTable.sanitize}_to_bigtable_$partition")
 val startTs = System.currentTimeMillis()
+// we append the timestamp to the jobID as BigQuery doesn't allow us to re-run the same job
+val jobId =
+  JobId.of(adminClient.getProjectId, s"export_${sourceOfflineTable.sanitize}_to_bigtable_${partition}_$startTs")
val job: Job = bigQueryClient.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build())
logger.info(s"Export job started with Id: $jobId and link: ${job.getSelfLink}")
val retryConfig =
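For context on the export above: each row lands in Bigtable with row key CONCAT('<datasetName>#', key_bytes), under column family "cf" and qualifier "value". A minimal sketch of reading one of those cells back (a hypothetical helper, not code from this PR; the table, dataset, and key values are placeholders):

import com.google.cloud.bigtable.data.v2.BigtableDataClient
import com.google.protobuf.ByteString

// Reads the cell the EXPORT DATA statement above writes:
// row key "<datasetName>#<key_bytes>", family "cf", qualifier "value".
def readExportedValue(client: BigtableDataClient,
                      batchTable: String,
                      datasetName: String,
                      keyBytes: Array[Byte]): Option[ByteString] = {
  val rowKey = ByteString.copyFromUtf8(s"$datasetName#").concat(ByteString.copyFrom(keyBytes))
  Option(client.readRow(batchTable, rowKey)) // readRow returns null when the row is absent
    .map(_.getCells("cf", "value"))
    .filterNot(_.isEmpty)
    .map(_.get(0).getValue)
}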
spark/src/main/scala/ai/chronon/spark/Driver.scala (37 additions & 1 deletion)
@@ -714,6 +714,38 @@ object Driver {
}
}

object GroupByUploadToKVBulkLoad {
@transient lazy val logger: Logger = LoggerFactory.getLogger(getClass)
class Args extends Subcommand("groupby-upload-bulk-load") with OnlineSubcommand {
val srcOfflineTable: ScallopOption[String] =
opt[String](required = true, descr = "Name of the source GroupBy Upload table")

val groupbyName: ScallopOption[String] =
opt[String](required = true, descr = "Name of the GroupBy that we're triggering this upload for")

val partitionString: ScallopOption[String] =
opt[String](required = true, descr = "Partition string (in 'yyyy-MM-dd' format) that we are uploading")
@tchow-zlai (Collaborator), Jan 6, 2025:
Is this string supposed to match whatever's configured at:

private val partitionFormat: String =
sparkSession.conf.get("spark.chronon.partition.format", "yyyy-MM-dd")
?

The PR author (Contributor) replied:
yeah this needs to match what we're writing out.

@tchow-zlai (Collaborator) replied:
okay I think there's some form of this where we can DRY things up but wouldn't block the PR. Thanks for clarifying it!
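
A minimal sketch of the DRY-up idea floated above, assuming a shared helper is acceptable (the object and method names here are hypothetical, not Chronon's):

import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.SparkSession

// Hypothetical helper: the writer and the CLI validation both read the same
// Spark conf key instead of hard-coding the 'yyyy-MM-dd' pattern twice.
object PartitionFormat {
  def pattern(spark: SparkSession): String =
    spark.conf.get("spark.chronon.partition.format", "yyyy-MM-dd")

  // True iff `ds` parses under the configured partition format.
  def isValid(spark: SparkSession, ds: String): Boolean =
    try {
      LocalDate.parse(ds, DateTimeFormatter.ofPattern(pattern(spark)))
      true
    } catch { case _: Exception => false }
}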

Comment on lines +720 to +727
A reviewer (Contributor) commented:
🛠️ Refactor suggestion

Add validation for partition string format.

Ensure the partition string matches 'yyyy-MM-dd' format to prevent runtime errors.

 val partitionString: ScallopOption[String] =
   opt[String](required = true, descr = "Partition string (in 'yyyy-MM-dd' format) that we are uploading")
+  validate(s => try {
+    java.time.LocalDate.parse(s, java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd"))
+    true
+  } catch {
+    case _: Exception => false
+  })

}

def run(args: Args): Unit = {
logger.info(s"Triggering bulk load for GroupBy: ${args.groupbyName()} for partition: ${args
.partitionString()} from table: ${args.srcOfflineTable()}")
val kvStore = args.api.genKvStore
val startTime = System.currentTimeMillis()
try {
kvStore.bulkPut(args.srcOfflineTable(), args.groupbyName(), args.partitionString())
} catch {
case e: Exception =>
logger.error(s"Failed to upload GroupBy: ${args.groupbyName()} for partition: ${args
.partitionString()} from table: ${args.srcOfflineTable()}",
e)
throw e
}
logger.info(s"Uploaded GroupByUpload data to KV store for GroupBy: ${args.groupbyName()}; partition: ${args
.partitionString()} in ${(System.currentTimeMillis() - startTime) / 1000} seconds")
}
}

object LogFlattener {
class Args extends Subcommand("log-flattener") with OfflineSubcommand {
val logTable: ScallopOption[String] =
@@ -913,6 +945,8 @@ object Driver {
addSubcommand(FetcherCliArgs)
object MetadataUploaderArgs extends MetadataUploader.Args
addSubcommand(MetadataUploaderArgs)
+object GroupByUploadToKVBulkLoadArgs extends GroupByUploadToKVBulkLoad.Args
+addSubcommand(GroupByUploadToKVBulkLoadArgs)
object GroupByStreamingArgs extends GroupByStreaming.Args
addSubcommand(GroupByStreamingArgs)
object AnalyzerArgs extends Analyzer.Args
@@ -958,7 +992,9 @@
shouldExit = false
GroupByStreaming.run(args.GroupByStreamingArgs)

-case args.MetadataUploaderArgs => MetadataUploader.run(args.MetadataUploaderArgs)
+case args.MetadataUploaderArgs => MetadataUploader.run(args.MetadataUploaderArgs)
+case args.GroupByUploadToKVBulkLoadArgs =>
+  GroupByUploadToKVBulkLoad.run(args.GroupByUploadToKVBulkLoadArgs)
case args.FetcherCliArgs => FetcherCli.run(args.FetcherCliArgs)
case args.LogFlattenerArgs => LogFlattener.run(args.LogFlattenerArgs)
case args.ConsistencyMetricsArgs => ConsistencyMetricsCompute.run(args.ConsistencyMetricsArgs)
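For reference, a hypothetical invocation of the new subcommand (Scallop derives the kebab-case flag names from the option vals; the spark-submit wiring, jar name, and table/GroupBy names below are assumptions, and OnlineSubcommand will require additional flags for the online API):

spark-submit --class ai.chronon.spark.Driver chronon-spark-assembly.jar \
  groupby-upload-bulk-load \
  --src-offline-table my_namespace.my_groupby_upload \
  --groupby-name my_team.my_groupby.v1 \
  --partition-string 2025-01-06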