-
Notifications
You must be signed in to change notification settings - Fork 8
Flink updates to use TileKey on tiling flow writes #307
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,8 +3,11 @@ package ai.chronon.flink | |
| import ai.chronon.api.Constants | ||
| import ai.chronon.api.DataModel | ||
| import ai.chronon.api.Extensions.GroupByOps | ||
| import ai.chronon.api.Extensions.WindowUtils | ||
| import ai.chronon.api.Query | ||
| import ai.chronon.api.TilingUtils | ||
| import ai.chronon.api.{StructType => ChrononStructType} | ||
| import ai.chronon.fetcher.TileKey | ||
| import ai.chronon.flink.types.AvroCodecOutput | ||
| import ai.chronon.flink.types.TimestampedTile | ||
| import ai.chronon.online.AvroConversions | ||
|
|
@@ -126,9 +129,10 @@ case class AvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed) | |
| * that can be written out to the KV store (PutRequest object). | ||
| * | ||
| * @param groupByServingInfoParsed The GroupBy we are working with | ||
| * @param tilingWindowSizeMs The size of the tiling window in milliseconds | ||
| * @tparam T The input data type | ||
| */ | ||
| case class TiledAvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed) | ||
| case class TiledAvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParsed, tilingWindowSizeMs: Long) | ||
| extends BaseAvroCodecFn[TimestampedTile, AvroCodecOutput] { | ||
| override def open(configuration: Configuration): Unit = { | ||
| super.open(configuration) | ||
|
|
@@ -158,18 +162,27 @@ case class TiledAvroCodecFn[T](groupByServingInfoParsed: GroupByServingInfoParse | |
| // 'keys' is a map of (key name in schema -> key value), e.g. Map("card_number" -> "4242-4242-4242-4242") | ||
| // We convert to AnyRef because Chronon expects an AnyRef (for scala <> java interoperability reasons). | ||
| val keys: Map[String, AnyRef] = keyColumns.zip(in.keys.map(_.asInstanceOf[AnyRef])).toMap | ||
| val keyBytes = keyToBytes(in.keys.toArray) | ||
| val entityKeyBytes = keyToBytes(in.keys.toArray) | ||
|
|
||
| val tileKey = new TileKey() | ||
| val tileStart = WindowUtils.windowStartMillis(tsMills, tilingWindowSizeMs) | ||
| tileKey.setDataset(streamingDataset) | ||
| tileKey.setKeyBytes(entityKeyBytes.toList.asJava.asInstanceOf[java.util.List[java.lang.Byte]]) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be better to box individually? eg. keyBytes.map(java.lang.Byte.valueOf).toList.asJava
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I went with this as I thought it would be better to skip iterating over all the bytes in the key bytes list given Scala and Java bytes are equivalent - so we can just do a constant time operation of casting the top level types.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay I thought it would have been an Exception but if it works in UT's that's good. |
||
| tileKey.setTileSizeMillis(tilingWindowSizeMs) | ||
| tileKey.setTileStartTimestampMillis(tileStart) | ||
|
|
||
| val valueBytes = in.tileBytes | ||
|
|
||
| logger.debug( | ||
| s""" | ||
| |Avro converting tile to PutRequest - tile=${in} | ||
| |groupBy=${groupByServingInfoParsed.groupBy.getMetaData.getName} tsMills=$tsMills keys=$keys | ||
| |keyBytes=${java.util.Base64.getEncoder.encodeToString(keyBytes)} | ||
| |keyBytes=${java.util.Base64.getEncoder.encodeToString(entityKeyBytes)} | ||
| |valueBytes=${java.util.Base64.getEncoder.encodeToString(valueBytes)} | ||
| |streamingDataset=$streamingDataset""".stripMargin | ||
| ) | ||
|
|
||
| new AvroCodecOutput(keyBytes, valueBytes, streamingDataset, tsMills) | ||
| val tileKeyBytes = TilingUtils.serializeTileKey(tileKey) | ||
| new AvroCodecOutput(tileKeyBytes, valueBytes, streamingDataset, tsMills) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you expand on this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah so if a user creates a GroupBy with no windows we end up defaulting to all time windows in Chronon. Currently the Flink code will throw an error for this scenario (we return None here and in FlinkJob when we call .get it errors out). Instead of failing we go with a 1D tile size as thats the best option to help us compute the all time window (batch has the rest)