-
Notifications
You must be signed in to change notification settings - Fork 68
Update metadata export logic for join derivation #879
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
792b3a2
78d253c
ed1ae62
71629d8
8390305
1d113fa
70d8d1f
e14f6d0
4b36bc7
38f2ffa
bcbd162
7e3937e
09c3e87
b2394da
bc80fec
3f4ac12
a61a944
bbb09ce
68e394a
30f4624
cdeb1ac
4eaa63e
0c40fa4
1e3889f
9c1efd5
1071563
561a38b
69a640a
6a5855c
5e3c5b5
e628d34
f4deba2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -407,8 +407,25 @@ class Analyzer(tableUtils: TableUtils, | |
) | ||
} | ||
} | ||
val aggMetadata: ListBuffer[AggregationMetadata] = if (joinConf.hasDerivations) { | ||
val keyAndPartitionFields = | ||
leftDf.schema.fields ++ Seq(org.apache.spark.sql.types.StructField(tableUtils.partitionColumn, StringType)) | ||
val sparkSchema = { | ||
val schema: Seq[(String, DataType)] = leftSchema.toSeq ++ rightSchema.toSeq | ||
StructType(SparkConversions.fromChrononSchema(schema)) | ||
} | ||
val dummyOutputDf = tableUtils.sparkSession | ||
.createDataFrame(tableUtils.sparkSession.sparkContext.parallelize(immutable.Seq[Row]()), sparkSchema) | ||
val finalOutputColumns = joinConf.derivationsScala.finalOutputColumn(dummyOutputDf.columns).toSeq | ||
val derivedDummyOutputDf = dummyOutputDf.select(finalOutputColumns: _*) | ||
val columns = SparkConversions.toChrononSchema( | ||
StructType(derivedDummyOutputDf.schema.filterNot(f => keyAndPartitionFields.map(_.name).contains(f.name)))) | ||
ListBuffer(columns.map { tup => toAggregationMetadata(tup._1, tup._2, joinConf.hasDerivations) }: _*) | ||
} else { | ||
aggregationsMetadata | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: rename agg, because agg is specific to group_by, but now we also have external parts and derivations.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The concept of this aggMetadata is more like output features (not including keys). It will be used as the source data of features on the MLI tool. I am thinking of maybe renaming this to featuresMetadata or joinOutputValuesMetadata. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @yuli-han rename?
|
||
} | ||
// (schema map showing the names and datatypes, right side feature aggregations metadata for metadata upload) | ||
(leftSchema ++ rightSchema, aggregationsMetadata) | ||
(leftSchema ++ rightSchema, aggMetadata) | ||
} | ||
|
||
// validate the schema of the left and right side of the join and make sure the types match | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,18 +18,8 @@ package ai.chronon.spark.test | |
|
||
import ai.chronon.aggregator.test.Column | ||
import ai.chronon.api | ||
import ai.chronon.api.{ | ||
Accuracy, | ||
Builders, | ||
Constants, | ||
JoinPart, | ||
LongType, | ||
Operation, | ||
PartitionSpec, | ||
StringType, | ||
TimeUnit, | ||
Window | ||
} | ||
import ai.chronon.api.Builders.Derivation | ||
import ai.chronon.api.{Accuracy, Builders, Constants, JoinPart, LongType, Operation, PartitionSpec, StringType, TimeUnit, Window} | ||
import ai.chronon.api.Extensions._ | ||
import ai.chronon.spark.Extensions._ | ||
import ai.chronon.spark.GroupBy.renderDataSourceQuery | ||
|
@@ -1547,4 +1537,30 @@ class JoinTest { | |
assert( | ||
thrown2.getMessage.contains("Table or view not found") && thrown3.getMessage.contains("Table or view not found")) | ||
} | ||
|
||
def testJoinDerivationAnalyzer(): Unit = { | ||
lazy val spark: SparkSession = SparkSessionBuilder.build("JoinTest" + "_" + Random.alphanumeric.take(6).mkString, local = true) | ||
val tableUtils = TableUtils(spark) | ||
val namespace = "test_join_derivation" + "_" + Random.alphanumeric.take(6).mkString | ||
tableUtils.createDatabase(namespace) | ||
val viewsGroupBy = getViewsGroupBy(suffix = "cumulative", makeCumulative = true, namespace) | ||
val joinConf = getEventsEventsTemporal("cumulative", namespace) | ||
joinConf.setJoinParts(Seq(Builders.JoinPart(groupBy = viewsGroupBy)).asJava) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a test case that contains external parts? |
||
joinConf.setDerivations(Seq( | ||
Derivation( | ||
name = "*", | ||
expression = "*" | ||
), Derivation( | ||
name = "test_feature_name", | ||
expression = f"${viewsGroupBy.metaData.name}_time_spent_ms_average" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add some derivations that use `ts` and `ds`? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similarly, can we add a test case for key columns as output?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Plus one on the `ts` and `ds` expressions. Those values will be derived from the request time. |
||
) | ||
).asJava) | ||
|
||
|
||
val today = tableUtils.partitionSpec.at(System.currentTimeMillis()) | ||
val (_, aggregationsMetadata) = | ||
new Analyzer(tableUtils, joinConf, monthAgo, today).analyzeJoin(joinConf, enableHitter = false) | ||
aggregationsMetadata.foreach(agg => {assertTrue(agg.operation == "Derivation")}) | ||
aggregationsMetadata.exists(_.name == "test_feature_name") | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the `filterNot` necessary here? I think we should keep everything that users included in derivations. For example, we should allow key columns to be in the output if users explicitly included them in derivations.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The aggMetadata entries are actually features, so the purpose of this line is to exclude the keys from the features list.
But this may not apply to the case where the user renames an external feature to a key name; if that happens, the key (which is actually an external feature) will be filtered out here 🤔