Update metadata export logic for join derivation #879


Merged: 32 commits, Dec 6, 2024
19 changes: 18 additions & 1 deletion spark/src/main/scala/ai/chronon/spark/Analyzer.scala
@@ -407,8 +407,25 @@ class Analyzer(tableUtils: TableUtils,
)
}
}
val aggMetadata: ListBuffer[AggregationMetadata] = if (joinConf.hasDerivations) {
val keyAndPartitionFields =
leftDf.schema.fields ++ Seq(org.apache.spark.sql.types.StructField(tableUtils.partitionColumn, StringType))
val sparkSchema = {
val schema: Seq[(String, DataType)] = leftSchema.toSeq ++ rightSchema.toSeq
StructType(SparkConversions.fromChrononSchema(schema))
}
val dummyOutputDf = tableUtils.sparkSession
.createDataFrame(tableUtils.sparkSession.sparkContext.parallelize(immutable.Seq[Row]()), sparkSchema)
val finalOutputColumns = joinConf.derivationsScala.finalOutputColumn(dummyOutputDf.columns).toSeq
val derivedDummyOutputDf = dummyOutputDf.select(finalOutputColumns: _*)
val columns = SparkConversions.toChrononSchema(
StructType(derivedDummyOutputDf.schema.filterNot(f => keyAndPartitionFields.map(_.name).contains(f.name))))
Collaborator:
Is the filterNot necessary here? I think we should keep everything that users included in derivations. For example, we should allow key columns in the output if users explicitly included them in derivations.

Collaborator (author):
The aggMetadata entries are actually features, so the purpose of this line is to exclude the keys from the features list.
But this may not apply when a user renames an external feature to a key name; in that case the key (which is actually an external feature) will be filtered out here 🤔
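The edge case described above can be shown with a minimal, Spark-free sketch (hypothetical column names, not the Chronon API): the feature list is built by filtering derived columns against key/partition names, so a derivation that renames an external feature to a key column's name is dropped as if it were a key.

```scala
// Sketch of the edge case: "event_id" below is really a renamed external
// feature, but because it collides with a key column's name, the
// filterNot treats it as a key and drops it from the features list.
val keyAndPartitionColumns = Set("event_id", "ds")
val derivedColumns = Seq("time_spent_ms_average", "event_id")
val features = derivedColumns.filterNot(keyAndPartitionColumns.contains)
// features == Seq("time_spent_ms_average") — the renamed feature is gone
```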

ListBuffer(columns.map { tup => toAggregationMetadata(tup._1, tup._2, joinConf.hasDerivations) }: _*)
} else {
aggregationsMetadata
Collaborator @hzding621, Nov 20, 2024:
nit: rename agg because agg is specific to group_by, but now we have external parts and derivations

  • aggMetadata => joinOutputFieldsMetadata
  • aggregationsMetadata => joinIntermediateFieldsMetadata

Collaborator (author):
The concept of this aggMetadata is more like output features (not including keys). It will be used as the source data for features in the MLI tool. I am thinking of renaming this to featuresMetadata or joinOutputValuesMetadata.

Collaborator:

@yuli-han rename?

  • aggMetadata => joinOutputValuesMetadata
  • aggregationsMetadata => joinIntermediateValuesMetadata

}
// (schema map showing the names and datatypes, right side feature aggregations metadata for metadata upload)
(leftSchema ++ rightSchema, aggregationsMetadata)
(leftSchema ++ rightSchema, aggMetadata)
}
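The diff above computes the post-derivation output schema without materializing any data: it builds an empty DataFrame with the joined schema, applies the derivation expressions, and reads the resulting schema back, filtering out key and partition columns. A simplified, Spark-free sketch of the same column-level logic (hypothetical names, not the actual Chronon API):

```scala
// Simplified model of the PR's output-metadata logic: a "*" derivation
// passes every input column through, a named derivation emits one column,
// and key/partition columns are filtered out of the final feature list.
object DerivedSchemaSketch {
  case class Derivation(name: String, expression: String)

  def deriveOutputFields(inputColumns: Seq[String],
                         derivations: Seq[Derivation],
                         keyAndPartitionColumns: Set[String]): Seq[String] = {
    val derived = derivations.flatMap {
      case Derivation("*", _)  => inputColumns // wildcard keeps all inputs
      case Derivation(name, _) => Seq(name)    // named derivation adds/renames one column
    }
    // Mirrors the filterNot in Analyzer.scala: keys are not features
    derived.distinct.filterNot(keyAndPartitionColumns.contains)
  }
}
```

With inputs `Seq("user_id", "time_spent_ms_average", "ds")`, derivations `* -> *` plus `test_feature_name`, and keys `Set("user_id", "ds")`, this yields `Seq("time_spent_ms_average", "test_feature_name")`, matching what the new test below expects.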

// validate the schema of the left and right side of the join and make sure the types match
40 changes: 28 additions & 12 deletions spark/src/test/scala/ai/chronon/spark/test/JoinTest.scala
@@ -18,18 +18,8 @@ package ai.chronon.spark.test

import ai.chronon.aggregator.test.Column
import ai.chronon.api
import ai.chronon.api.{
Accuracy,
Builders,
Constants,
JoinPart,
LongType,
Operation,
PartitionSpec,
StringType,
TimeUnit,
Window
}
import ai.chronon.api.Builders.Derivation
import ai.chronon.api.{Accuracy, Builders, Constants, JoinPart, LongType, Operation, PartitionSpec, StringType, TimeUnit, Window}
import ai.chronon.api.Extensions._
import ai.chronon.spark.Extensions._
import ai.chronon.spark.GroupBy.renderDataSourceQuery
@@ -1547,4 +1537,30 @@ class JoinTest {
assert(
thrown2.getMessage.contains("Table or view not found") && thrown3.getMessage.contains("Table or view not found"))
}

def testJoinDerivationAnalyzer(): Unit = {
lazy val spark: SparkSession = SparkSessionBuilder.build("JoinTest" + "_" + Random.alphanumeric.take(6).mkString, local = true)
val tableUtils = TableUtils(spark)
val namespace = "test_join_derivation" + "_" + Random.alphanumeric.take(6).mkString
tableUtils.createDatabase(namespace)
val viewsGroupBy = getViewsGroupBy(suffix = "cumulative", makeCumulative = true, namespace)
val joinConf = getEventsEventsTemporal("cumulative", namespace)
joinConf.setJoinParts(Seq(Builders.JoinPart(groupBy = viewsGroupBy)).asJava)
Collaborator:
can you add a test case that contains external parts?

joinConf.setDerivations(Seq(
Derivation(
name = "*",
expression = "*"
), Derivation(
name = "test_feature_name",
expression = f"${viewsGroupBy.metaData.name}_time_spent_ms_average"
Collaborator:
Can you add some derivations that use ts and ds as inputs?

Collaborator @hzding621, Nov 20, 2024:
Similarly, can we add a test case for key columns as output, such as:

Derivation(
  name = "event_id",
  expression = "ext_contextual_event_id"
)

Collaborator @pengyu-hou, Dec 4, 2024:

Plus one on the ts and ds expressions. Those values will be derived from the request time.

)
).asJava)


val today = tableUtils.partitionSpec.at(System.currentTimeMillis())
val (_, aggregationsMetadata) =
new Analyzer(tableUtils, joinConf, monthAgo, today).analyzeJoin(joinConf, enableHitter = false)
aggregationsMetadata.foreach(agg => assertTrue(agg.operation == "Derivation"))
assertTrue(aggregationsMetadata.exists(_.name == "test_feature_name"))
}
}