-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-13721][SQL] Support outer generators in DataFrame API #16608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -175,6 +175,9 @@ object FunctionRegistry { | |
| expression[NullIf]("nullif"), | ||
| expression[Nvl]("nvl"), | ||
| expression[Nvl2]("nvl2"), | ||
| expression[OuterExplode]("outer_explode"), | ||
|
||
| expression[OuterInline]("outer_inline"), | ||
|
||
| expression[OuterPosExplode]("outer_posexplode"), | ||
| expression[PosExplode]("posexplode"), | ||
| expression[Rand]("rand"), | ||
| expression[Randn]("randn"), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -204,6 +204,17 @@ case class Stack(children: Seq[Expression]) extends Generator { | |
| } | ||
| } | ||
|
|
||
| case class GeneratorOuter(child: Generator) extends UnaryExpression | ||
| with Generator { | ||
|
||
|
|
||
| final override def eval(input: InternalRow = null): TraversableOnce[InternalRow] = | ||
| throw new UnsupportedOperationException(s"Cannot evaluate expression: $this") | ||
|
|
||
| final override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = | ||
| throw new UnsupportedOperationException(s"Cannot evaluate expression: $this") | ||
|
|
||
| override def elementSchema: StructType = child.elementSchema | ||
| } | ||
| /** | ||
| * A base class for [[Explode]] and [[PosExplode]]. | ||
| */ | ||
|
|
@@ -233,11 +244,11 @@ abstract class ExplodeBase extends UnaryExpression with CollectionGenerator with | |
| if (position) { | ||
| new StructType() | ||
| .add("pos", IntegerType, nullable = false) | ||
| .add("key", kt, nullable = false) | ||
| .add("key", kt, nullable = true) | ||
|
||
| .add("value", vt, valueContainsNull) | ||
| } else { | ||
| new StructType() | ||
| .add("key", kt, nullable = false) | ||
| .add("key", kt, nullable = true) | ||
| .add("value", vt, valueContainsNull) | ||
| } | ||
| } | ||
|
|
@@ -300,7 +311,7 @@ abstract class ExplodeBase extends UnaryExpression with CollectionGenerator with | |
| case class Explode(child: Expression) extends ExplodeBase { | ||
| override val position: Boolean = false | ||
| } | ||
|
|
||
| class OuterExplode(child: Expression) extends GeneratorOuter(Explode(child)) | ||
|
||
| /** | ||
| * Given an input array produces a sequence of rows for each position and value in the array. | ||
| * | ||
|
|
@@ -323,7 +334,7 @@ case class Explode(child: Expression) extends ExplodeBase { | |
| case class PosExplode(child: Expression) extends ExplodeBase { | ||
| override val position = true | ||
| } | ||
|
|
||
| class OuterPosExplode(child: Expression) extends GeneratorOuter(PosExplode(child)) | ||
|
||
| /** | ||
| * Explodes an array of structs into a table. | ||
| */ | ||
|
|
@@ -369,3 +380,5 @@ case class Inline(child: Expression) extends UnaryExpression with CollectionGene | |
| child.genCode(ctx) | ||
| } | ||
| } | ||
|
|
||
| class OuterInline(child: Expression) extends GeneratorOuter(Inline(child)) | ||
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -160,9 +160,20 @@ case class GenerateExec( | |
|
|
||
| // Generate looping variables. | ||
| val index = ctx.freshName("index") | ||
| val numElements = ctx.freshName("numElements") | ||
|
|
||
| // In case of outer=true we need to make sure the loop is executed at-least once when the | ||
| // array/map contains no input. | ||
| // generateOuter is an int. it is set to 1 iff outer is true and the input is empty or null. | ||
| val generateOuter = ctx.freshName("generateOuter") | ||
| val isOuter = if (outer) { | ||
|
||
| "true" | ||
| } else { | ||
| "false" | ||
| } | ||
|
|
||
| // Add a check if the generate outer flag is true. | ||
| val checks = optionalCode(outer, data.isNull) | ||
| val checks = optionalCode(outer, s"($generateOuter == 1)") | ||
|
|
||
| // Add position | ||
| val position = if (e.position) { | ||
|
|
@@ -199,21 +210,13 @@ case class GenerateExec( | |
| (initArrayData, "", values) | ||
| } | ||
|
|
||
| // In case of outer=true we need to make sure the loop is executed at-least once when the | ||
| // array/map contains no input. We do this by setting the looping index to -1 if there is no | ||
| // input, evaluation of the array is prevented by a check in the accessor code. | ||
| val numElements = ctx.freshName("numElements") | ||
| val init = if (outer) { | ||
| s"$numElements == 0 ? -1 : 0" | ||
| } else { | ||
| "0" | ||
| } | ||
| val numOutput = metricTerm(ctx, "numOutputRows") | ||
| s""" | ||
| |${data.code} | ||
| |$initMapData | ||
| |int $numElements = ${data.isNull} ? 0 : ${data.value}.numElements(); | ||
| |for (int $index = $init; $index < $numElements; $index++) { | ||
| |int $generateOuter = ($numElements == 0 && $isOuter) ? 1 : 0; | ||
| |for (int $index = 0; $index < $numElements + $generateOuter; $index++) { | ||
| | $numOutput.add(1); | ||
| | $updateRowData | ||
| | ${consume(ctx, input ++ position ++ values)} | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2870,6 +2870,15 @@ object functions { | |
| */ | ||
| def explode(e: Column): Column = withExpr { Explode(e.expr) } | ||
|
|
||
| /** | ||
| * Creates a new row for each element in the given array or map column. | ||
| * Unlike explode, if the array/map is null or empty then null is produced. | ||
| * | ||
| * @group collection_funcs | ||
| * @since 2.2.0 | ||
| */ | ||
| def outer_explode(e: Column): Column = withExpr { new OuterExplode(e.expr) } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Naming: |
||
|
|
||
| /** | ||
| * Creates a new row for each element with position in the given array or map column. | ||
| * | ||
|
|
@@ -2878,6 +2887,15 @@ object functions { | |
| */ | ||
| def posexplode(e: Column): Column = withExpr { PosExplode(e.expr) } | ||
|
|
||
| /** | ||
| * Creates a new row for each element with position in the given array or map column. | ||
| * Unlike posexplode, if the array/map is null or empty then the row (0, null) is produced. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. update this the result. |
||
| * | ||
| * @group collection_funcs | ||
| * @since 2.2.0 | ||
| */ | ||
| def outer_posexplode(e: Column): Column = withExpr { new OuterPosExplode(e.expr) } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Naming: |
||
|
|
||
| /** | ||
| * Extracts json object from a json string based on json path specified, and returns json string | ||
| * of the extracted json object. It will return null if the input json string is invalid. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,13 +86,25 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { | |
| df.select(explode('intList)), | ||
| Row(1) :: Row(2) :: Row(3) :: Nil) | ||
| } | ||
| test("single outer_explode") { | ||
|
||
| val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") | ||
| checkAnswer( | ||
| df.select(outer_explode('intList)), | ||
| Row(1) :: Row(2) :: Row(3) :: Row(0) :: Nil) | ||
| } | ||
|
|
||
| test("single posexplode") { | ||
| val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") | ||
| checkAnswer( | ||
| df.select(posexplode('intList)), | ||
| Row(0, 1) :: Row(1, 2) :: Row(2, 3) :: Nil) | ||
| } | ||
| test("single outer_posexplode") { | ||
| val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") | ||
| checkAnswer( | ||
| df.select(outer_posexplode('intList)), | ||
| Row(0, 1) :: Row(1, 2) :: Row(2, 3) :: Row(0, 0) :: Nil) | ||
| } | ||
|
|
||
| test("explode and other columns") { | ||
| val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") | ||
|
|
@@ -109,6 +121,25 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { | |
| Row(1, Seq(1, 2, 3), 2) :: | ||
| Row(1, Seq(1, 2, 3), 3) :: Nil) | ||
| } | ||
| test("outer_explode and other columns") { | ||
| val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") | ||
|
|
||
| checkAnswer( | ||
| df.select($"a", outer_explode('intList)), | ||
| Row(1, 1) :: | ||
| Row(1, 2) :: | ||
| Row(1, 3) :: | ||
| Row(2, 0) :: | ||
| Nil) | ||
|
|
||
| checkAnswer( | ||
| df.select($"*", outer_explode('intList)), | ||
| Row(1, Seq(1, 2, 3), 1) :: | ||
| Row(1, Seq(1, 2, 3), 2) :: | ||
| Row(1, Seq(1, 2, 3), 3) :: | ||
| Row(2, Seq(), 0) :: | ||
| Nil) | ||
| } | ||
|
|
||
| test("aliased explode") { | ||
| val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") | ||
|
|
@@ -122,13 +153,33 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { | |
| Row(6) :: Nil) | ||
| } | ||
|
|
||
| test("aliased outer_explode") { | ||
| val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") | ||
|
|
||
| checkAnswer( | ||
| df.select(outer_explode('intList).as('int)).select('int), | ||
| Row(1) :: Row(2) :: Row(3) :: Row(0) :: Nil) | ||
|
|
||
| checkAnswer( | ||
| df.select(explode('intList).as('int)).select(sum('int)), | ||
| Row(6) :: Nil) | ||
| } | ||
|
|
||
| test("explode on map") { | ||
| val df = Seq((1, Map("a" -> "b"))).toDF("a", "map") | ||
|
|
||
| checkAnswer( | ||
| df.select(explode('map)), | ||
| Row("a", "b")) | ||
| } | ||
| test("outer_explode on map") { | ||
| val df = Seq((1, Map("a" -> "b")), (2, Map[String, String]()), | ||
| (3, Map("c" -> "d"))).toDF("a", "map") | ||
|
|
||
| checkAnswer( | ||
| df.select(outer_explode('map)), | ||
| Row("a", "b") :: Row(null, null) :: Row("c", "d") :: Nil) | ||
| } | ||
|
|
||
| test("explode on map with aliases") { | ||
| val df = Seq((1, Map("a" -> "b"))).toDF("a", "map") | ||
|
|
@@ -138,6 +189,14 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { | |
| Row("a", "b")) | ||
| } | ||
|
|
||
| test("outer_explode on map with aliases") { | ||
| val df = Seq((3, None), (1, Some(Map("a" -> "b")))).toDF("a", "map") | ||
|
|
||
| checkAnswer( | ||
| df.select(outer_explode('map).as("key1" :: "value1" :: Nil)).select("key1", "value1"), | ||
| Row("a", "b") :: Row(null, null) :: Nil) | ||
| } | ||
|
|
||
| test("self join explode") { | ||
| val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") | ||
| val exploded = df.select(explode('intList).as('i)) | ||
|
|
@@ -206,6 +265,18 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { | |
| df.selectExpr("array(struct(a), named_struct('a', b))").selectExpr("inline(*)"), | ||
| Row(1) :: Row(2) :: Nil) | ||
| } | ||
| test("outer_inline") { | ||
| val df = Seq((1, "2"), (3, "4"), (5, "6")).toDF("col1", "col2") | ||
| val df2 = df.select(when('col1 === 1, null).otherwise(array(struct('col1, 'col2))).as("col1")) | ||
| checkAnswer( | ||
| df2.selectExpr("inline(col1)"), | ||
| Row(3, "4") :: Row(5, "6") :: Nil | ||
| ) | ||
| checkAnswer( | ||
| df2.selectExpr("outer_inline(col1)"), | ||
| Row(0, null) :: Row(3, "4") :: Row(5, "6") :: Nil | ||
| ) | ||
| } | ||
|
|
||
| test("SPARK-14986: Outer lateral view with empty generate expression") { | ||
| checkAnswer( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,6 +102,9 @@ class ExpressionToSQLSuite extends SQLBuilderTest with SQLTestUtils { | |
| checkSqlGeneration("SELECT map(1, 'a', 2, 'b')") | ||
| checkSqlGeneration("SELECT named_struct('c1',1,'c2',2,'c3',3)") | ||
| checkSqlGeneration("SELECT nanvl(a, 5), nanvl(b, 10), nanvl(d, c) from t2") | ||
| checkSqlGeneration("SELECT outer_explode(array())") | ||
|
||
| checkSqlGeneration("SELECT outer_posexplode(array())") | ||
| checkSqlGeneration("SELECT outer_inline(array(struct('a', 1)))") | ||
| checkSqlGeneration("SELECT rand(1)") | ||
| checkSqlGeneration("SELECT randn(3)") | ||
| checkSqlGeneration("SELECT struct(1,2,3)") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
document what the return value means (especially that boolean value, but also the Seq[String] that's preexisting)