-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23736][SQL] Extending the concat function to support array columns #20858
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
282e724
aa5a089
90d3ab7
bb46c3d
11205af
753499d
2efdd77
fd84bee
116f91f
e199ac5
067c2db
090929f
8abd1a8
367ee22
6bb33e6
57b250c
944e0c9
7f5124b
0201e4b
600ae89
f2a67e8
8a125d9
5a4cc8c
f7bdcf7
36d5d25
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -308,7 +308,6 @@ object FunctionRegistry { | |
| expression[BitLength]("bit_length"), | ||
| expression[Length]("char_length"), | ||
| expression[Length]("character_length"), | ||
| expression[Concat]("concat"), | ||
| expression[ConcatWs]("concat_ws"), | ||
| expression[Decode]("decode"), | ||
| expression[Elt]("elt"), | ||
|
|
@@ -408,6 +407,7 @@ object FunctionRegistry { | |
| expression[MapValues]("map_values"), | ||
| expression[Size]("size"), | ||
| expression[SortArray]("sort_array"), | ||
| expression[UnresolvedConcat]("concat"), | ||
|
||
| CreateStruct.registryEntry, | ||
|
|
||
| // misc functions | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -699,3 +699,90 @@ abstract class TernaryExpression extends Expression { | |
| * and Hive function wrappers. | ||
| */ | ||
| trait UserDefinedExpression | ||
|
|
||
| /** | ||
| * The trait covers logic for performing null safe evaluation and code generation. | ||
| */ | ||
| trait NullSafeEvaluation extends Expression | ||
|
||
| { | ||
| override def foldable: Boolean = children.forall(_.foldable) | ||
|
|
||
| override def nullable: Boolean = children.exists(_.nullable) | ||
|
|
||
| /** | ||
| * Default behavior of evaluation according to the default nullability of NullSafeEvaluation. | ||
| * If a class utilizing NullSaveEvaluation override [[nullable]], probably should also | ||
| * override this. | ||
| */ | ||
| override def eval(input: InternalRow): Any = | ||
| { | ||
|
||
| val values = children.toStream.map(_.eval(input)) | ||
| if (values.contains(null)) { | ||
| null | ||
| } else { | ||
| nullSafeEval(values) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Called by default [[eval]] implementation. If a class utilizing NullSaveEvaluation keep | ||
| * the default nullability, they can override this method to save null-check code. If we need | ||
| * full control of evaluation process, we should override [[eval]]. | ||
| */ | ||
| protected def nullSafeEval(inputs: Seq[Any]): Any = | ||
| sys.error(s"The class utilizing NullSaveEvaluation must override either eval or nullSafeEval") | ||
|
|
||
| /** | ||
| * Short hand for generating of null save evaluation code. | ||
| * If either of the sub-expressions is null, the result of this computation | ||
| * is assumed to be null. | ||
| * | ||
| * @param f accepts a sequence of variable names and returns Java code to compute the output. | ||
| */ | ||
| protected def defineCodeGen( | ||
| ctx: CodegenContext, | ||
| ev: ExprCode, | ||
| f: Seq[String] => String): ExprCode = { | ||
| nullSafeCodeGen(ctx, ev, values => { | ||
| s"${ev.value} = ${f(values)};" | ||
| }) | ||
| } | ||
|
|
||
| /** | ||
| * Called by expressions to generate null safe evaluation code. | ||
| * If either of the sub-expressions is null, the result of this computation | ||
| * is assumed to be null. | ||
| * | ||
| * @param f a function that accepts a sequence of non-null evaluation result names of children | ||
| * and returns Java code to compute the output. | ||
| */ | ||
| protected def nullSafeCodeGen( | ||
|
||
| ctx: CodegenContext, | ||
| ev: ExprCode, | ||
| f: Seq[String] => String): ExprCode = { | ||
| val gens = children.map(_.genCode(ctx)) | ||
| val resultCode = f(gens.map(_.value)) | ||
|
|
||
| if (nullable) { | ||
| val nullSafeEval = children.zip(gens).foldRight(s""" | ||
| ${ev.isNull} = false; // resultCode could change nullability. | ||
| $resultCode | ||
| """) { | ||
| case ((child, gen), acc) => | ||
| gen.code + ctx.nullSafeExec(child.nullable, gen.isNull)(acc) | ||
|
||
| } | ||
|
|
||
| ev.copy(code = s""" | ||
| boolean ${ev.isNull} = true; | ||
| ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; | ||
| $nullSafeEval | ||
| """) | ||
| } else { | ||
| ev.copy(code = s""" | ||
| boolean ${ev.isNull} = false; | ||
| ${gens.map(_.code).mkString("\n")} | ||
| ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; | ||
| $resultCode""", isNull = "false") | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -21,8 +21,10 @@ import java.util.Comparator | |||
| import org.apache.spark.sql.catalyst.InternalRow | ||||
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||||
| import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, CodegenFallback, ExprCode} | ||||
| import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData} | ||||
| import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, TypeUtils} | ||||
| import org.apache.spark.sql.types._ | ||||
| import org.apache.spark.unsafe.Platform | ||||
| import org.apache.spark.unsafe.array.ByteArrayMethods | ||||
|
|
||||
| /** | ||||
| * Given an array or map, returns its size. Returns -1 if null. | ||||
|
|
@@ -287,3 +289,171 @@ case class ArrayContains(left: Expression, right: Expression) | |||
|
|
||||
| override def prettyName: String = "array_contains" | ||||
| } | ||||
|
|
||||
| /** | ||||
| * Replaces [[org.apache.spark.sql.catalyst.analysis.UnresolvedConcat UnresolvedConcat]]s | ||||
| * with concrete concate expressions. | ||||
| */ | ||||
| object ResolveConcat | ||||
| { | ||||
|
||||
| def apply(children: Seq[Expression]): Expression = { | ||||
| if (children.nonEmpty && ArrayType.acceptsType(children(0).dataType)) { | ||||
| ConcatArrays(children) | ||||
| } else { | ||||
| Concat(children) | ||||
| } | ||||
| } | ||||
| } | ||||
|
|
||||
| /** | ||||
| * Concatenates multiple arrays into one. | ||||
| */ | ||||
| @ExpressionDescription( | ||||
| usage = "_FUNC_(expr, ...) - Concatenates multiple arrays of the same type into one.", | ||||
| examples = """ | ||||
| Examples: | ||||
| > SELECT _FUNC_(array(1, 2, 3), array(4, 5), array(6)); | ||||
| [1,2,3,4,5,6] | ||||
| """, | ||||
| since = "2.4.0") | ||||
| case class ConcatArrays(children: Seq[Expression]) extends Expression with NullSafeEvaluation { | ||||
|
||||
| object CombineConcats extends Rule[LogicalPlan] { |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we just put this in checkInputDataTypes?
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we allow empty children? I can't think of a use case for now and we should better disallow it first.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Definitely share your opinion, but I think we should be consistent across the whole Spark SQL API. Functions like concat and concat_ws accept empty children as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm .. but then this is array<null> when the children are empty. Seems CreateArray's type is array<string> in this case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, changing to return type array<string> when no children are provided. Also I've created the jira ticket SPARK-23798 since I don't see any reason why it couldn't return a default concrete type in this case. Hope I don't miss anything.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To use mkString may lead to a compilation error due to 64KB bytecode limitation if there are lots of childrens. Would it be possible to use CodegenContext.splitExpressions()?
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't we concate complex elements into UnsafeArrayData?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, can we reuse the UnsafeArrayWriter logic for this case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Really like this idea! I think it would require moving the complex type insertion logic from InterprettedUnsafeProjection directly to UnsafeDataWriter and introduce in that way write methods for complex type fields. I'm not sure whether this big refactoring task is still in the scope of this PR.
Also see that we could improve codeGen of CreateArray in the same way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You couldn't use UnsafeArrayData in the complex case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, currently there are no write methods on UnsafeArrayWriter or set methods on UnsafeArrayData that we could leverage for complex types. In theory, we could follow the same approach as in InterprettedUnsafeProjection and each complex type to a byte array and subsequently insert the produced byte array into the target UnsafeArrayData. Since this logic could be utilized from more places (e.g. CreateArray), it should be encapsulated into UnsafeArrayWriter or UnsafeArrayData at first. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did we move this down .. ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The whole file is divide into sections according to groups of functions. Based on @gatorsmile's suggestion, the concat function should be categorized as a collection function. So I moved the function to comply with the file structure.