apache · viirya · Mar 7, 2018 · Mar 8, 2018 · Mar 8, 2018 · Mar 8, 2018
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.objects
 
 import java.lang.reflect.Modifier
 
+import scala.collection.JavaConverters._
 import scala.collection.mutable.Builder
 import scala.language.existentials
 import scala.reflect.ClassTag
@@ -501,12 +502,22 @@ case class LambdaVariable(
     value: String,
     isNull: String,
     dataType: DataType,
-    nullable: Boolean = true) extends LeafExpression
-  with Unevaluable with NonSQLExpression {
+    nullable: Boolean = true) extends LeafExpression with NonSQLExpression {
+
+  // Interpreted execution of `LambdaVariable` always gets the 0-index element from the input row.
+  override def eval(input: InternalRow): Any = {
+    assert(input.numFields == 1,
+      "The input row of interpreted LambdaVariable should have only 1 field.")
+    input.get(0, dataType)
+  }
 
   override def genCode(ctx: CodegenContext): ExprCode = {
     ExprCode(code = "", value = value, isNull = if (nullable) isNull else "false")
   }
+
+  // This won't be called as `genCode` is overridden; it is only defined to make
+  // `LambdaVariable` non-abstract.
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ev
 }
 
 /**
@@ -599,8 +610,71 @@ case class MapObjects private(
 
   override def children: Seq[Expression] = lambdaFunction :: inputData :: Nil
 
-  override def eval(input: InternalRow): Any =
-    throw new UnsupportedOperationException("Only code-generated evaluation is supported")
+  // Values of a PythonUserDefinedType are physically stored as its underlying sqlType, so
+  // MapObjects has to operate on that sqlType instead of the UDT itself.
+  lazy private val inputDataType = inputData.dataType match {
+    case udt: PythonUserDefinedType => udt.sqlType
+    case other => other
+  }
+
+  // Applies `lambdaFunction` to each element, passing the element as a single-field row.
+  private def executeFuncOnCollection(inputCollection: Seq[_]): Seq[_] = {
+    def applyLambda(item: Any): Any = lambdaFunction.eval(InternalRow.fromSeq(Seq(item)))
+
+    inputCollection.map(applyLambda)
+  }
+
+  // Interpreted path: maps `lambdaFunction` over the evaluated `inputData` and packs the results
+  // per `customCollectionCls`. Null input yields null; unsupported types fail with a clear error.
+  override def eval(input: InternalRow): Any = inputData.eval(input) match {
+    case null => null
+    case inputCollection =>
+      val results = inputDataType match {
+        case ObjectType(cls) if classOf[Seq[_]].isAssignableFrom(cls) =>
+          executeFuncOnCollection(inputCollection.asInstanceOf[Seq[_]])
+        case ObjectType(cls) if cls.isArray =>
+          executeFuncOnCollection(inputCollection.asInstanceOf[Array[_]].toSeq)
+        case ObjectType(cls) if classOf[java.util.List[_]].isAssignableFrom(cls) =>
+          executeFuncOnCollection(inputCollection.asInstanceOf[java.util.List[_]].asScala)
+        case ObjectType(cls) if cls == classOf[Object] && inputCollection.getClass.isArray =>
+          executeFuncOnCollection(inputCollection.asInstanceOf[Array[_]].toSeq)
+        case ObjectType(cls) if cls == classOf[Object] =>
+          executeFuncOnCollection(inputCollection.asInstanceOf[Seq[_]])
+        case _: ArrayType =>
+          executeFuncOnCollection(inputCollection.asInstanceOf[ArrayData].array)
+        case other =>
+          throw new UnsupportedOperationException(s"Unsupported input data type: $other")
+      }
+
+      customCollectionCls match {
+        case Some(cls) if classOf[Seq[_]].isAssignableFrom(cls) =>
+          // Scala sequence
+          results.toSeq
+        case Some(cls) if classOf[scala.collection.Set[_]].isAssignableFrom(cls) =>
+          // Scala set
+          results.toSet
+        case Some(cls) if classOf[java.util.List[_]].isAssignableFrom(cls) =>
+          // Java list
+          if (cls == classOf[java.util.List[_]] || cls == classOf[java.util.AbstractList[_]] ||
+              cls == classOf[java.util.AbstractSequentialList[_]]) {
+            results.asJava
+          } else {
+            val builder = Try(cls.getConstructor(Integer.TYPE)).map { constructor =>
+              constructor.newInstance(results.length.asInstanceOf[Object])
+            }.getOrElse {
+              cls.getConstructor().newInstance()
+            }.asInstanceOf[java.util.List[Any]]
+
+            results.foreach(builder.add(_))
+            builder
+          }
+        case Some(cls) =>
+          throw new UnsupportedOperationException(s"Unsupported collection class: ${cls.getName}")
+        case None =>
+          // array
+          new GenericArrayData(results.toArray)
+      }
+  }
 
   override def dataType: DataType =
     customCollectionCls.map(ObjectType.apply).getOrElse(
@@ -647,13 +721,6 @@ case class MapObjects private(
       case _ => ""
     }
 
-    // The data with PythonUserDefinedType are actually stored with the data type of its sqlType.
-    // When we want to apply MapObjects on it, we have to use it.
-    val inputDataType = inputData.dataType match {
-      case p: PythonUserDefinedType => p.sqlType
-      case _ => inputData.dataType
-    }
-
     // `MapObjects` generates a while loop to traverse the elements of the input collection. We
     // need to take care of Seq and List because they may have O(n) complexity for indexed accessing
     // like `list.get(1)`. Here we use Iterator to traverse Seq and List.

diff --git a/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala
@@ -17,13 +17,15 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import scala.collection.JavaConverters._
+
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.objects._
-import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, GenericArrayData}
 import org.apache.spark.sql.types._
 
 
@@ -123,4 +125,55 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       checkEvaluation(encodeUsingSerializer, null, InternalRow.fromSeq(Seq(null)))
     }
   }
+
+  test("SPARK-23587: MapObjects should support interpreted execution") {
+    // Output collection classes to request; `null` means no custom class (ArrayData result).
+    val customCollectionClasses = Seq(classOf[Seq[Int]], classOf[scala.collection.Set[Int]],
+      classOf[java.util.List[Int]], classOf[java.util.AbstractList[Int]],
+      classOf[java.util.AbstractSequentialList[Int]], classOf[java.util.Vector[Int]],
+      classOf[java.util.Stack[Int]], null)
+    val function = (lambda: Expression) => Add(lambda, Literal(1))
+    val elementType = IntegerType
+    val expected = Seq(2, 3, 4)
+
+    val list = new java.util.ArrayList[Int]()
+    list.add(1)
+    list.add(2)
+    list.add(3)
+    val arrayData = new GenericArrayData(Array(1, 2, 3))
+    val vector = new java.util.Vector[Int]()
+    vector.add(1)
+    vector.add(2)
+    vector.add(3)
+    val stack = new java.util.Stack[Int]()
+    stack.add(1)
+    stack.add(2)
+    stack.add(3)
+
+    // Every input representation is mapped into every requested output collection class.
+    Seq(
+      (Seq(1, 2, 3), ObjectType(classOf[Seq[Int]])),
+      (list, ObjectType(classOf[java.util.List[Int]])),
+      (vector, ObjectType(classOf[java.util.Vector[Int]])),
+      (stack, ObjectType(classOf[java.util.Stack[Int]])),
+      (arrayData, ArrayType(IntegerType))
+    ).foreach { case (collection, inputType) =>
+      val inputObject = BoundReference(0, inputType, nullable = true)
+      customCollectionClasses.foreach { customCollectionCls =>
+        val optClass = Option(customCollectionCls)
+        val mapObj = MapObjects(function, inputObject, elementType, true, optClass)
+        val result = mapObj.eval(InternalRow.fromSeq(Seq(collection)))
+        customCollectionCls match {
+          case null =>
+            assert(result.asInstanceOf[ArrayData].array.toSeq == expected)
+          case l if classOf[java.util.List[_]].isAssignableFrom(l) =>
+            assert(result.asInstanceOf[java.util.List[_]].asScala.toSeq == expected)
+          case s if classOf[Seq[_]].isAssignableFrom(s) =>
+            assert(result.asInstanceOf[Seq[_]].toSeq == expected)
+          case s if classOf[scala.collection.Set[_]].isAssignableFrom(s) =>
+            assert(result.asInstanceOf[scala.collection.Set[_]] == expected.toSet)
+        }
+      }
+    }
+  }
 }