1717
1818package org .apache .spark .sql
1919
20+ import java .beans .Introspector
2021import java .util .Properties
2122
2223import scala .collection .immutable
2324import scala .language .implicitConversions
2425import scala .reflect .runtime .universe .TypeTag
2526
26- import org .apache .hadoop .conf .Configuration
2727import org .apache .spark .SparkContext
2828import org .apache .spark .annotation .{AlphaComponent , DeveloperApi , Experimental }
29+ import org .apache .spark .api .java .JavaRDD
2930import org .apache .spark .rdd .RDD
3031import org .apache .spark .sql .catalyst .ScalaReflection
3132import org .apache .spark .sql .catalyst .analysis ._
@@ -36,9 +37,9 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
3637import org .apache .spark .sql .catalyst .rules .RuleExecutor
3738import org .apache .spark .sql .execution ._
3839import org .apache .spark .sql .json ._
39- import org .apache .spark .sql .parquet .ParquetRelation
40- import org .apache .spark .sql .sources .{BaseRelation , DDLParser , DataSourceStrategy , LogicalRelation }
40+ import org .apache .spark .sql .sources .{LogicalRelation , BaseRelation , DDLParser , DataSourceStrategy }
4141import org .apache .spark .sql .types ._
42+ import org .apache .spark .util .Utils
4243
4344/**
4445 * :: AlphaComponent ::
@@ -59,7 +60,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
5960 self =>
6061
// Note that this is a lazy val so we can override the default value in subclasses.
// Visibility is protected[sql] (rather than private[sql]) so that subclasses can access
// and override the configuration object.
protected[sql] lazy val conf: SQLConf = new SQLConf
6364
/**
 * Set Spark SQL configuration properties.
 *
 * @param props properties handed to this context's [[SQLConf]]
 */
def setConf(props: Properties): Unit = {
  conf.setConf(props)
}
@@ -117,15 +118,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
117118 case _ =>
118119 }
119120
120- /**
121- * :: DeveloperApi ::
122- * Allows catalyst LogicalPlans to be executed as a SchemaRDD. Note that the LogicalPlan
123- * interface is considered internal, and thus not guaranteed to be stable. As a result, using
124- * them directly is not recommended.
125- */
126- @ DeveloperApi
127- implicit def logicalPlanToSparkQuery (plan : LogicalPlan ): SchemaRDD = new SchemaRDD (this , plan)
128-
129121 /**
130122 * Creates a SchemaRDD from an RDD of case classes.
131123 *
@@ -139,8 +131,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
139131 new SchemaRDD (this , LogicalRDD (attributeSeq, rowRDD)(self))
140132 }
141133
/**
 * Convert a [[BaseRelation]] created for external data sources into a [[SchemaRDD]].
 *
 * @param baseRelation the relation to expose as a SchemaRDD
 * @return a SchemaRDD bound to this context whose logical plan is a [[LogicalRelation]]
 *         wrapping `baseRelation`
 */
def baseRelationToSchemaRDD(baseRelation: BaseRelation): SchemaRDD = {
  // Build the SchemaRDD directly instead of relying on an implicit LogicalPlan conversion.
  new SchemaRDD(this, LogicalRelation(baseRelation))
}
145140
146141 /**
@@ -181,6 +176,43 @@ class SQLContext(@transient val sparkContext: SparkContext)
181176 new SchemaRDD (this , logicalPlan)
182177 }
183178
/**
 * Applies a schema to an RDD of Java Beans.
 *
 * WARNING: Since there is no guaranteed ordering for fields in a Java Bean,
 * SELECT * queries will return the columns in an undefined order.
 *
 * @param rdd RDD whose elements are read through the getters of `beanClass`
 * @param beanClass bean class whose properties (as reported by `Introspector`) define the
 *                  columns of the result
 * @return a SchemaRDD with one column per bean property
 */
def applySchema(rdd: RDD[_], beanClass: Class[_]): SchemaRDD = {
  val attributeSeq = getSchema(beanClass)
  val className = beanClass.getName
  val rowRdd = rdd.mapPartitions { iter =>
    // BeanInfo is not serializable so we must rediscover it remotely for each partition.
    val localBeanInfo = Introspector.getBeanInfo(
      Class.forName(className, true, Utils.getContextOrSparkClassLoader))
    // Every bean reports a "class" property (from Object.getClass); it is not a data column.
    val extractors =
      localBeanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
    // Pair each getter with its column type once per partition rather than once per row.
    val gettersAndTypes = extractors.zip(attributeSeq).map { case (getter, attr) =>
      (getter, attr.dataType)
    }

    iter.map { row =>
      new GenericRow(
        gettersAndTypes.map { case (getter, dataType) =>
          DataTypeConversions.convertJavaToCatalyst(getter.invoke(row), dataType)
        }.toArray[Any]
      ): Row
    }
  }
  new SchemaRDD(this, LogicalRDD(attributeSeq, rowRdd)(this))
}
205+
/**
 * Applies a schema to a JavaRDD of Java Beans by delegating to the RDD-based overload.
 *
 * WARNING: Since there is no guaranteed ordering for fields in a Java Bean,
 * SELECT * queries will return the columns in an undefined order.
 *
 * @param rdd JavaRDD whose underlying RDD holds the beans
 * @param beanClass bean class that defines the columns of the result
 */
def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): SchemaRDD = {
  val underlying = rdd.rdd
  applySchema(underlying, beanClass)
}
215+
184216 /**
185217 * Loads a Parquet file, returning the result as a [[SchemaRDD ]].
186218 *
@@ -259,41 +291,6 @@ class SQLContext(@transient val sparkContext: SparkContext)
259291 applySchema(rowRDD, appliedSchema)
260292 }
261293
262- /**
263- * :: Experimental ::
264- * Creates an empty parquet file with the schema of class `A`, which can be registered as a table.
265- * This registered table can be used as the target of future `insertInto` operations.
266- *
267- * {{{
268- * val sqlContext = new SQLContext(...)
269- * import sqlContext._
270- *
271- * case class Person(name: String, age: Int)
272- * createParquetFile[Person]("path/to/file.parquet").registerTempTable("people")
273- * sql("INSERT INTO people SELECT 'michael', 29")
274- * }}}
275- *
276- * @tparam A A case class type that describes the desired schema of the parquet file to be
277- * created.
278- * @param path The path where the directory containing parquet metadata should be created.
279- * Data inserted into this table will also be stored at this location.
280- * @param allowExisting When false, an exception will be thrown if this directory already exists.
281- * @param conf A Hadoop configuration object that can be used to specify options to the parquet
282- * output format.
283- *
284- * @group userf
285- */
286- @ Experimental
287- def createParquetFile [A <: Product : TypeTag ](
288- path : String ,
289- allowExisting : Boolean = true ,
290- conf : Configuration = new Configuration ()): SchemaRDD = {
291- new SchemaRDD (
292- this ,
293- ParquetRelation .createEmpty(
294- path, ScalaReflection .attributesFor[A ], allowExisting, conf, this ))
295- }
296-
297294 /**
298295 * Registers the given RDD as a temporary table in the catalog. Temporary tables exist only
299296 * during the lifetime of this instance of SQLContext.
@@ -336,12 +333,10 @@ class SQLContext(@transient val sparkContext: SparkContext)
336333 new SchemaRDD (this , catalog.lookupRelation(Seq (tableName)))
337334
/**
 * A collection of methods that are considered experimental, but can be used to hook into
 * the query planner for advanced functionalities.
 */
val experimental: ExperimentalMethods = new ExperimentalMethods(this)
345340
346341 protected [sql] class SparkPlanner extends SparkStrategies {
347342 val sparkContext : SparkContext = self.sparkContext
@@ -353,7 +348,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
353348 def numPartitions = self.conf.numShufflePartitions
354349
355350 def strategies : Seq [Strategy ] =
356- extraStrategies ++ (
351+ experimental. extraStrategies ++ (
357352 DataSourceStrategy ::
358353 DDLStrategy ::
359354 TakeOrdered ::
@@ -479,14 +474,14 @@ class SQLContext(@transient val sparkContext: SparkContext)
479474 * have the same format as the one generated by `toString` in scala.
480475 * It is only used by PySpark.
481476 */
protected[sql] def parseDataType(dataTypeString: String): DataType = {
  // Delegates the decoding of the schema string to DataType.fromJson.
  DataType.fromJson(dataTypeString)
}
485480
486481 /**
487482 * Apply a schema defined by the schemaString to an RDD. It is only used by PySpark.
488483 */
489- private [sql] def applySchemaToPythonRDD (
484+ protected [sql] def applySchemaToPythonRDD (
490485 rdd : RDD [Array [Any ]],
491486 schemaString : String ): SchemaRDD = {
492487 val schema = parseDataType(schemaString).asInstanceOf [StructType ]
@@ -496,7 +491,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
496491 /**
497492 * Apply a schema defined by the schema to an RDD. It is only used by PySpark.
498493 */
499- private [sql] def applySchemaToPythonRDD (
494+ protected [sql] def applySchemaToPythonRDD (
500495 rdd : RDD [Array [Any ]],
501496 schema : StructType ): SchemaRDD = {
502497
@@ -527,4 +522,43 @@ class SQLContext(@transient val sparkContext: SparkContext)
527522
528523 new SchemaRDD (this , LogicalRDD (schema.toAttributes, rowRdd)(self))
529524 }
525+
/**
 * Returns a Catalyst Schema for the given java bean class.
 *
 * Primitive property types map to non-nullable columns; boxed types, strings, decimals,
 * dates, timestamps and user-defined types map to nullable columns.
 *
 * @param beanClass bean class to introspect
 * @return one attribute per bean property, in the order reported by `Introspector`
 * @throws IllegalArgumentException if a property has a type with no Catalyst mapping here
 */
protected def getSchema(beanClass: Class[_]): Seq[AttributeReference] = {
  // TODO: All of this could probably be moved to Catalyst as it is mostly not Spark specific.
  val beanInfo = Introspector.getBeanInfo(beanClass)

  // Note: The ordering of elements may differ from when the schema is inferred in Scala.
  //       This is because beanInfo.getPropertyDescriptors gives no guarantees about
  //       element ordering.
  // Every bean reports a "class" property (from Object.getClass); it is not a data column.
  val fields = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class")
  fields.map { property =>
    val (dataType, nullable) = property.getPropertyType match {
      case c: Class[_] if c.isAnnotationPresent(classOf[SQLUserDefinedType]) =>
        (c.getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance(), true)
      case c: Class[_] if c == classOf[java.lang.String] => (StringType, true)

      // Primitive types can never hold null.
      case c: Class[_] if c == java.lang.Short.TYPE => (ShortType, false)
      case c: Class[_] if c == java.lang.Integer.TYPE => (IntegerType, false)
      case c: Class[_] if c == java.lang.Long.TYPE => (LongType, false)
      case c: Class[_] if c == java.lang.Double.TYPE => (DoubleType, false)
      case c: Class[_] if c == java.lang.Byte.TYPE => (ByteType, false)
      case c: Class[_] if c == java.lang.Float.TYPE => (FloatType, false)
      case c: Class[_] if c == java.lang.Boolean.TYPE => (BooleanType, false)

      // Boxed and reference types may be null.
      case c: Class[_] if c == classOf[java.lang.Short] => (ShortType, true)
      case c: Class[_] if c == classOf[java.lang.Integer] => (IntegerType, true)
      case c: Class[_] if c == classOf[java.lang.Long] => (LongType, true)
      case c: Class[_] if c == classOf[java.lang.Double] => (DoubleType, true)
      case c: Class[_] if c == classOf[java.lang.Byte] => (ByteType, true)
      case c: Class[_] if c == classOf[java.lang.Float] => (FloatType, true)
      case c: Class[_] if c == classOf[java.lang.Boolean] => (BooleanType, true)
      case c: Class[_] if c == classOf[java.math.BigDecimal] => (DecimalType(), true)
      case c: Class[_] if c == classOf[java.sql.Date] => (DateType, true)
      case c: Class[_] if c == classOf[java.sql.Timestamp] => (TimestampType, true)

      // Fail with a message naming the offending property instead of a bare MatchError.
      case other =>
        throw new IllegalArgumentException(
          s"Unsupported type $other for property '${property.getName}' of bean class " +
            beanClass.getName)
    }
    AttributeReference(property.getName, dataType, nullable)()
  }
}
530564}
0 commit comments