[SPARK-2060][SQL] Querying JSON Datasets with SQL and DSL in Spark SQL #999
**HiveTypeCoercion.scala**

```scala
@@ -108,6 +108,9 @@ trait HiveTypeCoercion {
   * Additionally, all types when UNION-ed with strings will be promoted to strings.
   * Other string conversions are handled by PromoteStrings.
   *
   * A widening conversion of a value with IntegerType and LongType to FloatType,
   * or of a value with LongType to DoubleType, may result in loss of precision.
   */
  object WidenTypes extends Rule[LogicalPlan] {
    // See https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types.
```

**Contributor** (on the new doc comment): Maybe more clear if you document it this way:
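The doc comment above warns that widening Int/Long to Float, or Long to Double, can lose precision. A standalone Scala sketch (not part of the PR, no Spark required) that demonstrates why:

```scala
// Float has a 24-bit mantissa and Double a 53-bit one, so sufficiently
// large integral values cannot round-trip through them.
object WideningPrecisionDemo extends App {
  val i: Int = 100000001
  println(i.toFloat.toInt)         // 100000000: Int -> Float drops the last digit

  val l: Long = 9007199254740993L  // 2^53 + 1
  println(l.toDouble.toLong)       // 9007199254740992: Long -> Double rounds down
}
```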
**org/apache/spark/sql/catalyst/plans/package.scala**

```scala
@@ -17,8 +17,56 @@
package org.apache.spark.sql.catalyst

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.types.{StructField, DataType, ArrayType, StructType}

/**
 * A collection of common abstractions for query plans as well as
 * a base logical plan representation.
 */
package object plans {
  def generateSchemaTreeString(schema: Seq[Attribute]): String = {
    val builder = new StringBuilder
    builder.append("root\n")
    val prefix = " |"
    schema.foreach { attribute =>
      val name = attribute.name
      val dataType = attribute.dataType
      dataType match {
        case fields: StructType =>
          builder.append(s"$prefix-- $name: $StructType\n")
          generateSchemaTreeString(fields, s"$prefix |", builder)
        case ArrayType(fields: StructType) =>
          builder.append(s"$prefix-- $name: $ArrayType[$StructType]\n")
          generateSchemaTreeString(fields, s"$prefix |", builder)
        case ArrayType(elementType: DataType) =>
          builder.append(s"$prefix-- $name: $ArrayType[$elementType]\n")
        case _ => builder.append(s"$prefix-- $name: $dataType\n")
      }
    }

    builder.toString()
  }

  def generateSchemaTreeString(
      schema: StructType,
      prefix: String,
      builder: StringBuilder): StringBuilder = {
    schema.fields.foreach {
      case StructField(name, fields: StructType, _) =>
        builder.append(s"$prefix-- $name: $StructType\n")
        generateSchemaTreeString(fields, s"$prefix |", builder)
      case StructField(name, ArrayType(fields: StructType), _) =>
        builder.append(s"$prefix-- $name: $ArrayType[$StructType]\n")
        generateSchemaTreeString(fields, s"$prefix |", builder)
      case StructField(name, ArrayType(elementType: DataType), _) =>
        builder.append(s"$prefix-- $name: $ArrayType[$elementType]\n")
      case StructField(name, fieldType: DataType, _) =>
        builder.append(s"$prefix-- $name: $fieldType\n")
    }

    builder
  }
}
```
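To see what the traversal produces, here is a hedged usage sketch; the attribute names, the constructor signatures, and the exact rendering of the struct label are assumptions based on the `append` calls above, not verified against this commit:

```scala
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.types._

// Two hypothetical top-level attributes: a plain string column and a nested struct.
val schema = Seq(
  AttributeReference("name", StringType, nullable = true)(),
  AttributeReference("address",
    StructType(StructField("city", StringType, nullable = true) :: Nil),
    nullable = true)())

// Expected shape (roughly; note that interpolating $StructType prints the
// companion object, not the field's own type):
// root
//  |-- name: StringType
//  |-- address: <struct>
//  |  |-- city: StringType
println(generateSchemaTreeString(schema))
```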
**pom.xml**

```xml
@@ -53,6 +53,11 @@
      <artifactId>parquet-hadoop</artifactId>
      <version>${parquet.version}</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-core</artifactId>
      <version>2.3.2</version>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.binary.version}</artifactId>
```

**Contributor** (on the `jackson-core` version): @pwendell I think in general sub-project pom files don't specify dependency versions. Can you verify?
**SQLContext.scala**

```scala
@@ -40,6 +40,7 @@ import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.SparkStrategies

import org.apache.spark.sql.parquet.ParquetRelation
import org.apache.spark.sql.json._

/**
 * :: AlphaComponent ::
```

```scala
@@ -97,6 +98,41 @@ class SQLContext(@transient val sparkContext: SparkContext)
  def parquetFile(path: String): SchemaRDD =
    new SchemaRDD(this, parquet.ParquetRelation(path))

  /**
   * Loads a JSON file, returning the result as a [[SchemaRDD]].
   * Right now, we only do eager schema resolution.
   */
  def jsonFile(
      path: String,
      mode: SchemaResolutionMode = EAGER_SCHEMA_RESOLUTION): SchemaRDD = {
    logger.info(s"Loads a JSON file $path.")
    val json = sparkContext.textFile(path)
    jsonRDD(json, mode)
  }

  /**
   * Loads an RDD[String] storing JSON objects (one object per record),
   * returning the result as a [[SchemaRDD]].
   * Right now, we only do eager schema resolution.
   */
  def jsonRDD(
      json: RDD[String],
      mode: SchemaResolutionMode = EAGER_SCHEMA_RESOLUTION): SchemaRDD = {
    mode match {
      case EAGER_SCHEMA_RESOLUTION =>
        logger.info(s"Eagerly resolve the schema without sampling.")
        val logicalPlan = JsonTable.inferSchema(json)
        logicalPlanToSparkQuery(logicalPlan)
      case EAGER_SCHEMA_RESOLUTION_WITH_SAMPLING(fraction) =>
        logger.info(s"Eagerly resolve the schema with sampling " +
          s"(sampling fraction: $fraction).")
        val logicalPlan = JsonTable.inferSchema(json, Some(fraction))
        logicalPlanToSparkQuery(logicalPlan)
      case LAZY_SCHEMA_RESOLUTION =>
        throw new UnsupportedOperationException(
          "Lazy schema resolution has not been implemented.")
    }
  }

  /**
   * :: Experimental ::
   * Creates an empty parquet file with the schema of class `A`, which can be registered as a
   * table.
```

**Contributor** (on the `jsonFile` scaladoc): Needs the …

**Contributor** (on the `jsonRDD` scaladoc): here too, although with sampling

**Contributor** (on the `LAZY_SCHEMA_RESOLUTION` case): Perhaps we should just leave this out so users will get compile errors instead of runtime errors.
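Putting the new API together, loading and querying JSON would look roughly like the sketch below. It is illustrative only: the file path and table name are invented, and `registerAsTable` is assumed to be the SchemaRDD registration method of this era:

```scala
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext

val sc = new SparkContext("local", "json-demo")
val sqlContext = new SQLContext(sc)

// Eagerly infer the schema from every record of a JSON file
// (one JSON object per line), then register it for SQL queries.
val people = sqlContext.jsonFile("examples/src/main/resources/people.json")
people.registerAsTable("people")

sqlContext.sql("SELECT name FROM people WHERE age > 21").collect().foreach(println)

// Per the case class in the diff above, the sampled mode would be invoked
// roughly as:
//   sqlContext.jsonRDD(sc.textFile("..."), EAGER_SCHEMA_RESOLUTION_WITH_SAMPLING(0.1))
```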
**SchemaRDDLike.scala**

```scala
@@ -111,4 +111,10 @@ private[sql] trait SchemaRDDLike {
  @Experimental
  def saveAsTable(tableName: String): Unit =
    sqlContext.executePlan(InsertIntoCreatedTable(None, tableName, logicalPlan)).toRdd

  /**
   * Print the schema of this SchemaRDD.
   */
  def printSchema = queryExecution.analyzed.printSchema()
}
```

**Contributor** (on `printSchema`): Put the return type explicitly here (is it just Unit)? We should explicitly define return types of all public functions.
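What the reviewer is asking for would presumably look like the following (an assumption; the empty parameter list is also conventional for a side-effecting method):

```scala
/**
 * Print the schema of this SchemaRDD.
 */
def printSchema(): Unit = queryExecution.analyzed.printSchema()
```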
**Contributor:** do we need to import all of these?

**Contributor:** If the console is primarily for developers then I find it pretty useful to have all the sorts of things I'd want for debugging in scope. This is how hive/console is already.