-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-5950][SQL]Insert array into a metastore table saved as parquet should work when using datasource api #4826
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
4ec17fd
8f19fe5
9a26611
0a703e7
bf50d73
8bd008b
b2c06f8
e4f397c
f6ed813
0eb5578
8a3f237
8360817
d3747d1
486ed08
3cec464
0cb7ea2
587d88b
80e487e
3b61a04
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -181,7 +181,7 @@ object DataType { | |
| /** | ||
| * Compares two types, ignoring nullability of ArrayType, MapType, StructType. | ||
| */ | ||
| private[sql] def equalsIgnoreNullability(left: DataType, right: DataType): Boolean = { | ||
| private[spark] def equalsIgnoreNullability(left: DataType, right: DataType): Boolean = { | ||
| (left, right) match { | ||
| case (ArrayType(leftElementType, _), ArrayType(rightElementType, _)) => | ||
| equalsIgnoreNullability(leftElementType, rightElementType) | ||
|
|
@@ -198,6 +198,57 @@ object DataType { | |
| case (left, right) => left == right | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Compares two types, ignoring compatible nullability of ArrayType, MapType, StructType. | ||
| * | ||
| * Compatible nullability is defined as follows: | ||
| * - If `from` and `to` are ArrayTypes, `from` has a compatible nullability with `to` | ||
| * if and only if `from.containsNull` is false, or both of `from.containsNull` and | ||
| * `to.containsNull` are true. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be better to reword this definition to make it consistent with the code:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same applies to |
||
| * - If `from` and `to` are MapTypes, `from` has a compatible nullability with `to` | ||
| * if and only if `from.valueContainsNull` is false, or both of `from.valueContainsNull` and | ||
| * `to.valueContainsNull` are true. | ||
| * - If `from` and `to` are StructTypes, `from` has a compatible nullability with `to` | ||
| * if and only if for every pair of fields, `fromField.nullable` is false, or both | ||
| * of `fromField.nullable` and `toField.nullable` are true. | ||
| */ | ||
| private[spark] def equalsIgnoreCompatibleNullability(from: DataType, to: DataType): Boolean = { | ||
| (from, to) match { | ||
| case (ArrayType(fromElement, fn), ArrayType(toElement, tn)) => | ||
| (tn || !fn) && equalsIgnoreCompatibleNullability(fromElement, toElement) | ||
|
|
||
| case (MapType(fromKey, fromValue, fn), MapType(toKey, toValue, tn)) => | ||
| (tn || !fn) && | ||
| equalsIgnoreCompatibleNullability(fromKey, toKey) && | ||
| equalsIgnoreCompatibleNullability(fromValue, toValue) | ||
|
|
||
| case (StructType(fromFields), StructType(toFields)) => | ||
| fromFields.size == toFields.size && | ||
| fromFields.zip(toFields).forall { | ||
| case (fromField, toField) => | ||
| fromField.name == toField.name && | ||
| (toField.nullable || !fromField.nullable) && | ||
| equalsIgnoreCompatibleNullability(fromField.dataType, toField.dataType) | ||
| } | ||
|
|
||
| case (fromDataType, toDataType) => fromDataType == toDataType | ||
| } | ||
| } | ||
|
|
||
| /** Sets all nullable/containsNull/valueContainsNull to true. */ | ||
| private[spark] def alwaysNullable(dataType: DataType): DataType = dataType match { | ||
| case ArrayType(elementType, _) => | ||
| ArrayType(alwaysNullable(elementType), containsNull = true) | ||
| case MapType(keyType, valueType, _) => | ||
| MapType(alwaysNullable(keyType), alwaysNullable(valueType), true) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Use named parameter for |
||
| case StructType(fields) => | ||
| val newFields = fields.map { field => | ||
| StructField(field.name, alwaysNullable(field.dataType), nullable = true) | ||
| } | ||
| StructType(newFields) | ||
| case other => other | ||
| } | ||
| } | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -115,4 +115,84 @@ class DataTypeSuite extends FunSuite { | |
| checkDefaultSize(MapType(IntegerType, StringType, true), 410000) | ||
| checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 80400) | ||
| checkDefaultSize(structType, 812) | ||
|
|
||
| def checkEqualsIgnoreCompatibleNullability( | ||
| from: DataType, | ||
| to: DataType, | ||
| expected: Boolean): Unit = { | ||
| val testName = | ||
| s"equalsIgnoreCompatibleNullability: (from: ${from}, to: ${to})" | ||
| test(testName) { | ||
| assert(DataType.equalsIgnoreCompatibleNullability(from, to) === expected) | ||
| } | ||
| } | ||
|
|
||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = ArrayType(DoubleType, containsNull = true), | ||
| to = ArrayType(DoubleType, containsNull = true), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = ArrayType(DoubleType, containsNull = false), | ||
| to = ArrayType(DoubleType, containsNull = false), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = ArrayType(DoubleType, containsNull = false), | ||
| to = ArrayType(DoubleType, containsNull = true), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = ArrayType(DoubleType, containsNull = true), | ||
| to = ArrayType(DoubleType, containsNull = false), | ||
| expected = false) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = ArrayType(DoubleType, containsNull = false), | ||
| to = ArrayType(StringType, containsNull = false), | ||
| expected = false) | ||
|
|
||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = MapType(StringType, DoubleType, valueContainsNull = true), | ||
| to = MapType(StringType, DoubleType, valueContainsNull = true), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = MapType(StringType, DoubleType, valueContainsNull = false), | ||
| to = MapType(StringType, DoubleType, valueContainsNull = false), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = MapType(StringType, DoubleType, valueContainsNull = false), | ||
| to = MapType(StringType, DoubleType, valueContainsNull = true), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = MapType(StringType, DoubleType, valueContainsNull = true), | ||
| to = MapType(StringType, DoubleType, valueContainsNull = false), | ||
| expected = false) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true), | ||
| to = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true), | ||
| expected = false) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be good to add another test case to show nested case: checkEqualsIgnoreCompatibleNullability(
from = MapType(StringType, ArrayType(IntegerType, false), valueContainsNull = true),
to = MapType(StringType, ArrayType(IntegerType, true), valueContainsNull = true),
expected = true)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done (added a test after this one) |
||
|
|
||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = StructType(StructField("a", StringType, nullable = true) :: Nil), | ||
| to = StructType(StructField("a", StringType, nullable = true) :: Nil), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = StructType(StructField("a", StringType, nullable = false) :: Nil), | ||
| to = StructType(StructField("a", StringType, nullable = false) :: Nil), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = StructType(StructField("a", StringType, nullable = false) :: Nil), | ||
| to = StructType(StructField("a", StringType, nullable = true) :: Nil), | ||
| expected = true) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = StructType(StructField("a", StringType, nullable = true) :: Nil), | ||
| to = StructType(StructField("a", StringType, nullable = false) :: Nil), | ||
| expected = false) | ||
| checkEqualsIgnoreCompatibleNullability( | ||
| from = StructType( | ||
| StructField("a", StringType, nullable = false) :: | ||
| StructField("b", StringType, nullable = true) :: Nil), | ||
| to = StructType( | ||
| StructField("a", StringType, nullable = false) :: | ||
| StructField("b", StringType, nullable = false) :: Nil), | ||
| expected = false) | ||
|
|
||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,7 @@ import org.apache.hadoop.fs.Path | |
|
|
||
| import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} | ||
| import org.apache.spark.sql.sources._ | ||
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.types.{DataType, StructType} | ||
|
|
||
|
|
||
| private[sql] class DefaultSource | ||
|
|
@@ -131,7 +131,7 @@ private[sql] case class JSONRelation( | |
|
|
||
| override def equals(other: Any): Boolean = other match { | ||
| case that: JSONRelation => | ||
| (this.path == that.path) && (this.schema == that.schema) | ||
| (this.path == that.path) && (DataType.equalsIgnoreNullability(this.schema, that.schema)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should also have this in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since nullability affects how parquet encode and decode data, I think it is better to leave
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please ignore my comment. Will change it. |
||
| case _ => false | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ import java.util.logging.Level | |
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.fs.Path | ||
| import org.apache.hadoop.fs.permission.FsAction | ||
| import org.apache.spark.sql.types.{StructType, DataType} | ||
| import parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat} | ||
| import parquet.hadoop.metadata.CompressionCodecName | ||
| import parquet.schema.MessageType | ||
|
|
@@ -172,9 +173,14 @@ private[sql] object ParquetRelation { | |
| sqlContext.conf.parquetCompressionCodec.toUpperCase, CompressionCodecName.UNCOMPRESSED) | ||
| .name()) | ||
| ParquetRelation.enableLogForwarding() | ||
| ParquetTypesConverter.writeMetaData(attributes, path, conf) | ||
| // This is a hack. We always set nullable/containsNull/valueContainsNull to true | ||
| // for the schema of a parquet data. | ||
| val schema = | ||
| DataType.alwaysNullable(StructType.fromAttributes(attributes)).asInstanceOf[StructType] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should also make
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure if we want to do this. For |
||
| val newAttributes = schema.toAttributes | ||
| ParquetTypesConverter.writeMetaData(newAttributes, path, conf) | ||
| new ParquetRelation(path.toString, Some(conf), sqlContext) { | ||
| override val output = attributes | ||
| override val output = newAttributes | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @liancheng @marmbrus I am also changing the nullability for our old parquet write path to make the behavior consistent with our new write path. Let me know if there is any potential compatibility issue and we should revert this change.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also make data types of
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never mind, since we always write nullable data, it should be OK to leave
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Verified that when merging schemas, official Parquet implementation will handle nullability (repetition level) properly. So our change should be safe for interoperation with other systems that support Parquet schema evolving. |
||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mengxr Is it OK to ignore the nullability of ArrayType (containsNull field) and MapType (valueContainsNull field) in this check?