From 4c047a124dfa57d53fe99a37c5cc49854d12db07 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 2 Jul 2020 19:15:53 -0700 Subject: [PATCH 1/4] nested pruning should work even with cosmetic variations. --- .../catalyst/optimizer/NestedColumnAliasing.scala | 11 ++++++++--- .../execution/datasources/SchemaPruningSuite.scala | 12 ++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index 118f41f9cd232..644951958c95f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -149,10 +149,12 @@ object NestedColumnAliasing { case _ => false } + // Note that when we group by extractors with their references, we should remove + // cosmetic variations. val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences) val aliasSub = nestedFieldReferences.asInstanceOf[Seq[ExtractValue]] .filter(!_.references.subsetOf(exclusiveAttrSet)) - .groupBy(_.references.head) + .groupBy(_.references.head.canonicalized.asInstanceOf[Attribute]) .flatMap { case (attr, nestedFields: Seq[ExtractValue]) => // Remove redundant `ExtractValue`s if they share the same parent nest field. // For example, when `a.b` and `a.b.c` are in project list, we only need to alias `a.b`. @@ -174,9 +176,12 @@ object NestedColumnAliasing { // If all nested fields of `attr` are used, we don't need to introduce new aliases. // By default, ColumnPruning rule uses `attr` already. + // Note that we need to remove cosmetic variations first, so we only count a + // nested field once. if (nestedFieldToAlias.nonEmpty && - nestedFieldToAlias - .map { case (nestedField, _) => totalFieldNum(nestedField.dataType) } + dedupNestedFields.map(_.canonicalized.asInstanceOf[ExtractValue]) + .distinct + .map { nestedField => totalFieldNum(nestedField.dataType) } .sum < totalFieldNum(attr.dataType)) { Some(attr.exprId -> nestedFieldToAlias) } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala index 8b859e951b9b9..7e85a3edb8516 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala @@ -497,6 +497,18 @@ abstract class SchemaPruningSuite Row(Row("Janet", null, "Jones"), "Jones") ::Nil) } + testSchemaPruning("SPARK-32163: nested pruning should work even with cosmetic variations") { + withTempView("contact_alias") { + sql("select * from contacts") + .repartition(100, col("name.first"), col("name.last")) + .selectExpr("name").createOrReplaceTempView("contact_alias") + + val query = sql("select name.first from contact_alias") + checkScan(query, "struct>") + checkAnswer(query, Row("Jane") :: Row("John") :: Row("Jim") :: Row("Janet") ::Nil) + } + } + protected def testSchemaPruning(testName: String)(testThunk: => Unit): Unit = { test(s"Spark vectorized reader - without partition data column - $testName") { withSQLConf(vectorizedReaderEnabledKey -> "true") { From 5f111b4a0ab4bc98f753e7a9663e229732e63fb9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 5 Jul 2020 10:55:28 -0700 Subject: [PATCH 2/4] Add test case. --- .../execution/datasources/SchemaPruningSuite.scala | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala index 7e85a3edb8516..d51eafa5a8aed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala @@ -503,9 +503,17 @@ abstract class SchemaPruningSuite .repartition(100, col("name.first"), col("name.last")) .selectExpr("name").createOrReplaceTempView("contact_alias") - val query = sql("select name.first from contact_alias") - checkScan(query, "struct>") - checkAnswer(query, Row("Jane") :: Row("John") :: Row("Jim") :: Row("Janet") ::Nil) + val query1 = sql("select name.first from contact_alias") + checkScan(query1, "struct>") + checkAnswer(query1, Row("Jane") :: Row("John") :: Row("Jim") :: Row("Janet") ::Nil) + + sql("select * from contacts") + .select(explode(col("friends.first")), col("friends")) + .createOrReplaceTempView("contact_alias") + + val query2 = sql("select friends.middle, col from contact_alias") + checkScan(query2, "struct>>") + checkAnswer(query2, Row(Array("Z."), "Susan") :: Nil) } } From 04f6bb640d449d9a17bb34ea8f5f04faf3697b0a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 5 Jul 2020 11:07:33 -0700 Subject: [PATCH 3/4] Remove unnessary cast. --- .../spark/sql/catalyst/optimizer/NestedColumnAliasing.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index 644951958c95f..94ff3278506e6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -179,7 +179,7 @@ object NestedColumnAliasing { // Note that we need to remove cosmetic variations first, so we only count a // nested field once. if (nestedFieldToAlias.nonEmpty && - dedupNestedFields.map(_.canonicalized.asInstanceOf[ExtractValue]) + dedupNestedFields.map(_.canonicalized) .distinct .map { nestedField => totalFieldNum(nestedField.dataType) } .sum < totalFieldNum(attr.dataType)) { From d352dbcdecfbdab08607c2416748e7650294fa42 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 7 Jul 2020 08:39:13 -0700 Subject: [PATCH 4/4] Fix ident. --- .../spark/sql/catalyst/optimizer/NestedColumnAliasing.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index 94ff3278506e6..0c8666b72cace 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -179,7 +179,7 @@ object NestedColumnAliasing { // Note that we need to remove cosmetic variations first, so we only count a // nested field once. if (nestedFieldToAlias.nonEmpty && - dedupNestedFields.map(_.canonicalized) + dedupNestedFields.map(_.canonicalized) .distinct .map { nestedField => totalFieldNum(nestedField.dataType) } .sum < totalFieldNum(attr.dataType)) {