Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

package org.apache.spark.sql.execution.datasources.parquet

import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, NamedExpression}
import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, IsNotNull, IsNull, NamedExpression}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line too long.

Copy link
Member

@HyukjinKwon HyukjinKwon Sep 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which can be wildcard when there are more than 6 entities per https://github.com/databricks/scala-style-guide#imports

import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
Expand Down Expand Up @@ -196,6 +196,7 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
*/
private def getRootFields(expr: Expression): Seq[RootField] = {
expr match {
case IsNotNull(_: Attribute) | IsNull(_: Attribute) => Seq.empty
Copy link
Member

@HyukjinKwon HyukjinKwon Sep 7, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmmm .. shouldn't we exclude this only for filters?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is in projects, I think we also don't need to include all nested fields?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the case mentioned here looks specific to the pushed filter itself. Can we add a simple test for project case as well?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, for instance, this case select address is not null, name.last from contacts it wouldn't work. I thought this is a quick bandaid fix to resolve a basic case.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. This was a case I didn't test. Fixed it and added test case.

case att: Attribute =>
RootField(StructField(att.name, att.dataType, att.nullable), derivedFromAtt = true) :: Nil
case SelectedField(field) => RootField(field, derivedFromAtt = false) :: Nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about

      case IsNotNull(_: Attribute) | IsNull(_: Attribute) =>
        expr.children.flatMap(getRootFields).map(_.copy(contentAccessed = false))
      case _ =>
        expr.children.flatMap(getRootFields)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,30 @@ class ParquetSchemaPruningSuite
Row(null) :: Row(null) :: Nil)
}

testSchemaPruning("select a single complex field and in where clause") {
val query = sql("select name.first from contacts where name.first = 'Jane'")
checkScan(query, "struct<name:struct<first:string>>")
checkAnswer(query, Row("Jane") :: Nil)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add another tests that select name.first and name.last, and apply where clause on name.first. We should only read name.first and name.last without name.middle.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Added test case for it.

}

testSchemaPruning("select a single complex field array and in clause") {
val query = sql("select friends.middle from contacts where friends.first[0] = 'Susan'")
checkScan(query,
"struct<friends:array<struct<first:string,middle:string>>>")
checkAnswer(query.orderBy("id"),
Row(Array("Z.")) :: Nil)
}

testSchemaPruning("select a single complex field from a map entry and in clause") {
val query =
sql("select relatives[\"brother\"].middle from contacts " +
"where relatives[\"brother\"].first = 'John'")
checkScan(query,
"struct<relatives:map<string,struct<first:string,middle:string>>>")
checkAnswer(query.orderBy("id"),
Row("Y.") :: Nil)
}

private def testSchemaPruning(testName: String)(testThunk: => Unit) {
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
test(s"Spark vectorized reader - without partition data column - $testName") {
Expand Down