[SPARK-31489][SPARK-31488][SQL] Translate date values of pushed down filters to java.sql.Date
#28272
ParquetFilterSuite.scala:

```diff
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.parquet
 import java.math.{BigDecimal => JBigDecimal}
 import java.nio.charset.StandardCharsets
 import java.sql.{Date, Timestamp}
+import java.time.LocalDate

 import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operators}
 import org.apache.parquet.filter2.predicate.FilterApi._
@@ -1561,6 +1562,63 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSparkSession
       }
     }
   }
+
+  test("filter pushdown - local date") {
+    implicit class StringToDate(s: String) {
+      def date: LocalDate = LocalDate.parse(s)
+    }
+
+    val data = Seq("2018-03-18", "2018-03-19", "2018-03-20", "2018-03-21").map(_.date)
+    import testImplicits._
+    withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+      withNestedDataFrame(data.map(i => Tuple1(i)).toDF()) { case (inputDF, colName, resultFun) =>
+        withParquetDataFrame(inputDF) { implicit df =>
+          val dateAttr: Expression = df(colName).expr
+          assert(df(colName).expr.dataType === DateType)
+
+          checkFilterPredicate(dateAttr.isNull, classOf[Eq[_]], Seq.empty[Row])
+          checkFilterPredicate(dateAttr.isNotNull, classOf[NotEq[_]],
+            data.map(i => Row.apply(resultFun(i))))
+
+          checkFilterPredicate(dateAttr === "2018-03-18".date, classOf[Eq[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(dateAttr <=> "2018-03-18".date, classOf[Eq[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(dateAttr =!= "2018-03-18".date, classOf[NotEq[_]],
+            Seq("2018-03-19", "2018-03-20", "2018-03-21").map(i => Row.apply(resultFun(i.date))))
+
+          checkFilterPredicate(dateAttr < "2018-03-19".date, classOf[Lt[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(dateAttr > "2018-03-20".date, classOf[Gt[_]],
+            resultFun("2018-03-21".date))
+          checkFilterPredicate(dateAttr <= "2018-03-18".date, classOf[LtEq[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(dateAttr >= "2018-03-21".date, classOf[GtEq[_]],
+            resultFun("2018-03-21".date))
+
+          checkFilterPredicate(Literal("2018-03-18".date) === dateAttr, classOf[Eq[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(Literal("2018-03-18".date) <=> dateAttr, classOf[Eq[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(Literal("2018-03-19".date) > dateAttr, classOf[Lt[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(Literal("2018-03-20".date) < dateAttr, classOf[Gt[_]],
+            resultFun("2018-03-21".date))
+          checkFilterPredicate(Literal("2018-03-18".date) >= dateAttr, classOf[LtEq[_]],
+            resultFun("2018-03-18".date))
+          checkFilterPredicate(Literal("2018-03-21".date) <= dateAttr, classOf[GtEq[_]],
+            resultFun("2018-03-21".date))
+
+          checkFilterPredicate(!(dateAttr < "2018-03-21".date), classOf[GtEq[_]],
+            resultFun("2018-03-21".date))
+          checkFilterPredicate(
+            dateAttr < "2018-03-19".date || dateAttr > "2018-03-20".date,
+            classOf[Operators.Or],
+            Seq(Row(resultFun("2018-03-18".date)), Row(resultFun("2018-03-21".date))))
+        }
+      }
+    }
+  }
 }

 class ParquetV1FilterSuite extends ParquetFilterSuite {
```
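The behavior these tests pin down matches the PR title: with `spark.sql.datetime.java8API.enabled` set, Catalyst date literals carry `java.time.LocalDate`, while values handed to DS v1 `Filter`s should stay `java.sql.Date`. Below is a minimal sketch of that normalization using only JDK conversions; the `toFilterValue` helper is hypothetical, not Spark's actual API:

```scala
import java.sql.Date
import java.time.LocalDate

import org.apache.spark.sql.sources.{EqualTo, Filter}

// Hypothetical helper: normalize a pushed-down literal so a DS v1 source
// always sees java.sql.Date for date columns, regardless of the java8API flag.
def toFilterValue(value: Any): Any = value match {
  case d: LocalDate => Date.valueOf(d) // plain JDK conversion
  case other        => other
}

// EqualTo(attribute, value) is the public DS v1 filter; its value is typed Any.
val pushed: Filter = EqualTo("dateCol", toFilterValue(LocalDate.parse("2018-03-18")))
```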
OrcFilterSuite.scala:

```diff
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.orc
 import java.math.MathContext
 import java.nio.charset.StandardCharsets
 import java.sql.{Date, Timestamp}
+import java.time.LocalDate

 import scala.collection.JavaConverters._
@@ -450,5 +451,31 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession
       ).get.toString
     }
   }
+
+  test("filter pushdown - local date") {
+    val dates = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day =>
+      LocalDate.parse(day)
+    }
+    withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+      withOrcDataFrame(dates.map(Tuple1(_))) { implicit df =>
+        checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL)
+
+        checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS)
+        checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+
+        checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN)
+        checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN)
+
+        checkFilterPredicate(Literal(dates(0)) === $"_1", PredicateLeaf.Operator.EQUALS)
+        checkFilterPredicate(Literal(dates(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+        checkFilterPredicate(Literal(dates(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN)
+        checkFilterPredicate(Literal(dates(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate(Literal(dates(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate(Literal(dates(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN)
+      }
+    }
+  }
 }
```
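A note on the expected operators in this test: ORC's `PredicateLeaf.Operator` has no strict greater-than, so Spark pushes `a > lit` as `NOT(a <= lit)` and `a >= lit` as `NOT(a < lit)`; that is why the test checks `>` against `LESS_THAN_EQUALS` and `>=` against `LESS_THAN`. A rough sketch of the resulting search argument, assuming the Hive storage-api builder that Spark's ORC support wraps (the exact package and accepted literal types vary across ORC/Hive versions):

```scala
import java.sql.Date

import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgumentFactory}

// Sketch: `_1 > DATE '2017-08-20'` pushed to ORC as NOT(_1 <= DATE '2017-08-20').
// Since the Operator enum has no strict greater-than, the leaf the test
// observes is LESS_THAN_EQUALS wrapped in a negation.
val sarg = SearchArgumentFactory.newBuilder()
  .startNot()
  .lessThanEquals("_1", PredicateLeaf.Type.DATE, Date.valueOf("2017-08-20"))
  .end()
  .build()
```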
OrcFilterSuite.scala (a second, near-identical copy of the suite; note the shifted hunk offsets):

```diff
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.orc
 import java.math.MathContext
 import java.nio.charset.StandardCharsets
 import java.sql.{Date, Timestamp}
+import java.time.LocalDate

 import scala.collection.JavaConverters._
@@ -451,5 +452,31 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession
       ).get.toString
     }
   }
+
+  test("filter pushdown - local date") {
+    val dates = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day =>
+      LocalDate.parse(day)
+    }
+    withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+      withOrcDataFrame(dates.map(Tuple1(_))) { implicit df =>
+        checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL)
+
+        checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS)
+        checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+
+        checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN)
+        checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN)
+
+        checkFilterPredicate(Literal(dates(0)) === $"_1", PredicateLeaf.Operator.EQUALS)
+        checkFilterPredicate(Literal(dates(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+        checkFilterPredicate(Literal(dates(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN)
+        checkFilterPredicate(Literal(dates(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate(Literal(dates(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS)
+        checkFilterPredicate(Literal(dates(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN)
+      }
+    }
+  }
 }
```
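Both suites gate the new tests behind the same flag. For reference, outside of `withSQLConf` it would be enabled on the session; a sketch follows (the path and column name are placeholders, and the flag is off by default):

```scala
import java.time.LocalDate

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Same key as SQLConf.DATETIME_JAVA8API_ENABLED in the tests above;
// disabled by default, as noted in the review comments below.
spark.conf.set("spark.sql.datetime.java8API.enabled", "true")

import spark.implicits._

// Placeholder path and column: with the flag on, date columns compare
// against java.time.LocalDate, and the comparison is pushed down as a filter.
val filtered = spark.read.parquet("/tmp/dates.parquet")
  .filter($"dateCol" > LocalDate.parse("2018-03-20"))
```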
Review comments:
I guess we're treating this as a temp fix for Spark 3.0? Ideally, it looks like we should support Java 8 datetime instances for this interface as well when `spark.sql.datetime.java8API.enabled` is enabled; otherwise it could cause more confusion. In addition, it seems `spark.sql.datetime.java8API.enabled` is disabled by default, too.
I think it's problematic to let the java8 config also control the value type inside `Filter`, as it can break existing DS v1 implementations. It's a bit unfortunate that we don't document clearly what the value type can be for `Filter`, but even if we did, it's not user-friendly to say "the value type depends on xxx config". This just makes it harder to implement data source filter pushdown.
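To make the concern concrete: DS v1 sources typically pattern-match on the filter's untyped `value`, so changing the runtime type via a config silently disables pushdown paths written against `java.sql.Date`. A minimal sketch (the `translate` helper and its string output are illustrative only):

```scala
import java.sql.Date

import org.apache.spark.sql.sources.{EqualTo, Filter}

// Illustrative-only translator such as a DS v1 source might contain,
// written when Filter values for date columns were always java.sql.Date.
def translate(f: Filter): Option[String] = f match {
  case EqualTo(col, d: Date) => Some(s"$col = DATE '$d'")
  case _ => None // a java.time.LocalDate value falls through: pushdown is lost
}

translate(EqualTo("dateCol", Date.valueOf("2018-03-18")))              // Some(...)
translate(EqualTo("dateCol", java.time.LocalDate.parse("2018-03-18"))) // None
```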
@HyukjinKwon Taking into account #23811 (comment), the flag won't be enabled by default in the near future.