-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-12139] [SQL] REGEX Column Specification #18023
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
af55afd
6f9bdb0
43beb07
7699e87
6e37517
bee07cd
d5e450a
979bfb6
612bedf
48c54aa
0284d01
129243a
5df5494
779724d
a27023c
e8d4054
201f4d6
a0e3773
6b091dc
537b3bc
da60368
9c582eb
79e58f0
616b726
f98207b
321211d
4e36ed9
04b62c6
2d0dd1c
448c3e2
2ef2c14
d65c462
65e5eec
65886cd
ca89a4a
d3eed1a
956b849
8adad7c
56e2b83
d613ff9
f5104e4
a5f9c44
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,6 +83,28 @@ case class UnresolvedTableValuedFunction( | |
| override lazy val resolved = false | ||
| } | ||
|
|
||
| /** | ||
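| * Represents all of the input attributes to a given relational operator whose names match a | ||
| * regular expression, for example in "SELECT `(id)?+.+` FROM ...". | ||
| * | ||
| * @param regexPattern the regular expression that an attribute name must fully match. | ||
| * @param table an optional table that should be the target of the expansion. If omitted, the | ||
| * matching columns of all tables are produced. | ||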
| */ | ||
| case class UnresolvedRegex(regexPattern: String, table: Option[String]) | ||
| extends Star with Unevaluable { | ||
| override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = { | ||
| table match { | ||
| // If there is no table specified, use all input attributes whose names match the regex | ||
| case None => input.output.filter(_.name.matches(regexPattern)) | ||
| // If there is a table, pick out the attributes of that table whose names match the regex | ||
|
||
| case Some(t) => input.output.filter(_.qualifier.exists(resolver(_, t))) | ||
| .filter(_.name.matches(regexPattern)) | ||
| } | ||
| } | ||
|
|
||
| override def toString: String = table.map(_ + ".").getOrElse("") + regexPattern | ||
|
||
| } | ||
|
|
||
| /** | ||
| * Holds the name of an attribute that has yet to be resolved. | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1230,25 +1230,37 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging | |
| } | ||
|
|
||
| /** | ||
| * Create a dereference expression. The return type depends on the type of the parent, this can | ||
| * either be a [[UnresolvedAttribute]] (if the parent is an [[UnresolvedAttribute]]), or an | ||
| * [[UnresolvedExtractValue]] if the parent is some expression. | ||
| * Create a dereference expression. The return type depends on the type of the parent. | ||
| * If the parent is an [[UnresolvedAttribute]], it can be a [[UnresolvedAttribute]] or | ||
| * a [[UnresolvedRegex]] for regex quoted in ``; if the parent is some other expression, | ||
| * it can be [[UnresolvedExtractValue]]. | ||
| */ | ||
| override def visitDereference(ctx: DereferenceContext): Expression = withOrigin(ctx) { | ||
| val attr = ctx.fieldName.getText | ||
| expression(ctx.base) match { | ||
| case UnresolvedAttribute(nameParts) => | ||
| UnresolvedAttribute(nameParts :+ attr) | ||
| case unresolved_attr @ UnresolvedAttribute(nameParts) => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use a guard, e.g.:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is concise to put the if inside the case unresolved_attr @ UnresolvedAttribute(nameParts). if we use guard, we still need to handle the case when the conf.supportQuotedIdentifiers is false.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how about
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this won't work. In your first "case", ctx.fieldName.getStart.getText is
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh sorry I made a mistake,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, ctx.fieldName.getText will trim the backquote |
||
| matchEscapedIdentifier(ctx.fieldName.getStart.getText) match { | ||
| case Some(i) if conf.supportQuotedIdentifiers => | ||
| UnresolvedRegex(i, Some(unresolved_attr.name)) | ||
| case _ => | ||
| UnresolvedAttribute(nameParts :+ attr) | ||
| } | ||
| case e => | ||
| UnresolvedExtractValue(e, Literal(attr)) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Create an [[UnresolvedAttribute]] expression. | ||
| * Create an [[UnresolvedAttribute]] expression or a [[UnresolvedRegex]] if it is a regex | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if we always create
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should only create UnresolvedRegex when necessary.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there seems no problem if we always go with the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the code complexity will be similar, because if the column is ``, we need to extract the pattern;
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not talking about algorithm complexity, but saying that we can simplify the logic by avoiding detecting the regex string.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan, the code path is shared by select a, select a.b, the ON clause, etc. If it is select a.b, the table part also goes through here, but later there is no project expansion. If it is an ON clause, the string is already stripped, so it is no longer a regex. Only with column names do we get the project expansion (similar to star). So we need the regex pattern match to know that this is only for columns. Do you have any suggestions? Currently Hive only supports regex expansion for select columns, and this PR matches the Hive behavior. |
||
| * quoted in `` | ||
| */ | ||
| override def visitColumnReference(ctx: ColumnReferenceContext): Expression = withOrigin(ctx) { | ||
| UnresolvedAttribute.quoted(ctx.getText) | ||
| matchEscapedIdentifier(ctx.getStart.getText) match { | ||
| case Some(i) if conf.supportQuotedIdentifiers => | ||
| UnresolvedRegex(i, None) | ||
| case _ => | ||
| UnresolvedAttribute.quoted(ctx.getText) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -177,6 +177,18 @@ object ParserUtils { | |
| sb.toString() | ||
| } | ||
|
|
||
| val escapedIdentifier = "`(.+)`".r | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a comment for this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added. |
||
|
|
||
| /** | ||
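| * Returns the text enclosed in backquotes if `b` is wrapped in backquotes (with at least one | ||
| * character between them), or None otherwise. | ||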
| */ | ||
| def matchEscapedIdentifier(b: String): Option[String] = { | ||
| b match { | ||
| case escapedIdentifier(i) => Some(i) | ||
| case _ => None | ||
| } | ||
| } | ||
|
|
||
| /** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */ | ||
| implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal { | ||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -795,6 +795,12 @@ object SQLConf { | |
| .intConf | ||
| .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt) | ||
|
|
||
| val SUPPORT_QUOTED_IDENTIFIERS = buildConf("spark.sql.support.quoted.identifiers") | ||
|
||
| .internal() | ||
|
||
| .doc("When true, identifiers specified by regex patterns will be expanded.") | ||
|
||
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| object Deprecated { | ||
| val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" | ||
| } | ||
|
|
@@ -1051,6 +1057,8 @@ class SQLConf extends Serializable with Logging { | |
|
|
||
| def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO) | ||
|
|
||
| def supportQuotedIdentifiers: Boolean = getConf(SUPPORT_QUOTED_IDENTIFIERS) | ||
|
|
||
| /** ********************** SQLConf functionality methods ************ */ | ||
|
|
||
| /** Set Spark SQL configuration properties. */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -244,6 +244,40 @@ class DatasetSuite extends QueryTest with SharedSQLContext { | |
| ("a", ClassData("a", 1)), ("b", ClassData("b", 2)), ("c", ClassData("c", 3))) | ||
| } | ||
|
|
||
| test("select 3, regex") { | ||
|
||
| val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDF() | ||
| intercept[AnalysisException] { | ||
| ds.select(expr("`(_1)?+.+`").as[Int]) | ||
| } | ||
|
|
||
| intercept[AnalysisException] { | ||
| ds.select(expr("`(_1|_2)`").as[Int]) | ||
| } | ||
|
|
||
| withSQLConf(SQLConf.SUPPORT_QUOTED_IDENTIFIERS.key -> "true") { | ||
| checkDataset( | ||
| ds.select(expr("`(_1)?+.+`").as[Int]), | ||
| 1, 2, 3) | ||
| val m = ds.select(expr("`(_1|_2)`")) | ||
|
|
||
| checkDataset( | ||
| ds.select(expr("`(_1|_2)`")) | ||
| .select(expr("named_struct('a', _1, 'b', _2)").as[ClassData]), | ||
| ClassData("a", 1), ClassData("b", 2), ClassData("c", 3)) | ||
|
|
||
| checkDataset( | ||
| ds.alias("g") | ||
| .select(expr("g.`(_1)?+.+`").as[Int]), | ||
| 1, 2, 3) | ||
|
|
||
| checkDataset( | ||
| ds.alias("g") | ||
| .select(expr("g.`(_1|_2)`")) | ||
| .select(expr("named_struct('a', _1, 'b', _2)").as[ClassData]), | ||
| ClassData("a", 1), ClassData("b", 2), ClassData("c", 3)) | ||
| } | ||
| } | ||
|
|
||
| test("filter") { | ||
| val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS() | ||
| checkDataset( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2624,4 +2624,92 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { | |
| val e = intercept[AnalysisException](sql("SELECT nvl(1, 2, 3)")) | ||
| assert(e.message.contains("Invalid number of arguments")) | ||
| } | ||
|
|
||
| test("SPARK-12139: REGEX Column Specification for Hive Queries") { | ||
|
||
| // spark.sql.support.quoted.identifiers is turned off by default | ||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT b | ||
| |FROM testData2 | ||
| |WHERE a = 1 | ||
| """.stripMargin), | ||
| Row(1) :: Row(2) :: Nil) | ||
|
|
||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT t.b | ||
| |FROM testData2 t | ||
| |WHERE a = 1 | ||
| """.stripMargin), | ||
| Row(1) :: Row(2) :: Nil) | ||
|
||
|
|
||
| intercept[AnalysisException] { | ||
| sql( | ||
| """ | ||
| |SELECT `(a)?+.+` | ||
| |FROM testData2 | ||
| |WHERE a = 1 | ||
| """.stripMargin) | ||
| } | ||
|
|
||
| intercept[AnalysisException] { | ||
| sql( | ||
| """ | ||
| |SELECT t.`(a)?+.+` | ||
| |FROM testData2 t | ||
| |WHERE a = 1 | ||
| """.stripMargin) | ||
| } | ||
|
|
||
| // now, turn on spark.sql.support.quoted.identifiers | ||
| withSQLConf(SQLConf.SUPPORT_QUOTED_IDENTIFIERS.key -> "true") { | ||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT b | ||
| |FROM testData2 | ||
| |WHERE a = 1 | ||
| """.stripMargin), | ||
| Row(1) :: Row(2) :: Nil) | ||
|
|
||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT t.b | ||
| |FROM testData2 t | ||
| |WHERE a = 1 | ||
| """.stripMargin), | ||
| Row(1) :: Row(2) :: Nil) | ||
|
|
||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT `(a)?+.+` | ||
| |FROM testData2 | ||
| |WHERE a = 1 | ||
| """.stripMargin), | ||
| Row(1) :: Row(2) :: Nil) | ||
|
|
||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT t.`(a)?+.+` | ||
| |FROM testData2 t | ||
| |WHERE a = 1 | ||
| """.stripMargin), | ||
| Row(1) :: Row(2) :: Nil) | ||
|
|
||
| checkAnswer( | ||
| sql( | ||
| """ | ||
| |SELECT p.`(key)?+.+`, b, testdata2.`(b)?+.+` | ||
| |FROM testData p join testData2 | ||
| |ON p.key = testData2.a | ||
| |WHERE key < 3 | ||
| """.stripMargin), | ||
| Row("1", 1, 1) :: Row("1", 2, 1) :: Row("2", 1, 2) :: Row("2", 2, 2) :: Nil) | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we move it below
UnresolvedStar?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
moved