Skip to content
Closed
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
af55afd
Fix SPARK-12139: REGEX Column Specification for Hive Queries
janewangfb May 18, 2017
6f9bdb0
Fix SPARK-12139: REGEX Column Specification for Hive Queries
janewangfb May 18, 2017
43beb07
Merge branch 'support_select_regex' of https://github.com/janewangfb/…
janewangfb May 18, 2017
7699e87
add unittests for DataSet.
janewangfb May 18, 2017
6e37517
Address hvanhovell's comments
janewangfb May 19, 2017
bee07cd
Address gatorsmile's comments
janewangfb May 19, 2017
d5e450a
address gatorsmile's comment
janewangfb May 19, 2017
979bfb6
add the gold file
janewangfb May 19, 2017
612bedf
address gatorsmile's comment
janewangfb May 19, 2017
48c54aa
address gatorsmile's comment
janewangfb May 19, 2017
0284d01
address cloud-fan's commentS
janewangfb May 22, 2017
129243a
Merge branch 'master' into support_select_regex
janewangfb May 23, 2017
5df5494
Merge branch 'master' into support_select_regex
janewangfb May 26, 2017
779724d
Address cloud-fan's comments
janewangfb May 26, 2017
a27023c
fix typo
janewangfb May 27, 2017
e8d4054
Merge branch 'master' into support_select_regex
janewangfb May 27, 2017
201f4d6
Merge branch 'master' into support_select_regex
janewangfb May 30, 2017
a0e3773
roll back code
janewangfb May 30, 2017
6b091dc
merge master
janewangfb Jun 26, 2017
537b3bc
Merge branch 'master' into support_select_regex
janewangfb Jun 26, 2017
da60368
add gatorsmile's comments
janewangfb Jun 26, 2017
9c582eb
Merge branch 'master' into support_select_regex
janewangfb Jun 28, 2017
79e58f0
address gatorsmile's comments
janewangfb Jun 28, 2017
616b726
made regex case insensitive
janewangfb Jun 28, 2017
f98207b
merge master
janewangfb Jun 28, 2017
321211d
fix regex case insensitive match
janewangfb Jun 28, 2017
4e36ed9
Address gatorsmile's comments
janewangfb Jun 30, 2017
04b62c6
Merge branch 'master' into support_select_regex
janewangfb Jun 30, 2017
2d0dd1c
Merge branch 'master' into support_select_regex
janewangfb Jun 30, 2017
448c3e2
address gatorsmile's comment
janewangfb Jun 30, 2017
2ef2c14
Merge branch 'master' into support_select_regex
janewangfb Jul 5, 2017
d65c462
address gatorsmile's comments and fixed his failed testcases
janewangfb Jul 5, 2017
65e5eec
Merge branch 'master' into support_select_regex
janewangfb Jul 5, 2017
65886cd
fix build failure
janewangfb Jul 5, 2017
ca89a4a
Merge branch 'master' into support_select_regex
janewangfb Jul 7, 2017
d3eed1a
Address gatorsmile and viirya's comments
janewangfb Jul 7, 2017
956b849
address gatorsmile's comments
janewangfb Jul 10, 2017
8adad7c
Merge branch 'master' into support_select_regex
janewangfb Jul 10, 2017
56e2b83
merge master and resolve conflicts
janewangfb Jul 11, 2017
d613ff9
fix build failure
janewangfb Jul 11, 2017
f5104e4
Merge branch 'master' into support_select_regex
janewangfb Jul 11, 2017
a5f9c44
address gaatorsmile's comments
janewangfb Jul 11, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIden
import org.apache.spark.sql.catalyst.errors.TreeNodeException
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode}
import org.apache.spark.sql.catalyst.parser.ParserUtils
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan}
import org.apache.spark.sql.catalyst.trees.TreeNode
import org.apache.spark.sql.catalyst.util.quoteIdentifier
Expand Down Expand Up @@ -123,7 +124,10 @@ case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute with Un

override def toString: String = s"'$name"

override def sql: String = quoteIdentifier(name)
override def sql: String = name match {
case ParserUtils.escapedIdentifier(_) | ParserUtils.qualifiedEscapedIdentifier(_, _) => name
case _ => quoteIdentifier(name)
}
}

object UnresolvedAttribute {
Expand Down Expand Up @@ -306,6 +310,29 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu
override def toString: String = target.map(_ + ".").getOrElse("") + "*"
}

/**
* Represents all of the input attributes to a given relational operator, for example in
* "SELECT `(id)?+.+` FROM ...".
*
* @param table an optional table that should be the target of the expansion. If omitted all
* tables' columns are produced.
*/
case class UnresolvedRegex(regexPattern: String, table: Option[String], caseSensitive: Boolean)
extends Star with Unevaluable {
override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = {
val pattern = if (caseSensitive) regexPattern else s"(?i)$regexPattern"
table match {
// If there is no table specified, use all input attributes that match expr
case None => input.output.filter(_.name.matches(pattern))
// If there is a table, pick out attributes that are part of this table that match expr
case Some(t) => input.output.filter(_.qualifier.exists(resolver(_, t)))
.filter(_.name.matches(pattern))
}
}

override def toString: String = table.map(_ + "." + regexPattern).getOrElse(regexPattern)
}

/**
* Used to assign new names to Generator's output, such as hive udtf.
* For example the SQL expression "stack(2, key, value, key, value) as (a, b)" could be represented
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1259,25 +1259,37 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
}

/**
* Create a dereference expression. The return type depends on the type of the parent, this can
* either be a [[UnresolvedAttribute]] (if the parent is an [[UnresolvedAttribute]]), or an
* [[UnresolvedExtractValue]] if the parent is some expression.
* Create a dereference expression. The return type depends on the type of the parent.
* If the parent is an [[UnresolvedAttribute]], it can be a [[UnresolvedAttribute]] or
* a [[UnresolvedRegex]] for regex quoted in ``; if the parent is some other expression,
* it can be [[UnresolvedExtractValue]].
*/
override def visitDereference(ctx: DereferenceContext): Expression = withOrigin(ctx) {
val attr = ctx.fieldName.getText
expression(ctx.base) match {
case UnresolvedAttribute(nameParts) =>
UnresolvedAttribute(nameParts :+ attr)
case unresolved_attr @ UnresolvedAttribute(nameParts) =>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use a guard, e.g.: case unresolved_attr @ UnresolvedAttribute(nameParts) if conf.supportQuotedIdentifiers => . That makes the logic down the line much simpler.

Copy link
Contributor Author

@janewangfb janewangfb May 19, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is concise to put the if inside the case unresolved_attr @ UnresolvedAttribute(nameParts).

if we use guard, we still need to handle the case when the conf.supportQuotedIdentifiers is false.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about

case u @ UnresolvedAttribute(nameParts) if nameParts.length == 1 && conf.supportQuotedRegexColumnName =>
  UnresolvedRegex(ctx.fieldName.getStart.getText, Some(nameParts.head))

// If there are more dereferences, turn `UnresolvedRegex` back to `UnresolvedAttribute`
case UnresolvedRegex(regex, table) =>
  UnresolvedAttribute(table.toSeq + regex)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this wont work. In your first "case", ctx.fieldName.getStart.getText is XYZ, nameparts is XYZ. and the table part should come from ctx.base.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh sorry I made a mistake, ctx.fieldName.getText will trim the backquote?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, ctx.fieldName.getText will trim the backquote

ctx.fieldName.getStart.getText match {
case escapedIdentifier(columnNameRegex) if conf.supportQuotedRegexColumnName =>
UnresolvedRegex(columnNameRegex, Some(unresolved_attr.name), conf.caseSensitiveAnalysis)
case _ =>
UnresolvedAttribute(nameParts :+ attr)
}
case e =>
UnresolvedExtractValue(e, Literal(attr))
}
}

/**
* Create an [[UnresolvedAttribute]] expression.
* Create an [[UnresolvedAttribute]] expression or a [[UnresolvedRegex]] if it is a regex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if we always create UnresolvedRegex?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should only create UnresolvedRegex when necessary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there seems no problem if we always go with the UnresolvedRegex, then we can simplify the code and remove the logic to detect regex string.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the code complexity will be similar, because if the column is ``, we need to extract the pattern;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not talking about algorithm complexity, but saying that we can simplify the logic by avoiding detecting the regex string.

Copy link
Contributor Author

@janewangfb janewangfb May 30, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cloud-fan, the code path is shared by by both select a, select a.b and on cause etc. If it is select a.b, the table part also go here, but later there is no project expand. If it is on cause, the the string is already striped, not regex any more. Only with column names, we will have the project expanding (similar to start). So, we will need the regex pattern match to know that this is only for columns.

Do you have any suggestion? Currently Hive only supports select column regex expansion. and this PR matches the hive behavior.

* quoted in ``
*/
override def visitColumnReference(ctx: ColumnReferenceContext): Expression = withOrigin(ctx) {
UnresolvedAttribute.quoted(ctx.getText)
ctx.getStart.getText match {
case escapedIdentifier(columnNameRegex) if conf.supportQuotedRegexColumnName =>
UnresolvedRegex(columnNameRegex, None, conf.caseSensitiveAnalysis)
case _ =>
UnresolvedAttribute.quoted(ctx.getText)
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ object ParserUtils {
sb.toString()
}

/** the column name pattern in quoted regex without qualifier */
val escapedIdentifier = "`(.+)`".r
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a comment for this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added.


/** the column name pattern in quoted regex with qualifier */
val qualifiedEscapedIdentifier = ("(.+)" + """.""" + "`(.+)`").r
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these 2 seems hacky to me, we can always create UnresolvedRegex if the config is on, and UnresolvedAttribute otherwise.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when the config is on, we need to extract XYZ from XYZ pattern, thats why we need these patterns.


/** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,12 @@ object SQLConf {
.intConf
.createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt)

val SUPPORT_QUOTED_REGEX_COLUMN_NAME = buildConf("spark.sql.parser.quotedRegexColumnNames")
.doc("When true, quoted Identifiers (using backticks) in SELECT statement are interpreted" +
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not only select statement. It can be almost any query.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should only support select. It does not make sense to do select a from test where (a)?+.+=3.

Also, for hive (https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Select), it only supports the select statements:
"REGEX Column Specification
A SELECT statement can take regex-based column specification in Hive releases prior to 0.13.0, or in 0.13.0 and later releases if the configuration property hive.support.quoted.identifiers is set to none."

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. It only makes sense when we use it in SELECT statement. However, our parser allows quoted Identifiers (using backticks) in any part of the SQL statement. Below is the just the example. If we turn on this conf flag, will it cause the problem for the other users when they have quoted identifiers in the query except Project/SELECT list?

Copy link
Member

@viirya viirya Jul 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible users to use regex column in agg such as testData2.groupBy($"a", $"b").agg($"`...`")? In analyzer, seems we process Star in both Project and Aggregate.

Copy link
Contributor Author

@janewangfb janewangfb Jul 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gatorsmile for quoted Identifiers; if it not in select, it would not be affected.
however, if it is in select, it will be affected, e.g., before (a)?+.+ is invalid, now, it will be expanded.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya I tried it out, e.g.,

val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
ds.groupByKey(_._1).agg(sum("*").as[Long]) is not supported.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@janewangfb You can do something like:

scala> val df = Seq((1, 2), (2, 3), (3, 4)).toDF("a", "b")
df: org.apache.spark.sql.DataFrame = [a: int, b: int]
scala> df.groupBy("a", "b").agg(df.col("*")).show
+---+---+---+---+
|  a|  b|  a|  b|
+---+---+---+---+
|  2|  3|  2|  3|
|  1|  2|  1|  2|
|  3|  4|  3|  4|
+---+---+---+---+

So I guess you can also do something like:

scala> df.groupBy("a", "b").agg(df.colRegex("`...`"))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, df.groupBy("a", "b").agg(df.col("(a)?+.+")).show works too.

" as regular expressions.")
.booleanConf
.createWithDefault(false)

object Deprecated {
val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
}
Expand Down Expand Up @@ -1105,6 +1111,8 @@ class SQLConf extends Serializable with Logging {

def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO)

def supportQuotedRegexColumnName: Boolean = getConf(SUPPORT_QUOTED_REGEX_COLUMN_NAME)

/** ********************** SQLConf functionality methods ************ */

/** Set Spark SQL configuration properties. */
Expand Down
27 changes: 24 additions & 3 deletions sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JSONOptions}
import org.apache.spark.sql.catalyst.optimizer.CombineUnions
import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.catalyst.parser.{ParseException, ParserUtils}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection}
Expand Down Expand Up @@ -1188,8 +1188,29 @@ class Dataset[T] private[sql](
case "*" =>
Column(ResolvedStar(queryExecution.analyzed.output))
case _ =>
val expr = resolve(colName)
Column(expr)
if (sqlContext.conf.supportQuotedRegexColumnName) {
colRegex(colName)
} else {
val expr = resolve(colName)
Column(expr)
}
}

/**
* Selects column based on the column name specified as a regex and return it as [[Column]].
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add @group untypedrel and @since 2.3.0

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

* @group untypedrel
* @since 2.3.0
*/
def colRegex(colName: String): Column = {
Copy link
Member

@viirya viirya Jul 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

col returns a column resolved on the current Dataset. colRegex now can return an unresolved one. Seems ok, but any possibility to go wrong with it?

For example, we can do:

val colRegex1 = df1.colRegex("`...`")  // colRegex1 is unresolved.
df2.select(colRegex1)

But we can't do the same thing with col.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tested out. it works for both cases. and I have added testcase DatasetSuite.scala.

val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
colName match {
case ParserUtils.escapedIdentifier(columnNameRegex) =>
Column(UnresolvedRegex(columnNameRegex, None, caseSensitive))
case ParserUtils.qualifiedEscapedIdentifier(nameParts, columnNameRegex) =>
Column(UnresolvedRegex(columnNameRegex, Some(nameParts), caseSensitive))
case _ =>
Column(resolve(colName))
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, "1", "11"), (2, "2", "22"), (3, "3", "33"), (4, "4", "44"), (5, "5", "55"), (6, "6", "66")
AS testData(key, value1, value2);

CREATE OR REPLACE TEMPORARY VIEW testData2 AS SELECT * FROM VALUES
(1, 1, 1, 2), (1, 2, 1, 2), (2, 1, 2, 3), (2, 2, 2, 3), (3, 1, 3, 4), (3, 2, 3, 4)
AS testData2(A, B, c, d);

-- AnalysisException
SELECT `(a)?+.+` FROM testData2 WHERE a = 1;
SELECT t.`(a)?+.+` FROM testData2 t WHERE a = 1;
SELECT `(a|b)` FROM testData2 WHERE a = 2;
SELECT `(a|b)?+.+` FROM testData2 WHERE a = 2;

set spark.sql.parser.quotedRegexColumnNames=true;

-- Regex columns
SELECT `(a)?+.+` FROM testData2 WHERE a = 1;
SELECT `(A)?+.+` FROM testData2 WHERE a = 1;
SELECT t.`(a)?+.+` FROM testData2 t WHERE a = 1;
SELECT t.`(A)?+.+` FROM testData2 t WHERE a = 1;
SELECT `(a|B)` FROM testData2 WHERE a = 2;
SELECT `(A|b)` FROM testData2 WHERE a = 2;
SELECT `(a|B)?+.+` FROM testData2 WHERE a = 2;
SELECT `(A|b)?+.+` FROM testData2 WHERE a = 2;
SELECT `(e|f)` FROM testData2;
SELECT t.`(e|f)` FROM testData2 t;
SELECT p.`(KEY)?+.+`, b, testdata2.`(b)?+.+` FROM testData p join testData2 ON p.key = testData2.a WHERE key < 3;
SELECT p.`(key)?+.+`, b, testdata2.`(b)?+.+` FROM testData p join testData2 ON p.key = testData2.a WHERE key < 3;

set spark.sql.caseSensitive=true;

CREATE OR REPLACE TEMPORARY VIEW testdata3 AS SELECT * FROM VALUES
(0, 1), (1, 2), (2, 3), (3, 4)
AS testdata3(a, b);

-- Regex columns
SELECT `(A)?+.+` FROM testdata3;
SELECT `(a)?+.+` FROM testdata3;
Loading