apache · janewangfb · May 18, 2017 · May 18, 2017 · May 18, 2017 · May 18, 2017
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -83,6 +83,28 @@ case class UnresolvedTableValuedFunction(
   override lazy val resolved = false
 }
 
+/**
+ * Represents the input attributes of a relational operator whose names match a regex, for
+ * example in "SELECT `(id)?+.+` FROM ...".
+ *
+ * @param regexPattern regex that each attribute's name is tested against with `matches`.
+ * @param table optional table qualifier; if omitted, all input attributes are considered.
+ */
+case class UnresolvedRegex(regexPattern: String, table: Option[String])
+  extends Star with Unevaluable {
+  override def expand(input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = {
+    table match {
+      // If there is no table specified, use all input attributes whose name matches regexPattern
+      case None => input.output.filter(_.name.matches(regexPattern))
+      // If there is a table, pick out this table's attributes whose name matches regexPattern
+      case Some(t) => input.output.filter(_.qualifier.exists(resolver(_, t)))
+        .filter(_.name.matches(regexPattern))
+    }
+  }
+
+  override def toString: String = table.map(_ + ".").getOrElse("") + regexPattern
+}
+
 /**
  * Holds the name of an attribute that has yet to be resolved.
  */

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -1230,25 +1230,37 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
   }
 
   /**
-   * Create a dereference expression. The return type depends on the type of the parent, this can
-   * either be a [[UnresolvedAttribute]] (if the parent is an [[UnresolvedAttribute]]), or an
-   * [[UnresolvedExtractValue]] if the parent is some expression.
+   * Create a dereference expression. The return type depends on the type of the parent: for an
+   * [[UnresolvedAttribute]] parent it is an [[UnresolvedRegex]] when the field name is quoted in
+   * `` and conf.supportQuotedIdentifiers is true, otherwise an [[UnresolvedAttribute]]; for any
+   * other parent expression it is an [[UnresolvedExtractValue]].
    */
   override def visitDereference(ctx: DereferenceContext): Expression = withOrigin(ctx) {
     val attr = ctx.fieldName.getText
     expression(ctx.base) match {
-      case UnresolvedAttribute(nameParts) =>
-        UnresolvedAttribute(nameParts :+ attr)
+      case unresolved_attr @ UnresolvedAttribute(nameParts) =>
+        matchEscapedIdentifier(ctx.fieldName.getStart.getText) match {
+          case Some(i) if conf.supportQuotedIdentifiers =>
+            UnresolvedRegex(i, Some(unresolved_attr.name))
+          case _ =>
+            UnresolvedAttribute(nameParts :+ attr)
+        }
       case e =>
         UnresolvedExtractValue(e, Literal(attr))
     }
   }
 
   /**
-   * Create an [[UnresolvedAttribute]] expression.
+   * Create an [[UnresolvedAttribute]] expression, or an [[UnresolvedRegex]] when the identifier
+   * is quoted in `` and conf.supportQuotedIdentifiers is true.
    */
   override def visitColumnReference(ctx: ColumnReferenceContext): Expression = withOrigin(ctx) {
-    UnresolvedAttribute.quoted(ctx.getText)
+    matchEscapedIdentifier(ctx.getStart.getText) match {
+      case Some(i) if conf.supportQuotedIdentifiers =>
+        UnresolvedRegex(i, None)
+      case _ =>
+        UnresolvedAttribute.quoted(ctx.getText)
+    }
   }
 
   /**

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -177,6 +177,18 @@ object ParserUtils {
     sb.toString()
   }
 
+  val escapedIdentifier = "`(.+)`".r
+
+  /**
+   * Return the text between the enclosing backticks of b, or None if b is not backquoted.
+   */
+  def matchEscapedIdentifier(b: String): Option[String] = {
+    b match {
+      case escapedIdentifier(i) => Some(i)
+      case _ => None
+    }
+  }
+
   /** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
   implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
     /**

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -795,6 +795,12 @@ object SQLConf {
       .intConf
       .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt)
 
+  val SUPPORT_QUOTED_IDENTIFIERS = buildConf("spark.sql.support.quoted.identifiers")
+    .internal()
+    .doc("When true, identifiers specified by regex patterns will be expanded.")
+    .booleanConf
+    .createWithDefault(false)
+
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
   }
@@ -1051,6 +1057,8 @@ class SQLConf extends Serializable with Logging {
 
   def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO)
 
+  def supportQuotedIdentifiers: Boolean = getConf(SUPPORT_QUOTED_IDENTIFIERS)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -244,6 +244,40 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       ("a", ClassData("a", 1)), ("b", ClassData("b", 2)), ("c", ClassData("c", 3)))
   }
 
+  test("select 3, regex") {
+    val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDF()
+    intercept[AnalysisException] {
+      ds.select(expr("`(_1)?+.+`").as[Int])
+    }
+
+    intercept[AnalysisException] {
+      ds.select(expr("`(_1|_2)`").as[Int])
+    }
+
+    withSQLConf(SQLConf.SUPPORT_QUOTED_IDENTIFIERS.key -> "true") {
+      checkDataset(
+        ds.select(expr("`(_1)?+.+`").as[Int]),
+        1, 2, 3)
+      val m = ds.select(expr("`(_1|_2)`"))
+
+      checkDataset(
+        ds.select(expr("`(_1|_2)`"))
+          .select(expr("named_struct('a', _1, 'b', _2)").as[ClassData]),
+        ClassData("a", 1), ClassData("b", 2), ClassData("c", 3))
+
+      checkDataset(
+        ds.alias("g")
+          .select(expr("g.`(_1)?+.+`").as[Int]),
+        1, 2, 3)
+
+      checkDataset(
+        ds.alias("g")
+          .select(expr("g.`(_1|_2)`"))
+          .select(expr("named_struct('a', _1, 'b', _2)").as[ClassData]),
+        ClassData("a", 1), ClassData("b", 2), ClassData("c", 3))
+    }
+  }
+
   test("filter") {
     val ds = Seq(("a", 1), ("b", 2), ("c", 3)).toDS()
     checkDataset(

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2624,4 +2624,92 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
     val e = intercept[AnalysisException](sql("SELECT nvl(1, 2, 3)"))
     assert(e.message.contains("Invalid number of arguments"))
   }
+
+  test("SPARK-12139: REGEX Column Specification for Hive Queries") {
+    // spark.sql.support.quoted.identifiers is turned off by default
+    checkAnswer(
+      sql(
+        """
+          |SELECT b
+          |FROM testData2
+          |WHERE a = 1
+        """.stripMargin),
+      Row(1) :: Row(2) :: Nil)
+
+    checkAnswer(
+      sql(
+        """
+          |SELECT t.b
+          |FROM testData2 t
+          |WHERE a = 1
+        """.stripMargin),
+      Row(1) :: Row(2) :: Nil)
+
+    intercept[AnalysisException] {
+      sql(
+        """
+          |SELECT `(a)?+.+`
+          |FROM testData2
+          |WHERE a = 1
+        """.stripMargin)
+    }
+
+    intercept[AnalysisException] {
+      sql(
+        """
+          |SELECT t.`(a)?+.+`
+          |FROM testData2 t
+          |WHERE a = 1
+        """.stripMargin)
+    }
+
+    // now, turn on spark.sql.support.quoted.identifiers
+    withSQLConf(SQLConf.SUPPORT_QUOTED_IDENTIFIERS.key -> "true") {
+      checkAnswer(
+        sql(
+          """
+            |SELECT b
+            |FROM testData2
+            |WHERE a = 1
+          """.stripMargin),
+        Row(1) :: Row(2) :: Nil)
+
+      checkAnswer(
+        sql(
+          """
+            |SELECT t.b
+            |FROM testData2 t
+            |WHERE a = 1
+          """.stripMargin),
+        Row(1) :: Row(2) :: Nil)
+
+      checkAnswer(
+        sql(
+          """
+            |SELECT `(a)?+.+`
+            |FROM testData2
+            |WHERE a = 1
+          """.stripMargin),
+        Row(1) :: Row(2) :: Nil)
+
+      checkAnswer(
+        sql(
+          """
+            |SELECT t.`(a)?+.+`
+            |FROM testData2 t
+            |WHERE a = 1
+          """.stripMargin),
+        Row(1) :: Row(2) :: Nil)
+
+      checkAnswer(
+        sql(
+          """
+            |SELECT p.`(key)?+.+`, b, testdata2.`(b)?+.+`
+            |FROM testData p join testData2
+            |ON p.key = testData2.a
+            |WHERE key < 3
+          """.stripMargin),
+        Row("1", 1, 1) :: Row("1", 2, 1) :: Row("2", 1, 2) :: Row("2", 2, 2) :: Nil)
+    }
+  }
 }