apache · cloud-fan · Jul 9, 2020 · Jul 10, 2020 · yaooqinn · Jul 10, 2020
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
@@ -127,7 +127,7 @@ By default `spark.sql.ansi.enabled` is false.
 
 Below is a list of all the keywords in Spark SQL.
 
-|Keyword|Spark SQL<br/>ANSI Mode|Spark SQL<br/>Default Mode|SQL-2011|
+|Keyword|Spark SQL<br/>ANSI Mode|Spark SQL<br/>Default Mode|SQL-2016|
 |-------|----------------------|-------------------------|--------|
 |ADD|non-reserved|non-reserved|non-reserved|
 |AFTER|non-reserved|non-reserved|non-reserved|
@@ -149,7 +149,7 @@ Below is a list of all the keywords in Spark SQL.
 |BUCKETS|non-reserved|non-reserved|non-reserved|
 |BY|non-reserved|non-reserved|reserved|
 |CACHE|non-reserved|non-reserved|non-reserved|
-|CASCADE|non-reserved|non-reserved|reserved|
+|CASCADE|non-reserved|non-reserved|non-reserved|
 |CASE|reserved|non-reserved|reserved|
 |CAST|reserved|non-reserved|reserved|
 |CHANGE|non-reserved|non-reserved|non-reserved|
@@ -193,7 +193,7 @@ Below is a list of all the keywords in Spark SQL.
 |DIRECTORY|non-reserved|non-reserved|non-reserved|
 |DISTINCT|reserved|non-reserved|reserved|
 |DISTRIBUTE|non-reserved|non-reserved|non-reserved|
-|DIV|non-reserved|non-reserved|non-reserved|
+|DIV|non-reserved|non-reserved|not a keyword|
 |DROP|non-reserved|non-reserved|reserved|
 |ELSE|reserved|non-reserved|reserved|
 |END|reserved|non-reserved|reserved|
@@ -228,7 +228,7 @@ Below is a list of all the keywords in Spark SQL.
 |GROUPING|non-reserved|non-reserved|reserved|
 |HAVING|reserved|non-reserved|reserved|
 |HOUR|reserved|non-reserved|reserved|
-|IF|non-reserved|non-reserved|reserved|
+|IF|non-reserved|non-reserved|not a keyword|
 |IGNORE|non-reserved|non-reserved|non-reserved|
 |IMPORT|non-reserved|non-reserved|non-reserved|
 |IN|reserved|non-reserved|reserved|
@@ -302,12 +302,14 @@ Below is a list of all the keywords in Spark SQL.
 |PROPERTIES|non-reserved|non-reserved|non-reserved|
 |PURGE|non-reserved|non-reserved|non-reserved|
 |QUERY|non-reserved|non-reserved|non-reserved|
+|RANGE|non-reserved|non-reserved|reserved|
 |RECORDREADER|non-reserved|non-reserved|non-reserved|
 |RECORDWRITER|non-reserved|non-reserved|non-reserved|
 |RECOVER|non-reserved|non-reserved|non-reserved|
 |REDUCE|non-reserved|non-reserved|non-reserved|
 |REFERENCES|reserved|non-reserved|reserved|
 |REFRESH|non-reserved|non-reserved|non-reserved|
+|REGEXP|non-reserved|non-reserved|not a keyword|
 |RENAME|non-reserved|non-reserved|non-reserved|
 |REPAIR|non-reserved|non-reserved|non-reserved|
 |REPLACE|non-reserved|non-reserved|non-reserved|
@@ -323,6 +325,7 @@ Below is a list of all the keywords in Spark SQL.
 |ROW|non-reserved|non-reserved|reserved|
 |ROWS|non-reserved|non-reserved|reserved|
 |SCHEMA|non-reserved|non-reserved|non-reserved|
+|SCHEMAS|non-reserved|non-reserved|not a keyword|
 |SECOND|reserved|non-reserved|reserved|
 |SELECT|reserved|non-reserved|reserved|
 |SEMI|non-reserved|strict-non-reserved|non-reserved|
@@ -348,6 +351,7 @@ Below is a list of all the keywords in Spark SQL.
 |TABLES|non-reserved|non-reserved|non-reserved|
 |TABLESAMPLE|non-reserved|non-reserved|reserved|
 |TBLPROPERTIES|non-reserved|non-reserved|non-reserved|
+|TEMP|non-reserved|non-reserved|not a keyword|
 |TEMPORARY|non-reserved|non-reserved|non-reserved|
 |TERMINATED|non-reserved|non-reserved|non-reserved|
 |THEN|reserved|non-reserved|reserved|
@@ -360,6 +364,7 @@ Below is a list of all the keywords in Spark SQL.
 |TRIM|non-reserved|non-reserved|non-reserved|
 |TRUE|non-reserved|non-reserved|reserved|
 |TRUNCATE|non-reserved|non-reserved|reserved|
+|TYPE|non-reserved|non-reserved|non-reserved|
 |UNARCHIVE|non-reserved|non-reserved|non-reserved|
 |UNBOUNDED|non-reserved|non-reserved|non-reserved|
 |UNCACHE|non-reserved|non-reserved|non-reserved|

diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -1531,6 +1531,7 @@ DIRECTORIES: 'DIRECTORIES';
 DIRECTORY: 'DIRECTORY';
 DISTINCT: 'DISTINCT';
 DISTRIBUTE: 'DISTRIBUTE';
+DIV: 'DIV';
 DROP: 'DROP';
 ELSE: 'ELSE';
 END: 'END';
@@ -1738,7 +1739,6 @@ MINUS: '-';
 ASTERISK: '*';
 SLASH: '/';
 PERCENT: '%';
-DIV: 'DIV';
 TILDE: '~';
 AMPERSAND: '&';
 PIPE: '|';

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SQLKeywordSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/SQLKeywordSuite.scala
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst
+
+import java.io.File
+import java.nio.file.Files
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.catalyst.util.fileToString
+
+trait SQLKeywordUtils extends SQLHelper {
+
+  val sqlSyntaxDefs = {
+    val sqlBasePath = {
+      java.nio.file.Paths.get(sparkHome, "sql", "catalyst", "src", "main", "antlr4", "org",
+        "apache", "spark", "sql", "catalyst", "parser", "SqlBase.g4").toFile
+    }
+    fileToString(sqlBasePath).split("\n")
+  }
+
+  // each element is an array of 4 string: the keyword name, reserve or not in Spark ANSI mode,
+  // Spark non-ANSI mode, and the SQL standard.
+  val keywordsInDoc: Array[Array[String]] = {
+    val docPath = {
+      java.nio.file.Paths.get(sparkHome, "docs", "sql-ref-ansi-compliance.md").toFile
+    }
+    fileToString(docPath).split("\n")
+      .dropWhile(!_.startsWith("|Keyword|")).drop(2).takeWhile(_.startsWith("|"))
+      .map(_.stripPrefix("|").split("\\|").map(_.trim))
+  }
+
+  private def parseAntlrGrammars[T](startTag: String, endTag: String)
+      (f: PartialFunction[String, Seq[T]]): Set[T] = {
+    val keywords = new mutable.ArrayBuffer[T]
+    val default = (_: String) => Nil
+    var startTagFound = false
+    var parseFinished = false
+    val lineIter = sqlSyntaxDefs.toIterator
+    while (!parseFinished && lineIter.hasNext) {
+      val line = lineIter.next()
+      if (line.trim.startsWith(startTag)) {
+        startTagFound = true
+      } else if (line.trim.startsWith(endTag)) {
+        parseFinished = true
+      } else if (startTagFound) {
+        f.applyOrElse(line, default).foreach { symbol =>
+          keywords += symbol
+        }
+      }
+    }
+    assert(keywords.nonEmpty && startTagFound && parseFinished, "cannot extract keywords from " +
+      s"the `SqlBase.g4` file, so please check if the start/end tags (`$startTag` and `$endTag`) " +
+      "are placed correctly in the file.")
+    keywords.toSet
+  }
+
+  // If a symbol does not have the same string with its literal (e.g., `SETMINUS: 'MINUS';`),
+  // we need to map a symbol to actual literal strings.
+  val symbolsToExpandIntoDifferentLiterals = {
+    val kwDef = """([A-Z_]+):(.+);""".r
+    val keywords = parseAntlrGrammars(
+        "//--SPARK-KEYWORD-LIST-START", "//--SPARK-KEYWORD-LIST-END") {
+      case kwDef(symbol, literalDef) =>
+        val splitDefs = literalDef.split("""\|""")
+        val hasMultipleLiterals = splitDefs.length > 1
+        // The case where a symbol has multiple literal definitions,
+        // e.g., `DATABASES: 'DATABASES' | 'SCHEMAS';`.
+        if (hasMultipleLiterals) {
+          // Filters out inappropriate entries, e.g., `!` in `NOT: 'NOT' | '!';`
+          val litDef = """([A-Z_]+)""".r
+          val literals = splitDefs.map(_.replaceAll("'", "").trim).toSeq.flatMap {
+            case litDef(lit) => Some(lit)
+            case _ => None
+          }
+          (symbol, literals) :: Nil
+        } else {
+          val literal = literalDef.replaceAll("'", "").trim
+          // The case where a symbol string and its literal string are different,
+          // e.g., `SETMINUS: 'MINUS';`.
+          if (symbol != literal) {
+            (symbol, literal :: Nil) :: Nil
+          } else {
+            Nil
+          }
+        }
+    }
+    keywords.toMap
+  }
+
+  // All the SQL keywords defined in `SqlBase.g4`
+  val allCandidateKeywords: Set[String] = {
+    val kwDef = """([A-Z_]+):.+;""".r
+    parseAntlrGrammars(
+        "//--SPARK-KEYWORD-LIST-START", "//--SPARK-KEYWORD-LIST-END") {
+      // Parses a pattern, e.g., `AFTER: 'AFTER';`
+      case kwDef(symbol) =>
+        if (symbolsToExpandIntoDifferentLiterals.contains(symbol)) {
+          symbolsToExpandIntoDifferentLiterals(symbol)
+        } else {
+          symbol :: Nil
+        }
+    }
+  }
+
+  val nonReservedKeywordsInAnsiMode: Set[String] = {
+    val kwDef = """\s*[\|:]\s*([A-Z_]+)\s*""".r
+    parseAntlrGrammars("//--ANSI-NON-RESERVED-START", "//--ANSI-NON-RESERVED-END") {
+      // Parses a pattern, e.g., `    | AFTER`
+      case kwDef(symbol) =>
+        if (symbolsToExpandIntoDifferentLiterals.contains(symbol)) {
+          symbolsToExpandIntoDifferentLiterals(symbol)
+        } else {
+          symbol :: Nil
+        }
+    }
+  }
+
+  val reservedKeywordsInAnsiMode = allCandidateKeywords -- nonReservedKeywordsInAnsiMode
+}
+
+class SQLKeywordSuite extends SparkFunSuite with SQLKeywordUtils {
+  test("all keywords are documented") {
+    val documentedKeywords = keywordsInDoc.map(_.head).toSet
+    if (allCandidateKeywords != documentedKeywords) {
+      val undocumented = (allCandidateKeywords -- documentedKeywords).toSeq.sorted
+      fail("Some keywords are not documented: " + undocumented.mkString(", "))
+    }
+  }
+
+  test("Spark keywords are documented correctly") {
+    val reservedKeywordsInDoc = keywordsInDoc.filter(_.apply(1) == "reserved").map(_.head).toSet
+    if (reservedKeywordsInAnsiMode != reservedKeywordsInDoc) {
+      val misImplemented = (reservedKeywordsInDoc -- reservedKeywordsInAnsiMode).toSeq.sorted
+      fail("Some keywords are documented as reserved but actually not: " +
+        misImplemented.mkString(", "))
+    }
+  }
+
+  test("SQL 2016 keywords are documented correctly") {
+    withTempDir { dir =>
+      val tmpFile = new File(dir, "tmp")
+      val is = Thread.currentThread().getContextClassLoader
+        .getResourceAsStream("ansi-sql-2016-reserved-keywords.txt")
+      Files.copy(is, tmpFile.toPath)
+      val reservedKeywordsInSql2016 = Files.readAllLines(tmpFile.toPath)
+        .asScala.filterNot(_.startsWith("--")).map(_.trim).toSet
+      val documented = keywordsInDoc.filter(_.last == "reserved").map(_.head).toSet
+      assert((documented -- reservedKeywordsInSql2016).isEmpty)
+    }
+  }
+}