8 changes: 4 additions & 4 deletions R/pkg/R/functions.R
@@ -2438,12 +2438,12 @@ setMethod("date_format", signature(y = "Column", x = "character"),
 #' from_json
 #'
 #' Parses a column containing a JSON string into a Column of \code{structType} with the specified
-#' \code{schema} or array of \code{structType} if \code{asJsonArray} is set to \code{TRUE}.
+#' \code{schema} or array of \code{structType} if \code{as.json.array} is set to \code{TRUE}.
 #' If the string is unparseable, the Column will contain the value NA.
 #'
 #' @param x Column containing the JSON string.
 #' @param schema a structType object to use as the schema when parsing the JSON string.
-#' @param asJsonArray indicating if input string is JSON array of objects or a single object.
+#' @param as.json.array indicating if input string is JSON array of objects or a single object.
 #' @param ... additional named properties to control how the json is parsed, accepts the same
 #'            options as the JSON data source.
#'
@@ -2459,8 +2459,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),
 #'}
 #' @note from_json since 2.2.0
 setMethod("from_json", signature(x = "Column", schema = "structType"),
-          function(x, schema, asJsonArray = FALSE, ...) {
-            if (asJsonArray) {
+          function(x, schema, as.json.array = FALSE, ...) {
+            if (as.json.array) {
               jschema <- callJStatic("org.apache.spark.sql.types.DataTypes",
                                      "createArrayType",
                                      schema$jobj)
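A minimal Scala sketch (not part of this diff) of what the as.json.array = TRUE path delegates to on the JVM side: SparkR wraps the schema in an ArrayType via the createArrayType call above, so the Scala equivalent is from_json with ArrayType(schema). This assumes Spark 2.2+, where from_json accepts a DataType; the local session and sample JSON are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

object FromJsonArraySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("from_json-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq("""[{"name":"Bob"}, {"name":"Alice"}]""").toDF("people")
    val schema = new StructType().add("name", StringType)

    // as.json.array = TRUE in SparkR corresponds to parsing with ArrayType(schema).
    df.select(from_json($"people", ArrayType(schema)).alias("arrcol")).show(truncate = false)

    spark.stop()
  }
}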
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1454,7 +1454,7 @@ test_that("column functions", {
   jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
   df <- as.DataFrame(list(list("people" = jsonArr)))
   schema <- structType(structField("name", "string"))
-  arr <- collect(select(df, alias(from_json(df$people, schema, asJsonArray = TRUE), "arrcol")))
+  arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol")))
   expect_equal(ncol(arr), 1)
   expect_equal(nrow(arr), 1)
   expect_is(arr[[1]][[1]], "list")
@@ -577,7 +577,7 @@ private[spark] class Client(
     ).foreach { case (flist, resType, addToClasspath) =>
       flist.foreach { file =>
         val (_, localizedPath) = distribute(file, resType = resType)
-        // If addToClassPath, we ignore adding jar multiple times to distitrbuted cache.
+        // If addToClassPath, we ignore adding jar multiple times to distributed cache.
         if (addToClasspath) {
           if (localizedPath != null) {
             cachedSecondaryJarLinks += localizedPath
@@ -22,6 +22,7 @@ import java.util.Locale
 import javax.annotation.concurrent.GuardedBy
 
 import scala.collection.mutable
+import scala.util.{Failure, Success, Try}
 
 import com.google.common.cache.{Cache, CacheBuilder}
 import org.apache.hadoop.conf.Configuration
@@ -1202,15 +1203,25 @@ class SessionCatalog(
   def listFunctions(db: String, pattern: String): Seq[(FunctionIdentifier, String)] = {
     val dbName = formatDatabaseName(db)
     requireDbExists(dbName)
-    val dbFunctions = externalCatalog.listFunctions(dbName, pattern)
-      .map { f => FunctionIdentifier(f, Some(dbName)) }
-    val loadedFunctions = StringUtils.filterPattern(functionRegistry.listFunction(), pattern)
-      .map { f => FunctionIdentifier(f) }
+    val dbFunctions = externalCatalog.listFunctions(dbName, pattern).map { f =>
+      FunctionIdentifier(f, Some(dbName)) }
+    val loadedFunctions =
+      StringUtils.filterPattern(functionRegistry.listFunction(), pattern).map { f =>
+        // In functionRegistry, function names are stored as an unquoted format.
+        Try(parser.parseFunctionIdentifier(f)) match {
+          case Success(e) => e
+          case Failure(_) =>
+            // The names of some built-in functions are not parsable by our parser, e.g., %
+            FunctionIdentifier(f)
+        }
+      }
     val functions = dbFunctions ++ loadedFunctions
     // The session catalog caches some persistent functions in the FunctionRegistry
     // so there can be duplicates.
     functions.map {
       case f if FunctionRegistry.functionSet.contains(f.funcName) => (f, "SYSTEM")
       case f => (f, "USER")
-    }
+    }.distinct
   }


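The heart of the listFunctions change is the Try fallback: registry names such as "%" are not parsable identifiers, so a parse failure degrades to a bare FunctionIdentifier instead of aborting the listing. Below is a self-contained sketch of that pattern; the toy parseFunctionIdentifier is hypothetical, standing in for the catalog's parser, and only the fallback shape is taken from the diff.

import scala.util.{Failure, Success, Try}

object IdentifierFallbackSketch {
  final case class FunctionIdentifier(funcName: String, database: Option[String] = None)

  // Hypothetical stand-in for parser.parseFunctionIdentifier: accepts plain or
  // db-qualified alphanumeric names and rejects operator names such as "%".
  def parseFunctionIdentifier(raw: String): FunctionIdentifier = raw.split('.') match {
    case Array(db, fn) if db.nonEmpty && db.forall(_.isLetterOrDigit) &&
        fn.nonEmpty && fn.forall(_.isLetterOrDigit) =>
      FunctionIdentifier(fn, Some(db))
    case Array(fn) if fn.nonEmpty && fn.forall(_.isLetterOrDigit) =>
      FunctionIdentifier(fn)
    case _ =>
      throw new IllegalArgumentException(s"cannot parse '$raw'")
  }

  def main(args: Array[String]): Unit = {
    val registryNames = Seq("abs", "mydb.myudf", "%")
    val identifiers = registryNames.map { f =>
      Try(parseFunctionIdentifier(f)) match {
        case Success(ident) => ident
        case Failure(_) => FunctionIdentifier(f) // unparsable built-ins like % fall back
      }
    }
    identifiers.foreach(println) // abs; myudf in mydb; % kept as-is
  }
}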
@@ -69,7 +69,30 @@ abstract class StringRegexExpression extends BinaryExpression
  * Simple RegEx pattern matching function
  */
 @ExpressionDescription(
-  usage = "str _FUNC_ pattern - Returns true if `str` matches `pattern`, or false otherwise.")
+  usage = "str _FUNC_ pattern - Returns true if str matches pattern, " +
+    "null if any arguments are null, false otherwise.",
+  extended = """
+    Arguments:
+      str - a string expression
+      pattern - a string expression. The pattern is a string which is matched literally, with
+        exception to the following special symbols:
+
+          _ matches any one character in the input (similar to . in posix regular expressions)
+
+          % matches zero or more characters in the input (similar to .* in posix regular
+          expressions)
+
+        The escape character is '\'. If an escape character precedes a special symbol or another
+        escape character, the following character is matched literally. It is invalid to escape
+        any other character.
+
+    Examples:
+      > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%'
+      true
+
+    See also:
+      Use RLIKE to match with standard regular expressions.
+  """)
 case class Like(left: Expression, right: Expression) extends StringRegexExpression {
 
   override def escape(v: String): String = StringUtils.escapeLikeRegex(v)
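To see the documented escape rules in action outside the doc string, here is a small sketch using the Column API; the session setup is illustrative, and triple-quoted Scala strings keep the backslashes verbatim, avoiding a second layer of SQL string-literal escaping. The pattern and expected result come from the usage example above, under the new semantics this PR introduces.

import org.apache.spark.sql.SparkSession

object LikeEscapeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("like-sketch").getOrCreate()
    import spark.implicits._

    // \% and \\ match a literal % and \; the trailing % is a wildcard.
    val df = Seq("""%SystemDrive%\Users\John""").toDF("path")
    df.select($"path".like("""\%SystemDrive\%\\Users%""").alias("matched")).show()
    // Expected: true

    spark.stop()
  }
}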
@@ -19,32 +19,44 @@ package org.apache.spark.sql.catalyst.util

 import java.util.regex.{Pattern, PatternSyntaxException}
 
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.unsafe.types.UTF8String
 
 object StringUtils {
 
-  // replace the _ with .{1} exactly match 1 time of any character
-  // replace the % with .*, match 0 or more times with any character
-  def escapeLikeRegex(v: String): String = {
-    if (!v.isEmpty) {
-      "(?s)" + (' ' +: v.init).zip(v).flatMap {
-        case (prev, '\\') => ""
-        case ('\\', c) =>
-          c match {
-            case '_' => "_"
-            case '%' => "%"
-            case _ => Pattern.quote("\\" + c)
-          }
-        case (prev, c) =>
-          c match {
-            case '_' => "."
-            case '%' => ".*"
-            case _ => Pattern.quote(Character.toString(c))
-          }
-      }.mkString
-    } else {
-      v
-    }
-  }
+  /**
+   * Validate and convert SQL 'like' pattern to a Java regular expression.
+   *
+   * Underscores (_) are converted to '.' and percent signs (%) are converted to '.*', other
+   * characters are quoted literally. Escaping is done according to the rules specified in
+   * [[org.apache.spark.sql.catalyst.expressions.Like]] usage documentation. An invalid pattern will
+   * throw an [[AnalysisException]].
+   *
+   * @param pattern the SQL pattern to convert
+   * @return the equivalent Java regular expression of the pattern
+   */
+  def escapeLikeRegex(pattern: String): String = {
+    val in = pattern.toIterator
+    val out = new StringBuilder()
+
+    def fail(message: String) = throw new AnalysisException(
+      s"the pattern '$pattern' is invalid, $message")
+
+    while (in.hasNext) {
+      in.next match {
+        case '\\' if in.hasNext =>
+          val c = in.next
+          c match {
+            case '_' | '%' | '\\' => out ++= Pattern.quote(Character.toString(c))
+            case _ => fail(s"the escape character is not allowed to precede '$c'")
+          }
+        case '\\' => fail("it is not allowed to end with the escape character")
+        case '_' => out ++= "."
+        case '%' => out ++= ".*"
+        case c => out ++= Pattern.quote(Character.toString(c))
+      }
+    }
+    "(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines
+  }
 
   private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString)
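A quick usage sketch for the rewritten method (assuming Spark's catalyst module is on the classpath); the expected regex follows mechanically from the rules above: _ becomes ., % becomes .*, and everything else is Pattern.quote'd.

import java.util.regex.Pattern

import org.apache.spark.sql.catalyst.util.StringUtils

object EscapeLikeRegexSketch {
  def main(args: Array[String]): Unit = {
    val regex = StringUtils.escapeLikeRegex("a%b")
    println(regex) // expected: (?s)\Qa\E.*\Qb\E

    // (?s) turns on dotall mode, so the % wildcard also spans newlines,
    // matching the "a\nb" LIKE "a%b" cases in the test suite below.
    assert(Pattern.compile(regex).matcher("a\nb").matches())

    // An invalid escape such as """\a""" now throws AnalysisException
    // instead of silently producing a surprising regex.
  }
}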
@@ -18,16 +18,38 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.types.StringType
+import org.apache.spark.sql.types.{IntegerType, StringType}
 
 /**
  * Unit tests for regular expression (regexp) related SQL expressions.
  */
 class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
-  test("LIKE literal Regular Expression") {
-    checkEvaluation(Literal.create(null, StringType).like("a"), null)
+  /**
+   * Check if a given expression evaluates to an expected output, in case the input is
+   * a literal and in case the input is in the form of a row.
+   * @tparam A type of input
+   * @param mkExpr the expression to test for a given input
+   * @param input value that will be used to create the expression, as literal and in the form
+   *              of a row
+   * @param expected the expected output of the expression
+   * @param inputToExpression an implicit conversion from the input type to its corresponding
+   *                          sql expression
+   */
+  def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any)
+    (implicit inputToExpression: A => Expression): Unit = {
+    checkEvaluation(mkExpr(input), expected) // check literal input
+
+    val regex = 'a.string.at(0)
+    checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input
+  }
+
+  test("LIKE Pattern") {
+
+    // null handling
+    checkLiteralRow(Literal.create(null, StringType).like(_), "a", null)
     checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
     checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
     checkEvaluation(
@@ -39,45 +61,64 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null)
 
-    checkEvaluation("abdef" like "abdef", true)
-    checkEvaluation("a_%b" like "a\\__b", true)
-    checkEvaluation("addb" like "a_%b", true)
-    checkEvaluation("addb" like "a\\__b", false)
-    checkEvaluation("addb" like "a%\\%b", false)
-    checkEvaluation("a_%b" like "a%\\%b", true)
-    checkEvaluation("addb" like "a%", true)
-    checkEvaluation("addb" like "**", false)
-    checkEvaluation("abc" like "a%", true)
-    checkEvaluation("abc" like "b%", false)
-    checkEvaluation("abc" like "bc%", false)
-    checkEvaluation("a\nb" like "a_b", true)
-    checkEvaluation("ab" like "a%b", true)
-    checkEvaluation("a\nb" like "a%b", true)
-  }
+    // simple patterns
+    checkLiteralRow("abdef" like _, "abdef", true)
+    checkLiteralRow("a_%b" like _, "a\\__b", true)
+    checkLiteralRow("addb" like _, "a_%b", true)
+    checkLiteralRow("addb" like _, "a\\__b", false)
+    checkLiteralRow("addb" like _, "a%\\%b", false)
+    checkLiteralRow("a_%b" like _, "a%\\%b", true)
+    checkLiteralRow("addb" like _, "a%", true)
+    checkLiteralRow("addb" like _, "**", false)
+    checkLiteralRow("abc" like _, "a%", true)
+    checkLiteralRow("abc" like _, "b%", false)
+    checkLiteralRow("abc" like _, "bc%", false)
+    checkLiteralRow("a\nb" like _, "a_b", true)
+    checkLiteralRow("ab" like _, "a%b", true)
+    checkLiteralRow("a\nb" like _, "a%b", true)
 
-  test("LIKE Non-literal Regular Expression") {
-    val regEx = 'a.string.at(0)
-    checkEvaluation("abcd" like regEx, null, create_row(null))
-    checkEvaluation("abdef" like regEx, true, create_row("abdef"))
-    checkEvaluation("a_%b" like regEx, true, create_row("a\\__b"))
-    checkEvaluation("addb" like regEx, true, create_row("a_%b"))
-    checkEvaluation("addb" like regEx, false, create_row("a\\__b"))
-    checkEvaluation("addb" like regEx, false, create_row("a%\\%b"))
-    checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b"))
-    checkEvaluation("addb" like regEx, true, create_row("a%"))
-    checkEvaluation("addb" like regEx, false, create_row("**"))
-    checkEvaluation("abc" like regEx, true, create_row("a%"))
-    checkEvaluation("abc" like regEx, false, create_row("b%"))
-    checkEvaluation("abc" like regEx, false, create_row("bc%"))
-    checkEvaluation("a\nb" like regEx, true, create_row("a_b"))
-    checkEvaluation("ab" like regEx, true, create_row("a%b"))
-    checkEvaluation("a\nb" like regEx, true, create_row("a%b"))
+    // empty input
+    checkLiteralRow("" like _, "", true)
+    checkLiteralRow("a" like _, "", false)
+    checkLiteralRow("" like _, "a", false)
+
+    // SPARK-17647 double-escaping backslash
+    checkLiteralRow("""\\\\""" like _, """%\\%""", true)
+    checkLiteralRow("""%%""" like _, """%%""", true)
+    checkLiteralRow("""\__""" like _, """\\\__""", true)
+    checkLiteralRow("""\\\__""" like _, """%\\%\%""", false)
+    checkLiteralRow("""_\\\%""" like _, """%\\""", false)
+
+    // unicode
+    // scalastyle:off nonascii
+    checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true)
+    checkLiteralRow("a€a" like _, "_€_", true)
+    checkLiteralRow("a€a" like _, "_\u20AC_", true)
+    checkLiteralRow("a\u20ACa" like _, "_€_", true)
+    // scalastyle:on nonascii
+
+    // invalid escaping
+    val invalidEscape = intercept[AnalysisException] {
+      evaluate("""a""" like """\a""")
+    }
+    assert(invalidEscape.getMessage.contains("pattern"))
+
+    val endEscape = intercept[AnalysisException] {
+      evaluate("""a""" like """a\""")
+    }
+    assert(endEscape.getMessage.contains("pattern"))
+
+    // case
+    checkLiteralRow("A" like _, "a%", false)
+    checkLiteralRow("a" like _, "A%", false)
+    checkLiteralRow("AaA" like _, "_a_", true)
 
-    checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%"))
+    // example
+    checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true)
   }
 
-  test("RLIKE literal Regular Expression") {
-    checkEvaluation(Literal.create(null, StringType) rlike "abdef", null)
+  test("RLIKE Regular Expression") {
+    checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null)
     checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
     checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
     checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true)
@@ -87,42 +128,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(
       Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null)
 
-    checkEvaluation("abdef" rlike "abdef", true)
-    checkEvaluation("abbbbc" rlike "a.*c", true)
+    checkLiteralRow("abdef" rlike _, "abdef", true)
+    checkLiteralRow("abbbbc" rlike _, "a.*c", true)
 
-    checkEvaluation("fofo" rlike "^fo", true)
-    checkEvaluation("fo\no" rlike "^fo\no$", true)
-    checkEvaluation("Bn" rlike "^Ba*n", true)
-    checkEvaluation("afofo" rlike "fo", true)
-    checkEvaluation("afofo" rlike "^fo", false)
-    checkEvaluation("Baan" rlike "^Ba?n", false)
-    checkEvaluation("axe" rlike "pi|apa", false)
-    checkEvaluation("pip" rlike "^(pi)*$", false)
+    checkLiteralRow("fofo" rlike _, "^fo", true)
+    checkLiteralRow("fo\no" rlike _, "^fo\no$", true)
+    checkLiteralRow("Bn" rlike _, "^Ba*n", true)
+    checkLiteralRow("afofo" rlike _, "fo", true)
+    checkLiteralRow("afofo" rlike _, "^fo", false)
+    checkLiteralRow("Baan" rlike _, "^Ba?n", false)
+    checkLiteralRow("axe" rlike _, "pi|apa", false)
+    checkLiteralRow("pip" rlike _, "^(pi)*$", false)
 
-    checkEvaluation("abc" rlike "^ab", true)
-    checkEvaluation("abc" rlike "^bc", false)
-    checkEvaluation("abc" rlike "^ab", true)
-    checkEvaluation("abc" rlike "^bc", false)
+    checkLiteralRow("abc" rlike _, "^ab", true)
+    checkLiteralRow("abc" rlike _, "^bc", false)
+    checkLiteralRow("abc" rlike _, "^ab", true)
+    checkLiteralRow("abc" rlike _, "^bc", false)
 
     intercept[java.util.regex.PatternSyntaxException] {
       evaluate("abbbbc" rlike "**")
     }
-  }
-
-  test("RLIKE Non-literal Regular Expression") {
-    val regEx = 'a.string.at(0)
-    checkEvaluation("abdef" rlike regEx, true, create_row("abdef"))
-    checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c"))
-    checkEvaluation("fofo" rlike regEx, true, create_row("^fo"))
-    checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$"))
-    checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n"))
-
     intercept[java.util.regex.PatternSyntaxException] {
-      evaluate("abbbbc" rlike regEx, create_row("**"))
+      val regex = 'a.string.at(0)
+      evaluate("abbbbc" rlike regex, create_row("**"))
     }
   }
 
-
   test("RegexReplace") {
     val row1 = create_row("100-200", "(\\d+)", "num")
     val row2 = create_row("100-200", "(\\d+)", "###")