Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,30 @@ trait StringRegexExpression extends ImplicitCastInputTypes {
* Simple RegEx pattern matching function
*/
@ExpressionDescription(
usage = "str _FUNC_ pattern - Returns true if `str` matches `pattern`, or false otherwise.")
usage = "str _FUNC_ pattern - Returns true if str matches pattern, " +
"null if any arguments are null, false otherwise.",
extended = """
Arguments:
str - a string expression
pattern - a string expression. The pattern is a string which is matched literally, with
exception to the following special symbols:

_ matches any one character in the input (similar to . in posix regular expressions)

% matches zero ore more characters in the input (similar to .* in posix regular
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ore -> or?

expressions)

The escape character is '\'. If an escape character precedes a special symbol or another
escape character, the following character is matched literally. It is invalid to escape
any other character.

Examples:
> SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%'
true

See also:
Use RLIKE to match with standard regular expressions.
""")
case class Like(left: Expression, right: Expression)
extends BinaryExpression with StringRegexExpression {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,44 @@ package org.apache.spark.sql.catalyst.util

import java.util.regex.{Pattern, PatternSyntaxException}

import org.apache.spark.sql.AnalysisException
import org.apache.spark.unsafe.types.UTF8String

object StringUtils {

// replace the _ with .{1} exactly match 1 time of any character
// replace the % with .*, match 0 or more times with any character
def escapeLikeRegex(v: String): String = {
if (!v.isEmpty) {
"(?s)" + (' ' +: v.init).zip(v).flatMap {
case (prev, '\\') => ""
case ('\\', c) =>
c match {
case '_' => "_"
case '%' => "%"
case _ => Pattern.quote("\\" + c)
}
case (prev, c) =>
/**
* Validate and convert SQL 'like' pattern to a Java regular expression.
*
* Underscores (_) are converted to '.' and percent signs (%) are converted to '.*', other
* characters are quoted literally. Escaping is done according to the rules specified in
* [[org.apache.spark.sql.catalyst.expressions.Like]] usage documentation. An invalid pattern will
* throw an [[AnalysisException]].
*
* @param pattern the SQL pattern to convert
* @return the equivalent Java regular expression of the pattern
*/
def escapeLikeRegex(pattern: String): String = {
val in = pattern.toIterator
val out = new StringBuilder()

def fail(message: String) = throw new AnalysisException(
s"the pattern '$pattern' is invalid, $message")

while (in.hasNext) {
in.next match {
case '\\' if in.hasNext =>
val c = in.next
c match {
case '_' => "."
case '%' => ".*"
case _ => Pattern.quote(Character.toString(c))
case '_' | '%' | '\\' => out ++= Pattern.quote(Character.toString(c))
case _ => fail(s"the escape character is not allowed to precede '$c'")
}
}.mkString
} else {
v
case '\\' => fail("it is not allowed to end with the escape character")
case '_' => out ++= "."
case '%' => out ++= ".*"
case c => out ++= Pattern.quote(Character.toString(c))
}
}
"(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines
}

private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,38 @@
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.{IntegerType, StringType}

/**
* Unit tests for regular expression (regexp) related SQL expressions.
*/
class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

test("LIKE literal Regular Expression") {
checkEvaluation(Literal.create(null, StringType).like("a"), null)
/**
* Check if a given expression evaluates to an expected output, in case the input is
* a literal and in case the input is in the form of a row.
* @tparam A type of input
* @param mkExpr the expression to test for a given input
* @param input value that will be used to create the expression, as literal and in the form
* of a row
* @param expected the expected output of the expression
* @param inputToExpression an implicit conversion from the input type to its corresponding
* sql expression
*/
def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any)
(implicit inputToExpression: A => Expression): Unit = {
checkEvaluation(mkExpr(input), expected) // check literal input

val regex = 'a.string.at(0)
checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input
}

test("LIKE Pattern") {

// null handling
checkLiteralRow(Literal.create(null, StringType).like(_), "a", null)
checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
checkEvaluation(
Expand All @@ -39,45 +61,64 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(
Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null)

checkEvaluation("abdef" like "abdef", true)
checkEvaluation("a_%b" like "a\\__b", true)
checkEvaluation("addb" like "a_%b", true)
checkEvaluation("addb" like "a\\__b", false)
checkEvaluation("addb" like "a%\\%b", false)
checkEvaluation("a_%b" like "a%\\%b", true)
checkEvaluation("addb" like "a%", true)
checkEvaluation("addb" like "**", false)
checkEvaluation("abc" like "a%", true)
checkEvaluation("abc" like "b%", false)
checkEvaluation("abc" like "bc%", false)
checkEvaluation("a\nb" like "a_b", true)
checkEvaluation("ab" like "a%b", true)
checkEvaluation("a\nb" like "a%b", true)
}
// simple patterns
checkLiteralRow("abdef" like _, "abdef", true)
checkLiteralRow("a_%b" like _, "a\\__b", true)
checkLiteralRow("addb" like _, "a_%b", true)
checkLiteralRow("addb" like _, "a\\__b", false)
checkLiteralRow("addb" like _, "a%\\%b", false)
checkLiteralRow("a_%b" like _, "a%\\%b", true)
checkLiteralRow("addb" like _, "a%", true)
checkLiteralRow("addb" like _, "**", false)
checkLiteralRow("abc" like _, "a%", true)
checkLiteralRow("abc" like _, "b%", false)
checkLiteralRow("abc" like _, "bc%", false)
checkLiteralRow("a\nb" like _, "a_b", true)
checkLiteralRow("ab" like _, "a%b", true)
checkLiteralRow("a\nb" like _, "a%b", true)

// empty input
checkLiteralRow("" like _, "", true)
checkLiteralRow("a" like _, "", false)
checkLiteralRow("" like _, "a", false)

// SI-17647 double-escaping backslash
checkLiteralRow("""\\\\""" like _, """%\\%""", true)
checkLiteralRow("""%%""" like _, """%%""", true)
checkLiteralRow("""\__""" like _, """\\\__""", true)
checkLiteralRow("""\\\__""" like _, """%\\%\%""", false)
checkLiteralRow("""_\\\%""" like _, """%\\""", false)

// unicode
// scalastyle:off nonascii
checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true)
checkLiteralRow("a€a" like _, "_€_", true)
checkLiteralRow("a€a" like _, "_\u20AC_", true)
checkLiteralRow("a\u20ACa" like _, "_€_", true)
// scalastyle:on nonascii

// invalid escaping
val invalidEscape = intercept[AnalysisException] {
evaluate("""a""" like """\a""")
}
assert(invalidEscape.getMessage.contains("pattern"))

val endEscape = intercept[AnalysisException] {
evaluate("""a""" like """a\""")
}
assert(endEscape.getMessage.contains("pattern"))

// case
checkLiteralRow("A" like _, "a%", false)
checkLiteralRow("a" like _, "A%", false)
checkLiteralRow("AaA" like _, "_a_", true)

test("LIKE Non-literal Regular Expression") {
val regEx = 'a.string.at(0)
checkEvaluation("abcd" like regEx, null, create_row(null))
checkEvaluation("abdef" like regEx, true, create_row("abdef"))
checkEvaluation("a_%b" like regEx, true, create_row("a\\__b"))
checkEvaluation("addb" like regEx, true, create_row("a_%b"))
checkEvaluation("addb" like regEx, false, create_row("a\\__b"))
checkEvaluation("addb" like regEx, false, create_row("a%\\%b"))
checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b"))
checkEvaluation("addb" like regEx, true, create_row("a%"))
checkEvaluation("addb" like regEx, false, create_row("**"))
checkEvaluation("abc" like regEx, true, create_row("a%"))
checkEvaluation("abc" like regEx, false, create_row("b%"))
checkEvaluation("abc" like regEx, false, create_row("bc%"))
checkEvaluation("a\nb" like regEx, true, create_row("a_b"))
checkEvaluation("ab" like regEx, true, create_row("a%b"))
checkEvaluation("a\nb" like regEx, true, create_row("a%b"))

checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%"))
// example
checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true)
}

test("RLIKE literal Regular Expression") {
checkEvaluation(Literal.create(null, StringType) rlike "abdef", null)
test("RLIKE Regular Expression") {
checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null)
checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true)
Expand All @@ -87,42 +128,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(
Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null)

checkEvaluation("abdef" rlike "abdef", true)
checkEvaluation("abbbbc" rlike "a.*c", true)
checkLiteralRow("abdef" rlike _, "abdef", true)
checkLiteralRow("abbbbc" rlike _, "a.*c", true)

checkEvaluation("fofo" rlike "^fo", true)
checkEvaluation("fo\no" rlike "^fo\no$", true)
checkEvaluation("Bn" rlike "^Ba*n", true)
checkEvaluation("afofo" rlike "fo", true)
checkEvaluation("afofo" rlike "^fo", false)
checkEvaluation("Baan" rlike "^Ba?n", false)
checkEvaluation("axe" rlike "pi|apa", false)
checkEvaluation("pip" rlike "^(pi)*$", false)
checkLiteralRow("fofo" rlike _, "^fo", true)
checkLiteralRow("fo\no" rlike _, "^fo\no$", true)
checkLiteralRow("Bn" rlike _, "^Ba*n", true)
checkLiteralRow("afofo" rlike _, "fo", true)
checkLiteralRow("afofo" rlike _, "^fo", false)
checkLiteralRow("Baan" rlike _, "^Ba?n", false)
checkLiteralRow("axe" rlike _, "pi|apa", false)
checkLiteralRow("pip" rlike _, "^(pi)*$", false)

checkEvaluation("abc" rlike "^ab", true)
checkEvaluation("abc" rlike "^bc", false)
checkEvaluation("abc" rlike "^ab", true)
checkEvaluation("abc" rlike "^bc", false)
checkLiteralRow("abc" rlike _, "^ab", true)
checkLiteralRow("abc" rlike _, "^bc", false)
checkLiteralRow("abc" rlike _, "^ab", true)
checkLiteralRow("abc" rlike _, "^bc", false)

intercept[java.util.regex.PatternSyntaxException] {
evaluate("abbbbc" rlike "**")
}
}

test("RLIKE Non-literal Regular Expression") {
val regEx = 'a.string.at(0)
checkEvaluation("abdef" rlike regEx, true, create_row("abdef"))
checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c"))
checkEvaluation("fofo" rlike regEx, true, create_row("^fo"))
checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$"))
checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n"))

intercept[java.util.regex.PatternSyntaxException] {
evaluate("abbbbc" rlike regEx, create_row("**"))
val regex = 'a.string.at(0)
evaluate("abbbbc" rlike regex, create_row("**"))
}
}


test("RegexReplace") {
val row1 = create_row("100-200", "(\\d+)", "num")
val row2 = create_row("100-200", "(\\d+)", "###")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ class StringUtilsSuite extends SparkFunSuite {

test("escapeLikeRegex") {
assert(escapeLikeRegex("abdef") === "(?s)\\Qa\\E\\Qb\\E\\Qd\\E\\Qe\\E\\Qf\\E")
assert(escapeLikeRegex("a\\__b") === "(?s)\\Qa\\E_.\\Qb\\E")
assert(escapeLikeRegex("a\\__b") === "(?s)\\Qa\\E\\Q_\\E.\\Qb\\E")
assert(escapeLikeRegex("a_%b") === "(?s)\\Qa\\E..*\\Qb\\E")
assert(escapeLikeRegex("a%\\%b") === "(?s)\\Qa\\E.*%\\Qb\\E")
assert(escapeLikeRegex("a%\\%b") === "(?s)\\Qa\\E.*\\Q%\\E\\Qb\\E")
assert(escapeLikeRegex("a%") === "(?s)\\Qa\\E.*")
assert(escapeLikeRegex("**") === "(?s)\\Q*\\E\\Q*\\E")
assert(escapeLikeRegex("a_b") === "(?s)\\Qa\\E.\\Qb\\E")
Expand Down