Skip to content

Commit e5c1b82

Browse files
yikf authored and cloud-fan committed
[SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions
### What changes were proposed in this pull request?

Currently, Spark doesn't support URL encode/decode as built-in functions, so users have to fall back on `reflect`. That is a bit of a hassle, and these functions are often useful. This PR has two aims:
- add URL encode/decode as built-in functions;
- tidy up the URL-related functions into one Scala file.

### Why are the changes needed?

URL encode/decode functions are useful.

### Does this PR introduce _any_ user-facing change?

Yes, it adds new built-in functions.

### How was this patch tested?

Added new tests.

Closes #37113 from yikf/url.

Authored-by: Yikf <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 3c80ed8 commit e5c1b82

File tree

9 files changed

+513
-226
lines changed

9 files changed

+513
-226
lines changed

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ public class ExpressionInfo {
4545
"collection_funcs", "predicate_funcs", "conditional_funcs", "conversion_funcs",
4646
"csv_funcs", "datetime_funcs", "generator_funcs", "hash_funcs", "json_funcs",
4747
"lambda_funcs", "map_funcs", "math_funcs", "misc_funcs", "string_funcs", "struct_funcs",
48-
"window_funcs", "xml_funcs", "table_funcs"));
48+
"window_funcs", "xml_funcs", "table_funcs", "url_funcs"));
4949

5050
private static final Set<String> validSources =
5151
new HashSet<>(Arrays.asList("built-in", "hive", "python_udf", "scala_udf", "java_udf"));

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,6 @@ object FunctionRegistry {
544544
expressionBuilder("lpad", LPadExpressionBuilder),
545545
expression[StringTrimLeft]("ltrim"),
546546
expression[JsonTuple]("json_tuple"),
547-
expression[ParseUrl]("parse_url"),
548547
expression[StringLocate]("position", true),
549548
expression[FormatString]("printf", true),
550549
expression[RegExpExtract]("regexp_extract"),
@@ -588,6 +587,11 @@ object FunctionRegistry {
588587
expression[RegExpSubStr]("regexp_substr"),
589588
expression[RegExpInStr]("regexp_instr"),
590589

590+
// url functions
591+
expression[UrlEncode]("url_encode"),
592+
expression[UrlDecode]("url_decode"),
593+
expression[ParseUrl]("parse_url"),
594+
591595
// datetime functions
592596
expression[AddMonths]("add_months"),
593597
expression[CurrentDate]("current_date"),

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 0 additions & 177 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,9 @@
1717

1818
package org.apache.spark.sql.catalyst.expressions
1919

20-
import java.net.{URI, URISyntaxException}
2120
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
2221
import java.util.{Base64 => JBase64}
2322
import java.util.{HashMap, Locale, Map => JMap}
24-
import java.util.regex.Pattern
2523

2624
import scala.collection.mutable.ArrayBuffer
2725

@@ -1626,181 +1624,6 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera
16261624
copy(str = newFirst, len = newSecond, pad = newThird)
16271625
}
16281626

1629-
object ParseUrl {
1630-
private val HOST = UTF8String.fromString("HOST")
1631-
private val PATH = UTF8String.fromString("PATH")
1632-
private val QUERY = UTF8String.fromString("QUERY")
1633-
private val REF = UTF8String.fromString("REF")
1634-
private val PROTOCOL = UTF8String.fromString("PROTOCOL")
1635-
private val FILE = UTF8String.fromString("FILE")
1636-
private val AUTHORITY = UTF8String.fromString("AUTHORITY")
1637-
private val USERINFO = UTF8String.fromString("USERINFO")
1638-
private val REGEXPREFIX = "(&|^)"
1639-
private val REGEXSUBFIX = "=([^&]*)"
1640-
}
1641-
1642-
/**
1643-
* Extracts a part from a URL
1644-
*/
1645-
@ExpressionDescription(
1646-
usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
1647-
examples = """
1648-
Examples:
1649-
> SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
1650-
spark.apache.org
1651-
> SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
1652-
query=1
1653-
> SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
1654-
1
1655-
""",
1656-
since = "2.0.0",
1657-
group = "string_funcs")
1658-
case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled)
1659-
extends Expression with ExpectsInputTypes with CodegenFallback {
1660-
def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled)
1661-
1662-
override def nullable: Boolean = true
1663-
override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType)
1664-
override def dataType: DataType = StringType
1665-
override def prettyName: String = "parse_url"
1666-
1667-
// If the url is a constant, cache the URL object so that we don't need to convert url
1668-
// from UTF8String to String to URL for every row.
1669-
@transient private lazy val cachedUrl = children(0) match {
1670-
case Literal(url: UTF8String, _) if url ne null => getUrl(url)
1671-
case _ => null
1672-
}
1673-
1674-
// If the key is a constant, cache the Pattern object so that we don't need to convert key
1675-
// from UTF8String to String to StringBuilder to String to Pattern for every row.
1676-
@transient private lazy val cachedPattern = children(2) match {
1677-
case Literal(key: UTF8String, _) if key ne null => getPattern(key)
1678-
case _ => null
1679-
}
1680-
1681-
// If the partToExtract is a constant, cache the Extract part function so that we don't need
1682-
// to check the partToExtract for every row.
1683-
@transient private lazy val cachedExtractPartFunc = children(1) match {
1684-
case Literal(part: UTF8String, _) => getExtractPartFunc(part)
1685-
case _ => null
1686-
}
1687-
1688-
import ParseUrl._
1689-
1690-
override def checkInputDataTypes(): TypeCheckResult = {
1691-
if (children.size > 3 || children.size < 2) {
1692-
TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments")
1693-
} else {
1694-
super[ExpectsInputTypes].checkInputDataTypes()
1695-
}
1696-
}
1697-
1698-
private def getPattern(key: UTF8String): Pattern = {
1699-
Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX)
1700-
}
1701-
1702-
private def getUrl(url: UTF8String): URI = {
1703-
try {
1704-
new URI(url.toString)
1705-
} catch {
1706-
case e: URISyntaxException if failOnError =>
1707-
throw QueryExecutionErrors.invalidUrlError(url, e)
1708-
case _: URISyntaxException => null
1709-
}
1710-
}
1711-
1712-
private def getExtractPartFunc(partToExtract: UTF8String): URI => String = {
1713-
1714-
// partToExtract match {
1715-
// case HOST => _.toURL().getHost
1716-
// case PATH => _.toURL().getPath
1717-
// case QUERY => _.toURL().getQuery
1718-
// case REF => _.toURL().getRef
1719-
// case PROTOCOL => _.toURL().getProtocol
1720-
// case FILE => _.toURL().getFile
1721-
// case AUTHORITY => _.toURL().getAuthority
1722-
// case USERINFO => _.toURL().getUserInfo
1723-
// case _ => (url: URI) => null
1724-
// }
1725-
1726-
partToExtract match {
1727-
case HOST => _.getHost
1728-
case PATH => _.getRawPath
1729-
case QUERY => _.getRawQuery
1730-
case REF => _.getRawFragment
1731-
case PROTOCOL => _.getScheme
1732-
case FILE =>
1733-
(url: URI) =>
1734-
if (url.getRawQuery ne null) {
1735-
url.getRawPath + "?" + url.getRawQuery
1736-
} else {
1737-
url.getRawPath
1738-
}
1739-
case AUTHORITY => _.getRawAuthority
1740-
case USERINFO => _.getRawUserInfo
1741-
case _ => (url: URI) => null
1742-
}
1743-
}
1744-
1745-
private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = {
1746-
val m = pattern.matcher(query.toString)
1747-
if (m.find()) {
1748-
UTF8String.fromString(m.group(2))
1749-
} else {
1750-
null
1751-
}
1752-
}
1753-
1754-
private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = {
1755-
if (cachedExtractPartFunc ne null) {
1756-
UTF8String.fromString(cachedExtractPartFunc.apply(url))
1757-
} else {
1758-
UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url))
1759-
}
1760-
}
1761-
1762-
private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = {
1763-
if (cachedUrl ne null) {
1764-
extractFromUrl(cachedUrl, partToExtract)
1765-
} else {
1766-
val currentUrl = getUrl(url)
1767-
if (currentUrl ne null) {
1768-
extractFromUrl(currentUrl, partToExtract)
1769-
} else {
1770-
null
1771-
}
1772-
}
1773-
}
1774-
1775-
override def eval(input: InternalRow): Any = {
1776-
val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]}
1777-
if (evaluated.contains(null)) return null
1778-
if (evaluated.size == 2) {
1779-
parseUrlWithoutKey(evaluated(0), evaluated(1))
1780-
} else {
1781-
// 3-arg, i.e. QUERY with key
1782-
assert(evaluated.size == 3)
1783-
if (evaluated(1) != QUERY) {
1784-
return null
1785-
}
1786-
1787-
val query = parseUrlWithoutKey(evaluated(0), evaluated(1))
1788-
if (query eq null) {
1789-
return null
1790-
}
1791-
1792-
if (cachedPattern ne null) {
1793-
extractValueFromQuery(query, cachedPattern)
1794-
} else {
1795-
extractValueFromQuery(query, getPattern(evaluated(2)))
1796-
}
1797-
}
1798-
}
1799-
1800-
override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl =
1801-
copy(children = newChildren)
1802-
}
1803-
18041627
/**
18051628
* Returns the input formatted according to printf-style format strings
18061629
*/

0 commit comments

Comments
 (0)