|
17 | 17 |
|
18 | 18 | package org.apache.spark.sql.catalyst.expressions |
19 | 19 |
|
20 | | -import java.net.{URI, URISyntaxException} |
21 | 20 | import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} |
22 | 21 | import java.util.{Base64 => JBase64} |
23 | 22 | import java.util.{HashMap, Locale, Map => JMap} |
24 | | -import java.util.regex.Pattern |
25 | 23 |
|
26 | 24 | import scala.collection.mutable.ArrayBuffer |
27 | 25 |
|
@@ -1626,181 +1624,6 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera |
1626 | 1624 | copy(str = newFirst, len = newSecond, pad = newThird) |
1627 | 1625 | } |
1628 | 1626 |
|
1629 | | -object ParseUrl { |
1630 | | - private val HOST = UTF8String.fromString("HOST") |
1631 | | - private val PATH = UTF8String.fromString("PATH") |
1632 | | - private val QUERY = UTF8String.fromString("QUERY") |
1633 | | - private val REF = UTF8String.fromString("REF") |
1634 | | - private val PROTOCOL = UTF8String.fromString("PROTOCOL") |
1635 | | - private val FILE = UTF8String.fromString("FILE") |
1636 | | - private val AUTHORITY = UTF8String.fromString("AUTHORITY") |
1637 | | - private val USERINFO = UTF8String.fromString("USERINFO") |
1638 | | - private val REGEXPREFIX = "(&|^)" |
1639 | | - private val REGEXSUBFIX = "=([^&]*)" |
1640 | | -} |
1641 | | - |
1642 | | -/** |
1643 | | - * Extracts a part (HOST, PATH, QUERY, REF, PROTOCOL, FILE, AUTHORITY or USERINFO) from a URL; with a third argument and part QUERY, extracts the value of that key from the query string. Evaluates to null when any argument evaluates to null. |
1644 | | - */ |
1645 | | -@ExpressionDescription( |
1646 | | - usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.", |
1647 | | - examples = """ |
1648 | | - Examples: |
1649 | | - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST'); |
1650 | | - spark.apache.org |
1651 | | - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY'); |
1652 | | - query=1 |
1653 | | - > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query'); |
1654 | | - 1 |
1655 | | - """, |
1656 | | - since = "2.0.0", |
1657 | | - group = "string_funcs") |
1658 | | -case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled) |
1659 | | - extends Expression with ExpectsInputTypes with CodegenFallback { |
1660 | | - def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled) |
1661 | | - |
1662 | | - override def nullable: Boolean = true |
1663 | | - override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType) |
1664 | | - override def dataType: DataType = StringType |
1665 | | - override def prettyName: String = "parse_url" |
1666 | | - |
1667 | | - // If the url argument is a non-null literal, parse it into a URI once and reuse it for every row. |
1668 | | - // Remains null for non-literal urls, and for literals getUrl cannot parse when failOnError is false. |
1669 | | - @transient private lazy val cachedUrl = children(0) match { |
1670 | | - case Literal(url: UTF8String, _) if url ne null => getUrl(url) |
1671 | | - case _ => null |
1672 | | - } |
1673 | | - |
1674 | | - // If the key argument is a non-null literal, compile its lookup Pattern once and reuse it for every row. |
1675 | | - // Read only on the three-argument path of eval. NOTE(review): getPattern splices the key into the regex unquoted, so regex metacharacters in a key act as pattern syntax. |
1676 | | - @transient private lazy val cachedPattern = children(2) match { |
1677 | | - case Literal(key: UTF8String, _) if key ne null => getPattern(key) |
1678 | | - case _ => null |
1679 | | - } |
1680 | | - |
1681 | | - // If partToExtract is a literal, resolve the part-extraction function once so the part name |
1682 | | - // is not matched again for every row. |
1683 | | - @transient private lazy val cachedExtractPartFunc = children(1) match { |
1684 | | - case Literal(part: UTF8String, _) => getExtractPartFunc(part) |
1685 | | - case _ => null |
1686 | | - } |
1687 | | - |
1688 | | - import ParseUrl._ |
1689 | | - |
1690 | | - override def checkInputDataTypes(): TypeCheckResult = { |
1691 | | - if (children.size > 3 || children.size < 2) { |
1692 | | - TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments") |
1693 | | - } else { |
1694 | | - super[ExpectsInputTypes].checkInputDataTypes() |
1695 | | - } |
1696 | | - } |
1697 | | - |
1698 | | - private def getPattern(key: UTF8String): Pattern = { |
1699 | | - Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX) |
1700 | | - } |
1701 | | - |
1702 | | - private def getUrl(url: UTF8String): URI = { |
1703 | | - try { |
1704 | | - new URI(url.toString) |
1705 | | - } catch { |
1706 | | - case e: URISyntaxException if failOnError => |
1707 | | - throw QueryExecutionErrors.invalidUrlError(url, e) |
1708 | | - case _: URISyntaxException => null |
1709 | | - } |
1710 | | - } |
1711 | | - |
1712 | | - private def getExtractPartFunc(partToExtract: UTF8String): URI => String = { |
1713 | | - |
1714 | | - // partToExtract match { |
1715 | | - // case HOST => _.toURL().getHost |
1716 | | - // case PATH => _.toURL().getPath |
1717 | | - // case QUERY => _.toURL().getQuery |
1718 | | - // case REF => _.toURL().getRef |
1719 | | - // case PROTOCOL => _.toURL().getProtocol |
1720 | | - // case FILE => _.toURL().getFile |
1721 | | - // case AUTHORITY => _.toURL().getAuthority |
1722 | | - // case USERINFO => _.toURL().getUserInfo |
1723 | | - // case _ => (url: URI) => null |
1724 | | - // } |
1725 | | - |
1726 | | - partToExtract match { |
1727 | | - case HOST => _.getHost |
1728 | | - case PATH => _.getRawPath |
1729 | | - case QUERY => _.getRawQuery |
1730 | | - case REF => _.getRawFragment |
1731 | | - case PROTOCOL => _.getScheme |
1732 | | - case FILE => |
1733 | | - (url: URI) => |
1734 | | - if (url.getRawQuery ne null) { |
1735 | | - url.getRawPath + "?" + url.getRawQuery |
1736 | | - } else { |
1737 | | - url.getRawPath |
1738 | | - } |
1739 | | - case AUTHORITY => _.getRawAuthority |
1740 | | - case USERINFO => _.getRawUserInfo |
1741 | | - case _ => (url: URI) => null |
1742 | | - } |
1743 | | - } |
1744 | | - |
1745 | | - private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = { |
1746 | | - val m = pattern.matcher(query.toString) |
1747 | | - if (m.find()) { |
1748 | | - UTF8String.fromString(m.group(2)) |
1749 | | - } else { |
1750 | | - null |
1751 | | - } |
1752 | | - } |
1753 | | - |
1754 | | - private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = { |
1755 | | - if (cachedExtractPartFunc ne null) { |
1756 | | - UTF8String.fromString(cachedExtractPartFunc.apply(url)) |
1757 | | - } else { |
1758 | | - UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url)) |
1759 | | - } |
1760 | | - } |
1761 | | - |
1762 | | - private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = { |
1763 | | - if (cachedUrl ne null) { |
1764 | | - extractFromUrl(cachedUrl, partToExtract) |
1765 | | - } else { |
1766 | | - val currentUrl = getUrl(url) |
1767 | | - if (currentUrl ne null) { |
1768 | | - extractFromUrl(currentUrl, partToExtract) |
1769 | | - } else { |
1770 | | - null |
1771 | | - } |
1772 | | - } |
1773 | | - } |
1774 | | - |
1775 | | - override def eval(input: InternalRow): Any = { |
1776 | | - val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]} |
1777 | | - if (evaluated.contains(null)) return null |
1778 | | - if (evaluated.size == 2) { |
1779 | | - parseUrlWithoutKey(evaluated(0), evaluated(1)) |
1780 | | - } else { |
1781 | | - // Three arguments: key lookup is only supported for the QUERY part; any other part yields null. |
1782 | | - assert(evaluated.size == 3) |
1783 | | - if (evaluated(1) != QUERY) { |
1784 | | - return null |
1785 | | - } |
1786 | | - |
1787 | | - val query = parseUrlWithoutKey(evaluated(0), evaluated(1)) |
1788 | | - if (query eq null) { |
1789 | | - return null |
1790 | | - } |
1791 | | - |
1792 | | - if (cachedPattern ne null) { |
1793 | | - extractValueFromQuery(query, cachedPattern) |
1794 | | - } else { |
1795 | | - extractValueFromQuery(query, getPattern(evaluated(2))) |
1796 | | - } |
1797 | | - } |
1798 | | - } |
1799 | | - |
1800 | | - override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl = |
1801 | | - copy(children = newChildren) |
1802 | | -} |
1803 | | - |
1804 | 1627 | /** |
1805 | 1628 | * Returns the input formatted according to printf-style format strings |
1806 | 1629 | */ |
|
0 commit comments