Skip to content

Commit

Permalink
feat(pandas): add ExtractURLField functions
Browse files Browse the repository at this point in the history
  • Loading branch information
mesejo authored and jcrist committed Jul 15, 2023
1 parent a2ae7ae commit e369333
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 2 deletions.
77 changes: 77 additions & 0 deletions ibis/backends/pandas/execution/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
import operator
from functools import partial, reduce
from urllib.parse import parse_qs, urlsplit

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -481,3 +482,79 @@ def execute_json_getitem_series_str_int(_, data, key, **kwargs):
@execute_node.register(ops.JSONGetItem, pd.Series, pd.Series)
def execute_json_getitem_series_series(_, data, key, **kwargs):
    """Apply ``try_getitem`` pairwise to the elements of ``data`` and ``key``.

    Elements are paired positionally (iteration stops at the shorter input)
    and the results are collected into an object-dtype Series.
    """
    looked_up = [try_getitem(value, item) for value, item in zip(data, key)]
    return pd.Series(looked_up, dtype="object")


def _extract_url_field(data, field_name):
    """Extract one component of a URL, or of every URL in a Series.

    Parameters
    ----------
    data
        A single URL string, or a ``pd.Series`` of URL strings.
    field_name
        Name of an attribute of the ``urllib.parse.urlsplit`` result, such
        as ``"scheme"``, ``"netloc"``, ``"path"``, ``"fragment"`` or
        ``"hostname"``.

    Returns
    -------
    str or pd.Series
        For a string input, the requested attribute of the split URL (``""``
        if the split result has no such attribute). For a Series input, a
        Series with the same index, dtype and name holding the attribute for
        each element; missing values (``None``/NaN/``pd.NA``) are passed
        through unchanged.
    """

    def extract(url):
        # Propagate nulls instead of feeding them to urlsplit, which returns
        # bytes for None and raises for NaN / pd.NA.
        if not isinstance(url, str) and pd.isna(url):
            return url
        return getattr(urlsplit(url), field_name, "")

    if isinstance(data, str):
        return extract(data)

    return pd.Series(
        [extract(url) for url in data],
        # Keep the caller's index so the result stays aligned with ``data``.
        index=data.index,
        dtype=data.dtype,
        name=data.name,
    )


@execute_node.register(ops.ExtractProtocol, (pd.Series, str))
def execute_extract_protocol(op, data, **kwargs):
    """Return the ``scheme`` component of the URL(s) in ``data``."""
    return _extract_url_field(data, field_name="scheme")


@execute_node.register(ops.ExtractAuthority, (pd.Series, str))
def execute_extract_authority(op, data, **kwargs):
    """Return the ``netloc`` component of the URL(s) in ``data``."""
    return _extract_url_field(data, field_name="netloc")


@execute_node.register(ops.ExtractPath, (pd.Series, str))
def execute_extract_path(op, data, **kwargs):
    """Return the ``path`` component of the URL(s) in ``data``."""
    return _extract_url_field(data, field_name="path")


@execute_node.register(ops.ExtractFragment, (pd.Series, str))
def execute_extract_fragment(op, data, **kwargs):
    """Return the ``fragment`` component of the URL(s) in ``data``."""
    return _extract_url_field(data, field_name="fragment")


@execute_node.register(ops.ExtractHost, (pd.Series, str))
def execute_extract_host(op, data, **kwargs):
    """Return the ``hostname`` attribute of the split URL(s) in ``data``."""
    return _extract_url_field(data, field_name="hostname")


@execute_node.register(ops.ExtractQuery, (pd.Series, str), (str, type(None)))
def execute_extract_query(op, data, key, **kwargs):
    """Extract the query string, or one query parameter, from URL(s).

    Parameters
    ----------
    op
        The operation node being executed (unused here).
    data
        A single URL string, or a ``pd.Series`` of URL strings.
    key
        Name of the query parameter to extract, or ``None`` to return the
        whole query string.

    Returns
    -------
    str, list, None or pd.Series
        With ``key=None``, the raw query string of each URL. Otherwise the
        parameter's value: a single string when it occurs once, a list of
        strings when it occurs several times, and ``None`` when the URL does
        not carry the parameter. For a Series input the result is a Series
        with the same index, dtype and name; missing input values are passed
        through unchanged.
    """

    def extract_query_param(url, param_name):
        # Propagate nulls rather than handing them to urlsplit.
        if not isinstance(url, str) and pd.isna(url):
            return url

        query = urlsplit(url).query
        if param_name is None:
            return query

        # ``.get`` instead of indexing: a URL without the requested
        # parameter yields a null rather than raising KeyError. Note that
        # parse_qs drops blank values by default, so ``?a=`` also reports
        # ``a`` as missing.
        values = parse_qs(query).get(param_name)
        if not values:
            return None
        return values if len(values) > 1 else values[0]

    if isinstance(data, str):
        return extract_query_param(data, key)

    return pd.Series(
        [extract_query_param(url, key) for url in data],
        # Keep the caller's index so the result stays aligned with ``data``.
        index=data.index,
        dtype=data.dtype,
        name=data.name,
    )


@execute_node.register(ops.ExtractUserInfo, (pd.Series, str))
def execute_extract_user_info(op, data, **kwargs):
    """Extract the ``username:password`` part of URL(s).

    Parameters
    ----------
    op
        The operation node being executed (unused here).
    data
        A single URL string, or a ``pd.Series`` of URL strings.

    Returns
    -------
    str or pd.Series
        ``"<username>:<password>"`` for each URL, where an absent username
        or password is rendered as an empty string (so a URL without any
        user info gives ``":"``). For a Series input the result is a Series
        with the same index, dtype and name; missing input values are passed
        through unchanged.
    """

    def extract_user_info(url):
        # Propagate nulls rather than handing them to urlsplit.
        if not isinstance(url, str) and pd.isna(url):
            return url

        url_parts = urlsplit(url)

        # ``username`` / ``password`` are None when absent from the URL.
        username = url_parts.username or ""
        password = url_parts.password or ""

        return f"{username}:{password}"

    if isinstance(data, str):
        return extract_user_info(data)

    return pd.Series(
        [extract_user_info(url) for url in data],
        # Keep the caller's index so the result stays aligned with ``data``.
        index=data.index,
        dtype=data.dtype,
        name=data.name,
    )
7 changes: 5 additions & 2 deletions ibis/backends/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,11 @@ def test_substr_with_null_values(backend, alltypes, df):
lambda d: d.file(),
"/docs/books/tutorial/index.html?name=networking",
id="file",
marks=[
pytest.mark.notimpl(
["pandas", "dask"], raises=com.OperationNotDefinedError
),
],
),
param(lambda d: d.path(), "/docs/books/tutorial/index.html", id="path"),
param(lambda d: d.query(), "name=networking", id="query"),
Expand All @@ -965,12 +970,10 @@ def test_substr_with_null_values(backend, alltypes, df):
@pytest.mark.notimpl(
[
"bigquery",
"dask",
"datafusion",
"duckdb",
"mssql",
"mysql",
"pandas",
"polars",
"postgres",
"pyspark",
Expand Down

0 comments on commit e369333

Please sign in to comment.