Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose the separator parameter added upstream to parse_qs and parse_qsl #167

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,10 @@ def test_url_query_parameter(self):
self.assertEqual(
url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
)
self.assertEqual(
url_query_parameter("product.html?id=200;foo=bar", "id", separator=';'),
'200',
)

def test_url_query_parameter_2(self):
"""
Expand Down Expand Up @@ -958,6 +962,14 @@ def test_add_or_replace_parameter_fail(self):
"http://domain/test?arg1=v3&arg2=v2",
)

@pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
def test_add_or_replace_parameter_semicolon(self):
    """Check add_or_replace_parameter on a query that uses ``;`` separators.

    Expected to fail for now; see the issue linked in the xfail marker.
    """
    base_url = "http://domain/test?arg1=v1;arg2=v2;arg3=v3"

    # A name that is not yet in the query should be appended at the end.
    appended = add_or_replace_parameter(base_url, "arg4", "v4", separator=";")
    self.assertEqual(
        appended,
        "http://domain/test?arg1=v1;arg2=v2;arg3=v3;arg4=v4",
    )

    # A name that is already in the query should get its value replaced.
    replaced = add_or_replace_parameter(base_url, "arg3", "nv3", separator=";")
    self.assertEqual(
        replaced,
        "http://domain/test?arg1=v1;arg2=v2;arg3=nv3",
    )

def test_add_or_replace_parameters(self):
url = "http://domain/test"
self.assertEqual(
Expand Down Expand Up @@ -1157,6 +1169,11 @@ def test_typical_usage(self):
"http://www.example.com/do?a=1",
)

@pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
def test_typical_usage_semicolon(self):
    """Check canonicalize_url on a query that uses ``;`` separators.

    Expected to fail for now; see the issue linked in the xfail marker.
    """
    canonical = canonicalize_url(
        "http://www.example.com/do?c=1;b=2;a=3",
        query_separator=";",
    )
    # The arguments are expected back sorted by name and still joined by ";".
    self.assertEqual(canonical, "http://www.example.com/do?a=3;b=2;c=1")

def test_port_number(self):
self.assertEqual(
canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
Expand Down
59 changes: 47 additions & 12 deletions w3lib/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import posixpath
import re
import string
from inspect import getfullargspec
from typing import (
cast,
Callable,
Expand All @@ -20,8 +21,8 @@
Union,
)
from urllib.parse import (
parse_qs,
parse_qsl,
parse_qs as _parse_qs,
parse_qsl as _parse_qsl,
ParseResult,
quote,
unquote_to_bytes,
Expand All @@ -41,6 +42,23 @@
from ._url import _SPECIAL_SCHEMES


# Inspect the standard-library ``parse_qs`` once, at import time, to find out
# whether the running interpreter accepts the ``separator`` argument. Both
# positional-or-keyword and keyword-only parameters are checked so that the
# detection does not depend on how the parameter is declared.
_PARSE_QS_ARGSPEC = getfullargspec(_parse_qs)
_REMOVE_SEPARATOR = "separator" not in (
    _PARSE_QS_ARGSPEC.args + _PARSE_QS_ARGSPEC.kwonlyargs
)


def _handle_separator(func, *args, **kwargs):
    """Call *func*, dropping ``separator`` if the interpreter lacks it.

    :param func: the callable to invoke (``urllib.parse.parse_qs`` or
        ``urllib.parse.parse_qsl`` in this module)
    :param args: positional arguments forwarded unchanged
    :param kwargs: keyword arguments forwarded to *func*; a ``separator``
        entry is removed first when ``_REMOVE_SEPARATOR`` is true
    :return: whatever *func* returns

    NOTE: when ``separator`` is dropped, the caller's choice is silently
    ignored and *func* falls back to its built-in splitting rules. Only a
    keyword ``separator`` is handled; a positional one is passed through.
    """
    if _REMOVE_SEPARATOR:
        # Avoid a TypeError on interpreters that predate the parameter.
        kwargs.pop("separator", None)
    return func(*args, **kwargs)


def parse_qs(*args, **kwargs):
    """Wrap ``urllib.parse.parse_qs``, tolerating a ``separator`` keyword.

    Arguments and return value are those of ``urllib.parse.parse_qs``; the
    ``separator`` keyword is discarded on interpreters that do not accept it.
    """
    return _handle_separator(_parse_qs, *args, **kwargs)


def parse_qsl(*args, **kwargs):
    """Wrap ``urllib.parse.parse_qsl``, tolerating a ``separator`` keyword.

    Arguments and return value are those of ``urllib.parse.parse_qsl``; the
    ``separator`` keyword is discarded on interpreters that do not accept it.
    """
    return _handle_separator(_parse_qsl, *args, **kwargs)


# error handling function for bytes-to-Unicode decoding errors with URLs
def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
error = cast(AnyUnicodeError, error)
Expand Down Expand Up @@ -200,6 +218,8 @@ def url_query_parameter(
parameter: str,
default: Optional[str] = None,
keep_blank_values: Union[bool, int] = 0,
*,
separator: str = '&',
) -> Optional[str]:
"""Return the value of a url parameter, given the url and parameter name

Expand Down Expand Up @@ -230,7 +250,9 @@ def url_query_parameter(
"""

queryparams = parse_qs(
urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
urlsplit(str(url))[3],
keep_blank_values=bool(keep_blank_values),
separator=separator,
)
if parameter in queryparams:
return queryparams[parameter][0]
Expand Down Expand Up @@ -305,9 +327,13 @@ def url_query_cleaner(
return url


def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
def _add_or_replace_parameters(url: str, params: Dict[str, str], *, separator: str = '&') -> str:
parsed = urlsplit(url)
current_args = parse_qsl(parsed.query, keep_blank_values=True)
current_args = parse_qsl(
parsed.query,
keep_blank_values=True,
separator=separator,
)

new_args = []
seen_params = set()
Expand All @@ -327,7 +353,7 @@ def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
return urlunsplit(parsed._replace(query=query))


def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
def add_or_replace_parameter(url: str, name: str, new_value: str, *, separator: str = '&') -> str:
"""Add or remove a parameter to a given url

>>> import w3lib.url
Expand All @@ -340,10 +366,10 @@ def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
>>>

"""
return _add_or_replace_parameters(url, {name: new_value})
return _add_or_replace_parameters(url, {name: new_value}, separator=separator)


def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
def add_or_replace_parameters(url: str, new_parameters: Dict[str, str], *, separator: str = '&') -> str:
"""Add or remove a parameters to a given url

>>> import w3lib.url
Expand All @@ -355,7 +381,7 @@ def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
>>>

"""
return _add_or_replace_parameters(url, new_parameters)
return _add_or_replace_parameters(url, new_parameters, separator=separator)


def path_to_file_uri(path: str) -> str:
Expand Down Expand Up @@ -528,6 +554,8 @@ def canonicalize_url(
keep_blank_values: bool = True,
keep_fragments: bool = False,
encoding: Optional[str] = None,
*,
query_separator: str = '&'
) -> str:
r"""Canonicalize the given url by applying the following procedures:

Expand Down Expand Up @@ -600,7 +628,11 @@ def canonicalize_url(
# Similar considerations apply to query parts. The functionality of
# IRIs (namely, to be able to include non-ASCII characters) can only be
# used if the query part is encoded in UTF-8.
keyvals = parse_qsl_to_bytes(query, keep_blank_values)
keyvals = parse_qsl_to_bytes(
query,
keep_blank_values,
separator=query_separator,
)

keyvals.sort()
query = urlencode(keyvals)
Expand Down Expand Up @@ -642,7 +674,10 @@ def parse_url(


def parse_qsl_to_bytes(
qs: str, keep_blank_values: bool = False
qs: str,
keep_blank_values: bool = False,
*,
separator: str = '&',
) -> List[Tuple[bytes, bytes]]:
"""Parse a query given as a string argument.

Expand All @@ -665,7 +700,7 @@ def parse_qsl_to_bytes(
# with unquote_to_bytes(s)
coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args)
qs, _coerce_result = coerce_args(qs)
pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
pairs = qs.split(separator)
r = []
for name_value in pairs:
if not name_value:
Expand Down