Skip to content

Commit 5e67815

Browse files
committed
gh-102153: Start stripping C0 control and space chars in urlsplit
1 parent 4a3ea1f commit 5e67815

File tree

4 files changed

+54
-3
lines changed

4 files changed

+54
-3
lines changed

Doc/library/urllib.parse.rst

+7-2
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,9 @@ or on combining URL components into a URL string.
324324
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
325325
decomposed before parsing, no error will be raised.
326326

327-
Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline
328-
``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL.
327+
Following the `WHATWG spec`_ that updates RFC 3986, leading and trailing C0
328+
control and space characters are stripped from the URL. ``\n``, ``\r`` and
329+
tab ``\t`` characters are removed from the URL at any position.
329330

330331
.. versionchanged:: 3.6
331332
Out-of-range port numbers now raise :exc:`ValueError`, instead of
@@ -338,6 +339,10 @@ or on combining URL components into a URL string.
338339
.. versionchanged:: 3.10
339340
ASCII newline and tab characters are stripped from the URL.
340341

342+
.. versionchanged:: 3.12
343+
Leading and trailing C0 control and space characters are stripped from
344+
the URL.
345+
341346
.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser
342347

343348
.. function:: urlunsplit(parts)

Lib/test/test_urlparse.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -649,14 +649,52 @@ def test_urlsplit_remove_unsafe_bytes(self):
649649
self.assertEqual(p.scheme, "http")
650650
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
651651

652+
def test_urlsplit_strip_url(self):
    """Check that urlsplit() strips leading/trailing C0 controls and spaces.

    The same URL is wrapped in "noise" on both ends and parsed as str and
    as bytes; every component and geturl() must match the clean URL.  The
    *scheme* argument is checked the same way.
    """
    # Every C0 control character (0x00-0x1f) plus space (0x20).  The upper
    # bound is inclusive so that 0x1f itself is exercised.
    noise = bytes(range(0x20 + 1))
    base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"

    # str input
    url = noise.decode() + base_url + noise.decode()
    p = urllib.parse.urlsplit(url)
    self.assertEqual(p.scheme, "http")
    self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
    self.assertEqual(p.path, "/doc/")
    self.assertEqual(p.query, "query=yes")
    self.assertEqual(p.fragment, "frag")
    self.assertEqual(p.username, "User")
    self.assertEqual(p.password, "Pass")
    self.assertEqual(p.hostname, "www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), base_url)

    # bytes input
    url = noise + base_url.encode() + noise
    p = urllib.parse.urlsplit(url)
    self.assertEqual(p.scheme, b"http")
    self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
    self.assertEqual(p.path, b"/doc/")
    self.assertEqual(p.query, b"query=yes")
    self.assertEqual(p.fragment, b"frag")
    self.assertEqual(p.username, b"User")
    self.assertEqual(p.password, b"Pass")
    self.assertEqual(p.hostname, b"www.python.org")
    self.assertEqual(p.port, 80)
    self.assertEqual(p.geturl(), base_url.encode())

    # with scheme as cache-key
    # NOTE(review): the call is repeated, presumably so that a result
    # served from a cache keyed on the noisy scheme is checked too --
    # confirm against urlsplit()'s caching.
    url = "//www.python.org/"
    scheme = noise.decode() + "https" + noise.decode()
    for _ in range(2):
        p = urllib.parse.urlsplit(url, scheme=scheme)
        self.assertEqual(p.scheme, "https")
        self.assertEqual(p.geturl(), "https://www.python.org/")
689+
652690
def test_attributes_bad_port(self):
653691
"""Check handling of invalid ports."""
654692
for bytes in (False, True):
655693
for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
656694
for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):
657695
with self.subTest(bytes=bytes, parse=parse, port=port):
658696
netloc = "www.example.net:" + port
659-
url = "http://" + netloc
697+
url = "http://" + netloc + "/"
660698
if bytes:
661699
if netloc.isascii() and port.isascii():
662700
netloc = netloc.encode("ascii")

Lib/urllib/parse.py

+5
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@
7979
'0123456789'
8080
'+-.')
8181

82+
# Leading and trailing C0 control and space to be stripped per WHATWG spec
83+
# U+0000 through U+001F (the C0 controls) followed by U+0020 (space).
_URL_CHARS_TO_STRIP = "".join(map(chr, range(0x20 + 1)))
84+
8285
# Unsafe bytes to be removed per WHATWG spec
8386
# urlsplit() deletes each of these from the URL wherever it occurs (one
# str.replace() pass per character), not only at the two ends.
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
8487

@@ -452,6 +455,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
452455
"""
453456

454457
url, scheme, _coerce_result = _coerce_args(url, scheme)
458+
url = url.strip(_URL_CHARS_TO_STRIP)
459+
scheme = scheme.strip(_URL_CHARS_TO_STRIP)
455460

456461
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
457462
url = url.replace(b, "")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
:func:`urllib.parse.urlsplit` now strips leading and trailing C0 control and
2+
space characters following the controlling specification for URLs defined by
3+
WHATWG in response to CVE-2023-24329. Patch by Illia Volochii.

0 commit comments

Comments
 (0)