Skip to content

Commit 6755796

Browse files
Remove chardet/charset-normalizer. Add fallback_charset_resolver ClientSession parameter. (#7561)
Co-authored-by: Sam Bull <[email protected]>
1 parent 5816e0e commit 6755796

21 files changed

+103
-187
lines changed

Diff for: .mypy.ini

-3
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,6 @@ ignore_missing_imports = True
3535
[mypy-brotli]
3636
ignore_missing_imports = True
3737

38-
[mypy-cchardet]
39-
ignore_missing_imports = True
40-
4138
[mypy-gunicorn.*]
4239
ignore_missing_imports = True
4340

Diff for: CHANGES/7561.feature

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Replace automatic character set detection with a `fallback_charset_resolver` parameter
2+
in `ClientSession` to allow user-supplied character set detection functions.

Diff for: CONTRIBUTORS.txt

+1
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ Jesus Cea
175175
Jian Zeng
176176
Jinkyu Yi
177177
Joel Watts
178+
John Parton
178179
Jon Nabozny
179180
Jonas Krüger Svensson
180181
Jonas Obrist

Diff for: README.rst

+1-5
Original file line numberDiff line numberDiff line change
@@ -162,21 +162,17 @@ Requirements
162162
============
163163

164164
- async-timeout_
165-
- charset-normalizer_
166165
- multidict_
167166
- yarl_
168167
- frozenlist_
169168

170-
Optionally you may install the cChardet_ and aiodns_ libraries (highly
171-
recommended for sake of speed).
169+
Optionally you may install the aiodns_ library (highly recommended for the sake of speed).
172170

173-
.. _charset-normalizer: https://pypi.org/project/charset-normalizer
174171
.. _aiodns: https://pypi.python.org/pypi/aiodns
175172
.. _multidict: https://pypi.python.org/pypi/multidict
176173
.. _frozenlist: https://pypi.org/project/frozenlist/
177174
.. _yarl: https://pypi.python.org/pypi/yarl
178175
.. _async-timeout: https://pypi.python.org/pypi/async_timeout
179-
.. _cChardet: https://pypi.python.org/pypi/cchardet
180176

181177
License
182178
=======

Diff for: aiohttp/client.py

+5
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ class ClientTimeout:
162162
DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)
163163

164164
_RetType = TypeVar("_RetType")
165+
_CharsetResolver = Callable[[ClientResponse, bytes], str]
165166

166167

167168
@final
@@ -192,6 +193,7 @@ class ClientSession:
192193
"_read_bufsize",
193194
"_max_line_size",
194195
"_max_field_size",
196+
"_resolve_charset",
195197
)
196198

197199
def __init__(
@@ -221,6 +223,7 @@ def __init__(
221223
read_bufsize: int = 2**16,
222224
max_line_size: int = 8190,
223225
max_field_size: int = 8190,
226+
fallback_charset_resolver: _CharsetResolver = lambda r, b: "utf-8",
224227
) -> None:
225228
if base_url is None or isinstance(base_url, URL):
226229
self._base_url: Optional[URL] = base_url
@@ -291,6 +294,8 @@ def __init__(
291294
for trace_config in self._trace_configs:
292295
trace_config.freeze()
293296

297+
self._resolve_charset = fallback_charset_resolver
298+
294299
def __init_subclass__(cls: Type["ClientSession"]) -> None:
295300
raise TypeError(
296301
"Inheritance class {} from ClientSession "

Diff for: aiohttp/client_reqrep.py

+26-26
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from typing import (
1515
TYPE_CHECKING,
1616
Any,
17+
Callable,
1718
Dict,
1819
Iterable,
1920
List,
@@ -73,11 +74,6 @@
7374
ssl = None # type: ignore[assignment]
7475
SSLContext = object # type: ignore[misc,assignment]
7576

76-
try:
77-
import cchardet as chardet
78-
except ImportError: # pragma: no cover
79-
import charset_normalizer as chardet
80-
8177

8278
__all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")
8379

@@ -686,7 +682,7 @@ class ClientResponse(HeadersMixin):
686682
_raw_headers: RawHeaders = None # type: ignore[assignment]
687683

688684
_connection = None # current connection
689-
_source_traceback = None
685+
_source_traceback: Optional[traceback.StackSummary] = None
690686
# set up by ClientRequest after ClientResponse object creation
691687
# post-init stage allows to not change ctor signature
692688
_closed = True # to allow __del__ for non-initialized properly response
@@ -725,6 +721,15 @@ def __init__(
725721
self._loop = loop
726722
# store a reference to session #1985
727723
self._session: Optional[ClientSession] = session
724+
# Save reference to _resolve_charset, so that get_encoding() will still
725+
# work after the response has finished reading the body.
726+
if session is None:
727+
# TODO: Fix session=None in tests (see ClientRequest.__init__).
728+
self._resolve_charset: Callable[
729+
["ClientResponse", bytes], str
730+
] = lambda *_: "utf-8"
731+
else:
732+
self._resolve_charset = session._resolve_charset
728733
if loop.get_debug():
729734
self._source_traceback = traceback.extract_stack(sys._getframe(1))
730735

@@ -1012,27 +1017,22 @@ def get_encoding(self) -> str:
10121017

10131018
encoding = mimetype.parameters.get("charset")
10141019
if encoding:
1015-
try:
1016-
codecs.lookup(encoding)
1017-
except LookupError:
1018-
encoding = None
1019-
if not encoding:
1020-
if mimetype.type == "application" and (
1021-
mimetype.subtype == "json" or mimetype.subtype == "rdap"
1022-
):
1023-
# RFC 7159 states that the default encoding is UTF-8.
1024-
# RFC 7483 defines application/rdap+json
1025-
encoding = "utf-8"
1026-
elif self._body is None:
1027-
raise RuntimeError(
1028-
"Cannot guess the encoding of " "a not yet read body"
1029-
)
1030-
else:
1031-
encoding = chardet.detect(self._body)["encoding"]
1032-
if not encoding:
1033-
encoding = "utf-8"
1020+
with contextlib.suppress(LookupError):
1021+
return codecs.lookup(encoding).name
1022+
1023+
if mimetype.type == "application" and (
1024+
mimetype.subtype == "json" or mimetype.subtype == "rdap"
1025+
):
1026+
# RFC 7159 states that the default encoding is UTF-8.
1027+
# RFC 7483 defines application/rdap+json
1028+
return "utf-8"
1029+
1030+
if self._body is None:
1031+
raise RuntimeError(
1032+
"Cannot compute fallback encoding of a not yet read body"
1033+
)
10341034

1035-
return encoding
1035+
return self._resolve_charset(self, self._body)
10361036

10371037
async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
10381038
"""Read response payload and decode."""

Diff for: docs/_snippets/cchardet-unmaintained-admonition.rst

-5
This file was deleted.

Diff for: docs/client_advanced.rst

+30
Original file line numberDiff line numberDiff line change
@@ -740,3 +740,33 @@ HTTP Pipelining
740740
---------------
741741

742742
aiohttp does not support HTTP/HTTPS pipelining.
743+
744+
745+
Character Set Detection
746+
-----------------------
747+
748+
If you encounter a :exc:`UnicodeDecodeError` when using :meth:`ClientResponse.text()`
749+
this may be because the response does not include the charset needed
750+
to decode the body.
751+
752+
If you know the correct encoding for a request, you can simply specify
753+
the encoding as a parameter (e.g. ``resp.text("windows-1252")``).
754+
755+
Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
756+
can be used to introduce charset guessing functionality. When a charset is not found
757+
in the Content-Type header, this function will be called to get the charset encoding. For
758+
example, this can be used with the ``chardetng_py`` library::
759+
760+
from chardetng_py import detect
761+
762+
def charset_resolver(resp: ClientResponse, body: bytes) -> str:
763+
tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
764+
return detect(body, allow_utf8=True, tld=tld)
765+
766+
ClientSession(fallback_charset_resolver=charset_resolver)
767+
768+
Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::
769+
770+
from charset_normalizer import detect
771+
772+
ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8")

Diff for: docs/client_reference.rst

+22-37
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
5151
read_bufsize=2**16, \
5252
requote_redirect_url=True, \
5353
trust_env=False, \
54-
trace_configs=None)
54+
trace_configs=None, \
55+
fallback_charset_resolver=lambda r, b: "utf-8")
5556

5657
The class for creating client sessions and making requests.
5758

@@ -208,6 +209,16 @@ The client session supports the context manager protocol for self closing.
208209
disabling. See :ref:`aiohttp-client-tracing-reference` for
209210
more information.
210211

212+
:param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
213+
A :term:`callable` that accepts a :class:`ClientResponse` and the
214+
:class:`bytes` contents, and returns a :class:`str` which will be used as
215+
the encoding parameter to :meth:`bytes.decode()`.
216+
217+
This function will be called when the charset is not known (e.g. not specified in the
218+
Content-Type header). The default function simply defaults to ``utf-8``.
219+
220+
.. versionadded:: 3.8.6
221+
211222
.. attribute:: closed
212223

213224
``True`` if the session has been closed, ``False`` otherwise.
@@ -1406,12 +1417,8 @@ Response object
14061417
Read response's body and return decoded :class:`str` using
14071418
specified *encoding* parameter.
14081419

1409-
If *encoding* is ``None`` content encoding is autocalculated
1410-
using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
1411-
header is not provided by server.
1412-
1413-
:term:`cchardet` is used with fallback to :term:`charset-normalizer` if
1414-
*cchardet* is not available.
1420+
If *encoding* is ``None`` content encoding is determined from the
1421+
Content-Type header, or using the ``fallback_charset_resolver`` function.
14151422

14161423
Close underlying connection if data reading gets an error,
14171424
release connection otherwise.
@@ -1420,35 +1427,21 @@ Response object
14201427
``None`` for encoding autodetection
14211428
(default).
14221429

1423-
:return str: decoded *BODY*
1424-
1425-
:raise LookupError: if the encoding detected by cchardet is
1426-
unknown by Python (e.g. VISCII).
14271430

1428-
.. note::
1431+
:raises: :exc:`UnicodeDecodeError` if decoding fails. See also
1432+
:meth:`get_encoding`.
14291433

1430-
If response has no ``charset`` info in ``Content-Type`` HTTP
1431-
header :term:`cchardet` / :term:`charset-normalizer` is used for
1432-
content encoding autodetection.
1433-
1434-
It may hurt performance. If page encoding is known passing
1435-
explicit *encoding* parameter might help::
1436-
1437-
await resp.text('ISO-8859-1')
1434+
:return str: decoded *BODY*
14381435

14391436
.. method:: json(*, encoding=None, loads=json.loads, \
14401437
content_type='application/json')
14411438
:async:
14421439

14431440
Read response's body as *JSON*, return :class:`dict` using
14441441
specified *encoding* and *loader*. If data is not yet available
1445-
a ``read`` call will be done,
1442+
a ``read`` call will be done.
14461443

1447-
If *encoding* is ``None`` content encoding is autocalculated
1448-
using :term:`cchardet` or :term:`charset-normalizer` as fallback if
1449-
*cchardet* is not available.
1450-
1451-
if response's `content-type` does not match `content_type` parameter
1444+
If response's `content-type` does not match `content_type` parameter
14521445
:exc:`aiohttp.ContentTypeError` is raised.
14531446
To disable content type check pass ``None`` value.
14541447

@@ -1480,17 +1473,9 @@ Response object
14801473

14811474
.. method:: get_encoding()
14821475

1483-
Automatically detect content encoding using ``charset`` info in
1484-
``Content-Type`` HTTP header. If this info is not exists or there
1485-
are no appropriate codecs for encoding then :term:`cchardet` /
1486-
:term:`charset-normalizer` is used.
1487-
1488-
Beware that it is not always safe to use the result of this function to
1489-
decode a response. Some encodings detected by cchardet are not known by
1490-
Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.
1491-
1492-
:raise RuntimeError: if called before the body has been read,
1493-
for :term:`cchardet` usage
1476+
Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
1477+
If no charset is present or the charset is not understood by Python, the
1478+
``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.
14941479

14951480
.. versionadded:: 3.0
14961481

Diff for: docs/glossary.rst

-16
Original file line numberDiff line numberDiff line change
@@ -45,22 +45,6 @@
4545
Any object that can be called. Use :func:`callable` to check
4646
that.
4747

48-
charset-normalizer
49-
50-
The Real First Universal Charset Detector.
51-
Open, modern and actively maintained alternative to Chardet.
52-
53-
https://pypi.org/project/charset-normalizer/
54-
55-
cchardet
56-
57-
cChardet is high speed universal character encoding detector -
58-
binding to charsetdetect.
59-
60-
https://pypi.python.org/pypi/cchardet/
61-
62-
.. include:: _snippets/cchardet-unmaintained-admonition.rst
63-
6448
gunicorn
6549

6650
Gunicorn 'Green Unicorn' is a Python WSGI HTTP Server for

Diff for: docs/index.rst

+3-21
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,6 @@ Library Installation
3333
3434
$ pip install aiohttp
3535
36-
You may want to install *optional* :term:`cchardet` library as faster
37-
replacement for :term:`charset-normalizer`:
38-
39-
.. code-block:: bash
40-
41-
$ pip install cchardet
42-
43-
.. include:: _snippets/cchardet-unmaintained-admonition.rst
44-
4536
For speeding up DNS resolving by client API you may install
4637
:term:`aiodns` as well.
4738
This option is highly recommended:
@@ -53,9 +44,9 @@ This option is highly recommended:
5344
Installing all speedups in one command
5445
--------------------------------------
5546

56-
The following will get you ``aiohttp`` along with :term:`cchardet`,
57-
:term:`aiodns` and ``Brotli`` in one bundle. No need to type
58-
separate commands anymore!
47+
The following will get you ``aiohttp`` along with :term:`aiodns` and ``Brotli`` in one
48+
bundle.
49+
No need to type separate commands anymore!
5950

6051
.. code-block:: bash
6152
@@ -157,17 +148,8 @@ Dependencies
157148
============
158149

159150
- *async_timeout*
160-
- *charset-normalizer*
161151
- *multidict*
162152
- *yarl*
163-
- *Optional* :term:`cchardet` as faster replacement for
164-
:term:`charset-normalizer`.
165-
166-
Install it explicitly via:
167-
168-
.. code-block:: bash
169-
170-
$ pip install cchardet
171153

172154
- *Optional* :term:`aiodns` for fast DNS resolving. The
173155
library is highly recommended.

0 commit comments

Comments
 (0)