diff --git a/sdk/core/azure-core/CHANGELOG.md b/sdk/core/azure-core/CHANGELOG.md index 80a33bb00acc..9a8bf9291e74 100644 --- a/sdk/core/azure-core/CHANGELOG.md +++ b/sdk/core/azure-core/CHANGELOG.md @@ -6,7 +6,9 @@ - Cut hard dependency on requests library -### Breaking Changes +### Breaking Changes in the Provisional `azure.core.rest` package + +- `azure.core.rest` will not try to guess the `charset` anymore if it was impossible to extract it from `HttpResponse` analysis. This removes our dependency on `charset`. ### Key Bugs Fixed diff --git a/sdk/core/azure-core/azure/core/rest/_helpers.py b/sdk/core/azure-core/azure/core/rest/_helpers.py index 27e11299fdb5..4975d09d655c 100644 --- a/sdk/core/azure-core/azure/core/rest/_helpers.py +++ b/sdk/core/azure-core/azure/core/rest/_helpers.py @@ -50,11 +50,6 @@ from urlparse import urlparse # type: ignore except ImportError: from urllib.parse import urlparse -try: - import cchardet as chardet -except ImportError: # pragma: no cover - import chardet # type: ignore -from ..exceptions import ResponseNotReadError ################################### TYPES SECTION ######################### @@ -285,22 +280,21 @@ def from_pipeline_transport_request_helper(request_class, pipeline_transport_req ) def get_charset_encoding(response): + # type: (...) -> Optional[str] content_type = response.headers.get("Content-Type") if not content_type: return None _, params = cgi.parse_header(content_type) encoding = params.get('charset') # -> utf-8 - if encoding is None: - if content_type in ("application/json", "application/rdap+json"): - # RFC 7159 states that the default encoding is UTF-8. - # RFC 7483 defines application/rdap+json - encoding = "utf-8" - else: - try: - encoding = chardet.detect(response.content)["encoding"] - except ResponseNotReadError: - pass if encoding is None or not lookup_encoding(encoding): return None return encoding + +def decode_to_text(encoding, content): + # type: (Optional[str], bytes) -> str + if encoding == "utf-8": + encoding = "utf-8-sig" + if encoding: + return content.decode(encoding) + return codecs.getincrementaldecoder("utf-8-sig")(errors="replace").decode(content) diff --git a/sdk/core/azure-core/azure/core/rest/_rest.py b/sdk/core/azure-core/azure/core/rest/_rest.py index 56823ab6d927..295051a83c50 100644 --- a/sdk/core/azure-core/azure/core/rest/_rest.py +++ b/sdk/core/azure-core/azure/core/rest/_rest.py @@ -42,6 +42,7 @@ to_pipeline_transport_request_helper, from_pipeline_transport_request_helper, get_charset_encoding, + decode_to_text, ) from ..exceptions import ResponseNotReadError if TYPE_CHECKING: @@ -205,6 +206,7 @@ def __init__(self, **kwargs): self._json = None # this is filled in ContentDecodePolicy, when we deserialize self._connection_data_block_size = None # type: Optional[int] self._content = None # type: Optional[bytes] + self._text = None # type: Optional[str] @property def url(self): @@ -215,13 +217,18 @@ def url(self): @property def encoding(self): # type: (...) -> Optional[str] - """Returns the response encoding. By default, is specified - by the response Content-Type header. + """Returns the response encoding. + + :return: The response encoding. We either return the encoding set by the user, + or try extracting the encoding from the response's content type. If all fails, + we return `None`. + :rtype: optional[str] """ try: return self._encoding except AttributeError: - return get_charset_encoding(self) + self._encoding = get_charset_encoding(self) # type: Optional[str] + return self._encoding @encoding.setter def encoding(self, value): @@ -233,10 +240,13 @@ def encoding(self, value): def text(self): # type: (...) -> str """Returns the response body as a string""" - encoding = self.encoding - if encoding == "utf-8" or encoding is None: - encoding = "utf-8-sig" - return self.content.decode(encoding) + if self._text is None: + content = self.content + if not content: + self._text = "" + else: + self._text = decode_to_text(self.encoding, self.content) + return self._text def json(self): # type: (...) -> Any diff --git a/sdk/core/azure-core/azure/core/rest/_rest_py3.py b/sdk/core/azure-core/azure/core/rest/_rest_py3.py index 80a91004c38f..af957f767a39 100644 --- a/sdk/core/azure-core/azure/core/rest/_rest_py3.py +++ b/sdk/core/azure-core/azure/core/rest/_rest_py3.py @@ -55,7 +55,8 @@ format_parameters, to_pipeline_transport_request_helper, from_pipeline_transport_request_helper, - get_charset_encoding + get_charset_encoding, + decode_to_text, ) from ._helpers_py3 import set_content_body from ..exceptions import ResponseNotReadError @@ -235,6 +236,7 @@ def __init__( self._connection_data_block_size = None self._json = None # this is filled in ContentDecodePolicy, when we deserialize self._content = None # type: Optional[bytes] + self._text = None # type: Optional[str] @property def url(self) -> str: @@ -243,13 +245,18 @@ def url(self) -> str: @property def encoding(self) -> Optional[str]: - """Returns the response encoding. By default, is specified - by the response Content-Type header. + """Returns the response encoding. + + :return: The response encoding. We either return the encoding set by the user, + or try extracting the encoding from the response's content type. If all fails, + we return `None`. + :rtype: optional[str] """ try: return self._encoding except AttributeError: - return get_charset_encoding(self) + self._encoding: Optional[str] = get_charset_encoding(self) + return self._encoding @encoding.setter def encoding(self, value: str) -> None: @@ -259,10 +266,13 @@ def encoding(self, value: str) -> None: @property def text(self) -> str: """Returns the response body as a string""" - encoding = self.encoding - if encoding == "utf-8" or encoding is None: - encoding = "utf-8-sig" - return self.content.decode(encoding) + if self._text is None: + content = self.content + if not content: + self._text = "" + else: + self._text = decode_to_text(self.encoding, self.content) + return self._text def json(self) -> Any: """Returns the whole body as a json object. diff --git a/sdk/core/azure-core/tests/testserver_tests/async_tests/test_rest_http_response_async.py b/sdk/core/azure-core/tests/testserver_tests/async_tests/test_rest_http_response_async.py index 317b74c3bac0..47286b76254b 100644 --- a/sdk/core/azure-core/tests/testserver_tests/async_tests/test_rest_http_response_async.py +++ b/sdk/core/azure-core/tests/testserver_tests/async_tests/test_rest_http_response_async.py @@ -149,7 +149,7 @@ async def test_response_no_charset_with_ascii_content(send_request): assert response.headers["Content-Type"] == "text/plain" assert response.status_code == 200 - assert response.encoding == 'ascii' + assert response.encoding is None content = await response.read() assert content == b"Hello, world!" assert response.text == "Hello, world!" @@ -165,8 +165,8 @@ async def test_response_no_charset_with_iso_8859_1_content(send_request): request=HttpRequest("GET", "/encoding/iso-8859-1"), ) await response.read() - assert response.text == u"Accented: Österreich" - assert response.encoding == 'ISO-8859-1' + assert response.text == "Accented: �sterreich" # aiohttp is having diff behavior than requests + assert response.encoding is None # NOTE: aiohttp isn't liking this # @pytest.mark.asyncio @@ -187,7 +187,7 @@ async def test_json(send_request): ) await response.read() assert response.json() == {"greeting": "hello", "recipient": "world"} - assert response.encoding == 'utf-8' + assert response.encoding is None @pytest.mark.asyncio async def test_json_with_specified_encoding(send_request): diff --git a/sdk/core/azure-core/tests/testserver_tests/test_rest_http_response.py b/sdk/core/azure-core/tests/testserver_tests/test_rest_http_response.py index 83255119f4ab..804a98b8890e 100644 --- a/sdk/core/azure-core/tests/testserver_tests/test_rest_http_response.py +++ b/sdk/core/azure-core/tests/testserver_tests/test_rest_http_response.py @@ -138,7 +138,7 @@ def test_response_no_charset_with_ascii_content(send_request): assert response.headers["Content-Type"] == "text/plain" assert response.status_code == 200 - assert response.encoding == 'ascii' + assert response.encoding is None assert response.text == "Hello, world!" @@ -151,7 +151,7 @@ def test_response_no_charset_with_iso_8859_1_content(send_request): request=HttpRequest("GET", "/encoding/iso-8859-1"), ) assert response.text == u"Accented: Österreich" - assert response.encoding == 'ISO-8859-1' + assert response.encoding is None def test_response_set_explicit_encoding(send_request): # Deliberately incorrect charset @@ -168,7 +168,7 @@ def test_json(send_request): request=HttpRequest("GET", "/basic/json"), ) assert response.json() == {"greeting": "hello", "recipient": "world"} - assert response.encoding == 'utf-8-sig' # for requests, we use utf-8-sig instead of utf-8 bc of requests behavior + assert response.encoding is None def test_json_with_specified_encoding(send_request): response = send_request(