-
Notifications
You must be signed in to change notification settings - Fork 15
/
http.py
263 lines (201 loc) · 8.91 KB
/
http.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
from __future__ import annotations
import json
from hashlib import sha1
from typing import Any, Optional, TypeVar, Union
from urllib.parse import urljoin
import attrs
from w3lib.encoding import (
html_body_declared_encoding,
html_to_unicode,
http_content_type_encoding,
read_bom,
resolve_encoding,
)
from w3lib.url import canonicalize_url
from web_poet._base import _HttpHeaders
from web_poet.mixins import SelectableMixin, UrlShortcutsMixin
from web_poet.utils import _create_deprecated_class, memoizemethod_noargs
from .url import RequestUrl as _RequestUrl
from .url import ResponseUrl as _ResponseUrl
# Type variable restricted to ``_HttpHeaders`` and its subclasses.
T_headers = TypeVar("T_headers", bound=_HttpHeaders)
# Public names ``RequestUrl`` / ``ResponseUrl`` are built from the classes
# imported from ``.url`` via ``_create_deprecated_class``.
# NOTE(review): presumably these are backward-compatibility aliases that warn
# when used -- confirm against ``web_poet.utils._create_deprecated_class``.
RequestUrl = _create_deprecated_class("RequestUrl", _RequestUrl)
ResponseUrl = _create_deprecated_class("ResponseUrl", _ResponseUrl)
class HttpRequestBody(bytes):
    """Raw body of an HTTP request, kept as bytes.

    This is a plain ``bytes`` subclass that adds no behaviour of its own;
    it exists so that request bodies have a dedicated type.
    """
class HttpResponseBody(bytes):
    """Raw body of an HTTP response, kept as bytes.

    On top of the regular ``bytes`` API it offers helpers for encoding
    detection and JSON decoding.
    """

    def bom_encoding(self) -> Optional[str]:
        """Return the encoding reported by ``read_bom`` for this body.

        Only the first element of the ``read_bom`` result is returned.
        """
        bom_info = read_bom(self)
        return bom_info[0]

    def declared_encoding(self) -> Optional[str]:
        """Return the encoding declared inside the HTML body (meta tags),
        or ``None`` when no suitable declaration is found."""
        meta_encoding = html_body_declared_encoding(self)
        return meta_encoding

    def json(self) -> Any:
        """Parse the body as a JSON document and return the Python object."""
        parsed = json.loads(self)
        return parsed
class HttpRequestHeaders(_HttpHeaders):
    """Container for the headers of an HTTP request.

    It can be built from an iterable of ``(name, value)`` tuples:

    >>> pairs = [("Content-Encoding", "gzip"), ("content-length", "648")]
    >>> HttpRequestHeaders(pairs)
    <HttpRequestHeaders('Content-Encoding': 'gzip', 'content-length': '648')>

    It also accepts a mapping of key-value pairs:

    >>> pairs = {"Content-Encoding": "gzip", "content-length": "648"}
    >>> headers = HttpRequestHeaders(pairs)
    >>> headers
    <HttpRequestHeaders('Content-Encoding': 'gzip', 'content-length': '648')>

    Header names are looked up case-insensitively:

    >>> headers.get("content-encoding")
    'gzip'
    >>> headers.get("Content-Length")
    '648'

    These are only some of the features inherited from
    :class:`multidict.CIMultiDict`; see the API reference of
    :class:`multidict.CIMultiDict` for the rest.
    """
class HttpResponseHeaders(_HttpHeaders):
    """Container for the headers of an HTTP response.

    It can be built from an iterable of ``(name, value)`` tuples:

    >>> pairs = [("Content-Encoding", "gzip"), ("content-length", "648")]
    >>> HttpResponseHeaders(pairs)
    <HttpResponseHeaders('Content-Encoding': 'gzip', 'content-length': '648')>

    It also accepts a mapping of key-value pairs:

    >>> pairs = {"Content-Encoding": "gzip", "content-length": "648"}
    >>> headers = HttpResponseHeaders(pairs)
    >>> headers
    <HttpResponseHeaders('Content-Encoding': 'gzip', 'content-length': '648')>

    Header names are looked up case-insensitively:

    >>> headers.get("content-encoding")
    'gzip'
    >>> headers.get("Content-Length")
    '648'

    These are only some of the features inherited from
    :class:`multidict.CIMultiDict`; see the API reference of
    :class:`multidict.CIMultiDict` for the rest.
    """

    def declared_encoding(self) -> Optional[str]:
        """Return the encoding found in the ``Content-Type`` header, or
        ``None`` when the header yields no encoding."""
        # A missing header is treated as an empty Content-Type value.
        return http_content_type_encoding(self.get("Content-Type", ""))
@attrs.define(auto_attribs=False, slots=False, eq=False)
class HttpRequest:
    """A generic HTTP request, as consumed by other parts of **web-poet**
    such as :class:`~.HttpClient`.

    .. tip:: To build a request to submit an HTML form, use the
        :doc:`form2request library <form2request:index>`, which provides
        integration with web-poet.
    """

    # Positional; any input is passed through the ``_RequestUrl`` converter.
    url: _RequestUrl = attrs.field(converter=_RequestUrl)
    # Keyword-only; defaults to "GET".
    method: str = attrs.field(kw_only=True, default="GET")
    # Keyword-only; an empty ``HttpRequestHeaders`` when omitted.
    headers: HttpRequestHeaders = attrs.field(
        kw_only=True, converter=HttpRequestHeaders, factory=HttpRequestHeaders
    )
    # Keyword-only; an empty ``HttpRequestBody`` when omitted.
    body: HttpRequestBody = attrs.field(
        kw_only=True, converter=HttpRequestBody, factory=HttpRequestBody
    )

    def urljoin(self, url: Union[str, _RequestUrl, _ResponseUrl]) -> _RequestUrl:
        """Resolve *url* against :attr:`url` and return the absolute URL.

        An already absolute *url* is returned as such; a relative one is
        joined onto this request's URL.
        """
        base = str(self.url)
        reference = str(url)
        return _RequestUrl(urljoin(base, reference))
@attrs.define(auto_attribs=False, slots=False, eq=False)
class HttpResponse(SelectableMixin, UrlShortcutsMixin):
    """Holds the contents of a response downloaded directly with an HTTP
    client.

    ``url`` should be the URL of the response (after all redirects) rather
    than the URL of the request, whenever possible.

    ``body`` holds the raw HTTP response body.

    The remaining parameters are optional, because whether they are available
    depends on where the ``HttpResponse`` comes from. For instance, a response
    may be built from a local HTML file, which has neither ``headers`` nor
    ``status``.

    ``status`` should be the integer status code of the HTTP response.

    ``headers`` should hold the HTTP response headers.

    ``encoding`` is the encoding of the response. If None (default), the
    encoding is auto-detected from the headers and the body content.
    """

    url: _ResponseUrl = attrs.field(converter=_ResponseUrl)
    body: HttpResponseBody = attrs.field(converter=HttpResponseBody)
    status: Optional[int] = attrs.field(kw_only=True, default=None)
    headers: HttpResponseHeaders = attrs.field(
        kw_only=True, converter=HttpResponseHeaders, factory=HttpResponseHeaders
    )
    _encoding: Optional[str] = attrs.field(kw_only=True, default=None)

    # Fallback encoding handed to ``html_to_unicode`` and tried first by
    # ``_auto_detect_fun``.
    _DEFAULT_ENCODING = "ascii"
    # Decoded body, filled in lazily by ``text`` or ``_body_inferred_encoding``.
    _cached_text: Optional[str] = None

    @property
    def text(self) -> str:
        """
        Body of the response decoded to unicode with the detected
        encoding, following web browser rules (honouring the Content-Type
        header, etc.)
        """
        # Resolve the encoding before looking at the cache: detecting the
        # encoding may itself populate ``self._cached_text``.
        detected = self.encoding
        if self._cached_text is not None:
            return self._cached_text
        _, decoded = html_to_unicode(f"charset={detected}", self.body)
        self._cached_text = decoded
        return decoded

    def _selector_input(self) -> str:
        """Return the decoded text of the response."""
        return self.text

    @property
    def encoding(self) -> Optional[str]:
        """Encoding of the response"""
        if self._encoding:
            return self._encoding
        # Sources are consulted in priority order; the first truthy answer
        # wins and later detectors are not called.
        detectors = (
            self._body_bom_encoding,
            self._headers_declared_encoding,
            self._body_declared_encoding,
        )
        for detect in detectors:
            found = detect()
            if found:
                return found
        # Last resort: infer the encoding from the body itself.
        return self._body_inferred_encoding()

    @memoizemethod_noargs
    def json(self) -> Any:
        """Parse the body as a JSON document and return the Python object."""
        return self.body.json()

    @memoizemethod_noargs
    def _body_bom_encoding(self) -> Optional[str]:
        """Return the encoding reported by the body's ``bom_encoding()``."""
        return self.body.bom_encoding()

    @memoizemethod_noargs
    def _headers_declared_encoding(self) -> Optional[str]:
        """Return the encoding reported by the headers' ``declared_encoding()``."""
        return self.headers.declared_encoding()

    @memoizemethod_noargs
    def _body_declared_encoding(self) -> Optional[str]:
        """Return the encoding reported by the body's ``declared_encoding()``."""
        return self.body.declared_encoding()

    @memoizemethod_noargs
    def _body_inferred_encoding(self) -> Optional[str]:
        """Run ``html_to_unicode`` on the body and return the encoding it
        reports, storing the decoded text in ``_cached_text`` on the way."""
        header_value = self.headers.get("Content-Type", "")
        inferred, decoded = html_to_unicode(
            header_value,
            self.body,
            # FIXME: type ignore can be removed when the following is released:
            # https://github.com/scrapy/w3lib/pull/190
            auto_detect_fun=self._auto_detect_fun,  # type: ignore[arg-type]
            default_encoding=self._DEFAULT_ENCODING,
        )
        # Keep the decoded text so that ``text`` need not decode again.
        self._cached_text = decoded
        return inferred

    def _auto_detect_fun(self, body: bytes) -> Optional[str]:
        """Return the resolved name of the first candidate encoding that
        decodes *body* without error, or ``None`` if none of them does."""
        for candidate in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
            try:
                body.decode(candidate)
            except UnicodeError:
                pass
            else:
                return resolve_encoding(candidate)
        return None
def request_fingerprint(req: HttpRequest) -> str:
    """Return the fingerprint of the request.

    The fingerprint is the SHA-1 hex digest computed over, in this order:
    the method, the canonicalized URL, the headers sorted by their
    ``items()`` pairs (names title-cased), an empty line, and the body.
    """
    header_lines = [
        f"{name.title()}:{value}\n".encode()
        for name, value in sorted(req.headers.items())
    ]
    chunks = [
        req.method.encode() + b"\n",
        canonicalize_url(str(req.url)).encode() + b"\n",
        *header_lines,
        b"\n",  # separates the header section from the body
        req.body,
    ]
    hasher = sha1()
    for chunk in chunks:
        hasher.update(chunk)
    return hasher.hexdigest()