PoC of PEP 691
dstufft committed Jul 15, 2022
1 parent e89e391 commit 6f167b5
Showing 2 changed files with 106 additions and 54 deletions.
154 changes: 103 additions & 51 deletions src/pip/_internal/index/collector.py
@@ -6,6 +6,7 @@
import email.message
import functools
import itertools
import json
import logging
import os
import re
@@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
return None


class _NotHTML(Exception):
class _NotAPIContent(Exception):
def __init__(self, content_type: str, request_desc: str) -> None:
super().__init__(content_type, request_desc)
self.content_type = content_type
self.request_desc = request_desc


def _ensure_html_header(response: Response) -> None:
"""Check the Content-Type header to ensure the response contains HTML.
def _ensure_api_header(response: Response) -> None:
"""
Check the Content-Type header to ensure the response contains a Simple
API Response.
Raises `_NotHTML` if the content type is not text/html.
Raises `_NotAPIContent` if the content type is not a valid content-type.
"""
content_type = response.headers.get("Content-Type", "")
if not content_type.lower().startswith("text/html"):
raise _NotHTML(content_type, response.request.method)

content_type_l = content_type.lower()
if content_type_l.startswith("text/html"):
return
elif content_type_l.startswith("application/vnd.pypi.simple.v1+html"):
return
elif content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
return

raise _NotAPIContent(content_type, response.request.method)


class _NotHTTP(Exception):
pass


def _ensure_html_response(url: str, session: PipSession) -> None:
"""Send a HEAD request to the URL, and ensure the response contains HTML.
def _ensure_api_response(url: str, session: PipSession) -> None:
"""
Send a HEAD request to the URL, and ensure the response contains a simple
API Response.
Raises `_NotHTTP` if the URL is not available for a HEAD request, or
`_NotHTML` if the content type is not text/html.
`_NotAPIContent` if the content type is not a valid content type.
"""
scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
if scheme not in {"http", "https"}:
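PEP 691 keeps text/html as an acceptable representation alongside the two new versioned content types, which is why _ensure_api_header above accepts three prefixes rather than one. A minimal standalone sketch of that check, with an illustrative helper name and sample values that are not part of this commit:

# Illustrative sketch of the prefix check in _ensure_api_header; not pip code.
_ACCEPTED_PREFIXES = (
    "application/vnd.pypi.simple.v1+json",
    "application/vnd.pypi.simple.v1+html",
    "text/html",
)

def looks_like_simple_api(content_type: str) -> bool:
    # Prefix match, so parameters such as "; charset=utf-8" are tolerated.
    return content_type.lower().startswith(_ACCEPTED_PREFIXES)

assert looks_like_simple_api("application/vnd.pypi.simple.v1+json")
assert looks_like_simple_api("Text/HTML; charset=UTF-8")
assert not looks_like_simple_api("application/octet-stream")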
@@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
resp = session.head(url, allow_redirects=True)
raise_for_status(resp)

_ensure_html_header(resp)
_ensure_api_header(resp)


def _get_html_response(url: str, session: PipSession) -> Response:
"""Access an HTML page with GET, and return the response.
def _get_simple_response(url: str, session: PipSession) -> Response:
"""Access an Simple API response with GET, and return the response.
This consists of three parts:
1. If the URL looks suspiciously like an archive, send a HEAD first to
check the Content-Type is HTML, to avoid downloading a large file.
Raise `_NotHTTP` if the content type cannot be determined, or
`_NotHTML` if it is not HTML.
check the Content-Type is HTML or Simple API, to avoid downloading a
large file. Raise `_NotHTTP` if the content type cannot be determined, or
`_NotAPIContent` if it is not HTML or a Simple API.
2. Actually perform the request. Raise HTTP exceptions on network failures.
3. Check the Content-Type header to make sure we got HTML, and raise
`_NotHTML` otherwise.
3. Check the Content-Type header to make sure we got a Simple API response,
and raise `_NotAPIContent` otherwise.
"""
if is_archive_file(Link(url).filename):
_ensure_html_response(url, session=session)
_ensure_api_response(url, session=session)

logger.debug("Getting page %s", redact_auth_from_url(url))

resp = session.get(
url,
headers={
"Accept": "text/html",
"Accept": ", ".join(
[
"application/vnd.pypi.simple.v1+json",
"application/vnd.pypi.simple.v1+html; q=0.2",
"text/html; q=0.1",
]
),
# We don't want to blindly return cached data for
# /simple/, because authors generally expect that
# twine upload && pip install will function, but if
@@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response:
# The check for archives above only works if the url ends with
# something that looks like an archive. However that is not a
# requirement of a URL. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
_ensure_html_header(resp)
# url we cannot know ahead of time for sure if something is a
# Simple API response or not. However we can check after we've
# downloaded it.
_ensure_api_header(resp)

return resp

@@ -273,7 +293,7 @@ def _create_link_from_element(


class CacheablePageContent:
def __init__(self, page: "HTMLPage") -> None:
def __init__(self, page: "IndexContent") -> None:
assert page.cache_link_parsing
self.page = page

@@ -286,15 +306,15 @@ def __hash__(self) -> int:

class ParseLinks(Protocol):
def __call__(
self, page: "HTMLPage", use_deprecated_html5lib: bool
self, page: "IndexContent", use_deprecated_html5lib: bool
) -> Iterable[Link]:
...


def with_cached_html_pages(fn: ParseLinks) -> ParseLinks:
def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
Given a function that parses an Iterable[Link] from an IndexContent, cache the
function's result (keyed by CacheablePageContent), unless the IndexContent
`page` has `page.cache_link_parsing == False`.
"""

@@ -305,15 +325,17 @@ def wrapper(
return list(fn(cacheable_page.page, use_deprecated_html5lib))

@functools.wraps(fn)
def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]:
def wrapper_wrapper(
page: "IndexContent", use_deprecated_html5lib: bool
) -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
return list(fn(page, use_deprecated_html5lib))

return wrapper_wrapper


def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.
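with_cached_index_content keeps the existing caching strategy: each page is wrapped in CacheablePageContent, whose equality and hash are derived from the page (its URL, in pip's implementation), so a functools cache can memoize the parsed links per page, while pages created with cache_link_parsing == False bypass the cache entirely. A generic sketch of that wrapper-plus-lru_cache pattern, with purely illustrative names:

# Generic sketch of the memoization pattern; names are illustrative, not pip's.
import functools

class _CacheKey:
    def __init__(self, url: str) -> None:
        self.url = url

    def __eq__(self, other: object) -> bool:
        return isinstance(other, _CacheKey) and self.url == other.url

    def __hash__(self) -> int:
        return hash(self.url)

@functools.lru_cache(maxsize=None)
def expensive_parse(key: _CacheKey) -> list:
    return ["parsed", key.url]

# Two distinct key objects for the same URL hit the same cache entry.
a = expensive_parse(_CacheKey("https://example.test/simple/pip/"))
b = expensive_parse(_CacheKey("https://example.test/simple/pip/"))
assert a is b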
@@ -338,12 +360,35 @@ def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
yield link


@with_cached_html_pages
def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
@with_cached_index_content
def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.
Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
"""

content_type_l = page.content_type.lower()
if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
data = json.loads(page.content)
for file in data.get("files", []):
file_url = file.get("url")
if file_url is None:
continue

# The Link.yanked_reason expects an empty string instead of a boolean.
yanked_reason = file.get("yanked")
if yanked_reason and not isinstance(yanked_reason, str):
yanked_reason = ""
# The Link.yanked_reason expects None instead of False
elif not yanked_reason:
yanked_reason = None

yield Link(
_clean_link(urllib.parse.urljoin(page.url, file_url)),
comes_from=page.url,
requires_python=file.get("requires-python"),
yanked_reason=yanked_reason,
)

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return
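For reference, the JSON branch above consumes a PEP 691 project page whose files entries carry url, requires-python, and yanked keys; yanked may be a boolean or a reason string, which is why the code normalizes it to None (not yanked), "" (yanked with no reason), or the reason text before building each Link. A sketch with a made-up payload, applying the same normalization:

# Made-up PEP 691-style project page, trimmed to the keys parse_links reads.
import json

payload = json.dumps(
    {
        "meta": {"api-version": "1.0"},
        "name": "demo",
        "files": [
            {
                "filename": "demo-1.0-py3-none-any.whl",
                "url": "demo-1.0-py3-none-any.whl",
                "requires-python": ">=3.7",
                "yanked": False,
            },
            {
                "filename": "demo-0.9.tar.gz",
                "url": "demo-0.9.tar.gz",
                "yanked": "broken metadata",
            },
        ],
    }
)

for file in json.loads(payload)["files"]:
    yanked = file.get("yanked")
    if yanked and not isinstance(yanked, str):
        yanked = ""    # yanked is true but carries no reason
    elif not yanked:
        yanked = None  # not yanked (false or absent)
    print(file["url"], repr(yanked))
# demo-1.0-py3-none-any.whl None
# demo-0.9.tar.gz 'broken metadata'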
@@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
yield link


class HTMLPage:
"""Represents one page, along with its URL"""
class IndexContent:
"""Represents one response (or page), along with its URL"""

def __init__(
self,
content: bytes,
content_type: str,
encoding: Optional[str],
url: str,
cache_link_parsing: bool = True,
@@ -383,6 +429,7 @@ def __init__(
have this set to False, for example.
"""
self.content = content
self.content_type = content_type
self.encoding = encoding
self.url = url
self.cache_link_parsing = cache_link_parsing
@@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
return None


def _handle_get_page_fail(
def _handle_get_simple_fail(
link: Link,
reason: Union[str, Exception],
meth: Optional[Callable[..., None]] = None,
@@ -429,19 +476,22 @@ def _handle_get_page_fail(
meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_html_page(response: Response, cache_link_parsing: bool = True) -> HTMLPage:
def _make_index_content(
response: Response, cache_link_parsing: bool = True
) -> IndexContent:
encoding = _get_encoding_from_headers(response.headers)
return HTMLPage(
return IndexContent(
response.content,
response.headers["Content-Type"],
encoding=encoding,
url=response.url,
cache_link_parsing=cache_link_parsing,
)


def _get_html_page(
def _get_index_content(
link: Link, session: Optional[PipSession] = None
) -> Optional["HTMLPage"]:
) -> Optional["IndexContent"]:
if session is None:
raise TypeError(
"_get_html_page() missing 1 required keyword argument: 'session'"
@@ -468,37 +518,39 @@ def _get_html_page(
url += "/"
url = urllib.parse.urljoin(url, "index.html")
logger.debug(" file: URL is directory, getting %s", url)
# TODO: index.json?

try:
resp = _get_html_response(url, session=session)
resp = _get_simple_response(url, session=session)
except _NotHTTP:
logger.warning(
"Skipping page %s because it looks like an archive, and cannot "
"be checked by a HTTP HEAD request.",
link,
)
except _NotHTML as exc:
except _NotAPIContent as exc:
logger.warning(
"Skipping page %s because the %s request got Content-Type: %s."
"The only supported Content-Type is text/html",
"Skipping page %s because the %s request got Content-Type: %s. "
"The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
"application/vnd.pypi.simple.v1+html, and text/html",
link,
exc.request_desc,
exc.content_type,
)
except NetworkConnectionError as exc:
_handle_get_page_fail(link, exc)
_handle_get_simple_fail(link, exc)
except RetryError as exc:
_handle_get_page_fail(link, exc)
_handle_get_simple_fail(link, exc)
except SSLError as exc:
reason = "There was a problem confirming the ssl certificate: "
reason += str(exc)
_handle_get_page_fail(link, reason, meth=logger.info)
_handle_get_simple_fail(link, reason, meth=logger.info)
except requests.ConnectionError as exc:
_handle_get_page_fail(link, f"connection error: {exc}")
_handle_get_simple_fail(link, f"connection error: {exc}")
except requests.Timeout:
_handle_get_page_fail(link, "timed out")
_handle_get_simple_fail(link, "timed out")
else:
return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing)
return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
return None


@@ -561,11 +613,11 @@ def create(
def find_links(self) -> List[str]:
return self.search_scope.find_links

def fetch_page(self, location: Link) -> Optional[HTMLPage]:
def fetch_response(self, location: Link) -> Optional[IndexContent]:
"""
Fetch an HTML page containing package links.
"""
return _get_html_page(location, session=self.session)
return _get_index_content(location, session=self.session)

def collect_sources(
self,
6 changes: 3 additions & 3 deletions src/pip/_internal/index/package_finder.py
@@ -792,11 +792,11 @@ def process_project_url(
"Fetching project page and analyzing links: %s",
project_url,
)
html_page = self._link_collector.fetch_page(project_url)
if html_page is None:
index_response = self._link_collector.fetch_response(project_url)
if index_response is None:
return []

page_links = list(parse_links(html_page, self._use_deprecated_html5lib))
page_links = list(parse_links(index_response, self._use_deprecated_html5lib))

with indent_log():
package_links = self.evaluate_links(
