Use data-dist-info-metadata (PEP 658) to decouple resolution from downloading #11111

Merged: 18 commits, Sep 10, 2022

Changes from 8 commits

Commits (18):
d0813e8
create LinkHash and check out dist-info-metadata (PEP 658)
cosmicexplorer May 11, 2022
28b6134
add NEWS entry
cosmicexplorer May 11, 2022
f675e09
explain why we don't validate hexdigests now
cosmicexplorer May 12, 2022
6675ba3
fix importlib.metadata case
cosmicexplorer May 13, 2022
80e044a
make it work without --use-feature=fast-deps!
cosmicexplorer May 15, 2022
266c5cd
respond to review comments
cosmicexplorer Sep 5, 2022
8da1bdc
ensure PEP 691 json parsing also supports PEP 658 dist-info-metadata
cosmicexplorer Sep 5, 2022
23e5492
add test case for dist-info-metadata key from PEP 691!!
cosmicexplorer Sep 5, 2022
a685a98
remove unused code after html5lib was removed!!
cosmicexplorer Sep 5, 2022
96bd60e
rename WheelMetadata to InMemoryMetadata as per review comments
cosmicexplorer Sep 5, 2022
2aa1c2f
rename from_metadata_file{,_contents}() to avoid ambiguity pointed ou…
cosmicexplorer Sep 5, 2022
e5b2fcd
add tests for PEP 658 metadata with wheel files too
cosmicexplorer Sep 5, 2022
89f235d
add a note about further testing the json client
cosmicexplorer Sep 5, 2022
fc9bcdd
refactor and rename some variables in a confusing bit of terse code
cosmicexplorer Sep 7, 2022
036bfc0
update message for MetadataInconsistent to allow use with PEP 658 met…
cosmicexplorer Sep 7, 2022
8aeb108
raise MetadataInconsistent if the name from METADATA doesn't match th…
cosmicexplorer Sep 7, 2022
931501f
Use pathlib shorthand to write file
uranusjr Sep 7, 2022
f8d6dae
Switch to f-string
uranusjr Sep 7, 2022
1 change: 1 addition & 0 deletions news/11111.feature.rst
@@ -0,0 +1 @@
Use the ``data-dist-info-metadata`` attribute from :pep:`658` to resolve distribution metadata without downloading the dist yet.
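
For context: PEP 658 serves a wheel's METADATA file alongside the wheel itself. The Simple index anchor carries a data-dist-info-metadata attribute whose value is either "true" or a "<hashname>=<hexdigest>" string, and the metadata document lives at the distribution's URL with ".metadata" appended. A minimal sketch of that URL mapping, assuming the PEP's conventions (the helper name and example URL below are illustrative, not part of this change):

from typing import Optional


def metadata_url_for(file_url: str, dist_info_metadata: Optional[str]) -> Optional[str]:
    """Hypothetical helper: locate the PEP 658 metadata for a distribution URL.

    ``dist_info_metadata`` is the value of the data-dist-info-metadata
    attribute, e.g. "true" or "sha256=<hexdigest>", or None when the index
    does not advertise separately downloadable metadata.
    """
    if not dist_info_metadata:
        return None
    # PEP 658: the metadata file is served at the distribution URL + ".metadata".
    return file_url + ".metadata"


print(metadata_url_for("https://files.example.org/example-1.0-py3-none-any.whl", "true"))
# -> https://files.example.org/example-1.0-py3-none-any.whl.metadata
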
131 changes: 21 additions & 110 deletions src/pip/_internal/index/collector.py
@@ -9,10 +9,8 @@
import json
import logging
import os
import re
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from html.parser import HTMLParser
from optparse import Values
from typing import (
@@ -34,12 +32,12 @@
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.models.link import Link
from pip._internal.models.link import HTMLElement, Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.vcs import vcs

from .sources import CandidatesFromPage, LinkSource, build_source
@@ -51,7 +49,6 @@

logger = logging.getLogger(__name__)

HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]


@@ -191,92 +188,25 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
    return None


def _clean_url_path_part(part: str) -> str:
    """
    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
    """
    # We unquote prior to quoting to make sure nothing is double quoted.
    return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part: str) -> str:
    """
    Clean the first part of a URL path that corresponds to a local
    filesystem path (i.e. the first part after splitting on "@" characters).
    """
    # We unquote prior to quoting to make sure nothing is double quoted.
    # Also, on Windows the path part might contain a drive letter which
    # should not be quoted. On Linux where drive letters do not
    # exist, the colon should be quoted. We rely on urllib.request
    # to do the right thing here.
    return urllib.request.pathname2url(urllib.request.url2pathname(part))


# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
    """
    Clean the path portion of a URL.
    """
    if is_local_path:
        clean_func = _clean_file_url_path
    else:
        clean_func = _clean_url_path_part

    # Split on the reserved characters prior to cleaning so that
    # revision strings in VCS URLs are properly preserved.
    parts = _reserved_chars_re.split(path)

    cleaned_parts = []
    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
        cleaned_parts.append(clean_func(to_clean))
        # Normalize %xx escapes (e.g. %2f -> %2F)
        cleaned_parts.append(reserved.upper())

    return "".join(cleaned_parts)


def _clean_link(url: str) -> str:
    """
    Make sure a link is fully quoted.
    For example, if ' ' occurs in the URL, it will be replaced with "%20",
    and without double-quoting other characters.
    """
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`.
    result = urllib.parse.urlparse(url)
    # If the netloc is empty, then the URL refers to a local filesystem path.
    is_local_path = not result.netloc
    path = _clean_url_path(result.path, is_local_path=is_local_path)
    return urllib.parse.urlunparse(result._replace(path=path))


def _create_link_from_element(
    element_attribs: Dict[str, Optional[str]],
    page_url: str,
    base_url: str,
) -> Optional[Link]:
    """
    Convert an anchor element's attributes in a simple repository page to a Link.
    """
    href = element_attribs.get("href")
    if not href:
        return None

    url = _clean_link(urllib.parse.urljoin(base_url, href))
    pyrequire = element_attribs.get("data-requires-python")
    yanked_reason = element_attribs.get("data-yanked")

    link = Link(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
    )

    return link


def _determine_base_url(document: HTMLElement, page_url: str) -> str:
    """Determine the HTML document's base URL.

    This looks for a ``<base>`` tag in the HTML document. If present, its href
    attribute denotes the base URL of anchor tags in the document. If there is
    no such tag (or if it does not have a valid href attribute), the HTML
    file's URL is used as the base URL.

    :param document: An HTML document representation. The current
        implementation expects the result of ``html5lib.parse()``.
    :param page_url: The URL of the HTML document.

    TODO: Remove when `html5lib` is dropped.
    """
    for base in document.findall(".//base"):
        href = base.get("href")
        if href is not None:
            return href
    return page_url


class CacheablePageContent:
@@ -326,25 +256,10 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
        data = json.loads(page.content)
        for file in data.get("files", []):
            file_url = file.get("url")
            if file_url is None:
            link = Link.from_json(file, page.url)
            if link is None:
                continue

            # The Link.yanked_reason expects an empty string instead of a boolean.
            yanked_reason = file.get("yanked")
            if yanked_reason and not isinstance(yanked_reason, str):
                yanked_reason = ""
            # The Link.yanked_reason expects None instead of False
            elif not yanked_reason:
                yanked_reason = None

            yield Link(
                _clean_link(urllib.parse.urljoin(page.url, file_url)),
                comes_from=page.url,
                requires_python=file.get("requires-python"),
                yanked_reason=yanked_reason,
                hashes=file.get("hashes", {}),
            )
            yield link
        return

    parser = HTMLLinkParser(page.url)
@@ -354,11 +269,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
    url = page.url
    base_url = parser.base_url or url
    for anchor in parser.anchors:
        link = _create_link_from_element(
            anchor,
            page_url=url,
            base_url=base_url,
        )
        link = Link.from_element(anchor, page_url=url, base_url=base_url)
        if link is None:
            continue
        yield link
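
The JSON branch above consumes PEP 691 "files" entries, which may also carry the dist-info-metadata key that this PR teaches pip to honour. A rough sketch of the shape handed to Link.from_json, assuming PEP 691/658 semantics (the concrete values are invented for illustration; only the keys come from the PEPs):

# Hypothetical PEP 691 "files" entry, as produced by json.loads(page.content).
file_entry = {
    "filename": "example-1.0-py3-none-any.whl",
    "url": "https://files.example.org/example-1.0-py3-none-any.whl",
    "hashes": {"sha256": "0" * 64},
    "requires-python": ">=3.7",
    "yanked": False,
    # Either a boolean or a mapping of hash names to hex digests; a mapping
    # means a separate METADATA file is available and hashed.
    "dist-info-metadata": {"sha256": "f" * 64},
}

# parse_links() above passes each such entry to Link.from_json(file, page.url),
# which returns a Link (or None for unusable entries) instead of building the
# Link inline as before.
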
21 changes: 21 additions & 0 deletions src/pip/_internal/metadata/__init__.py
@@ -103,3 +103,24 @@ def get_wheel_distribution(wheel: Wheel, canonical_name: str) -> BaseDistribution:
    :param canonical_name: Normalized project name of the given wheel.
    """
    return select_backend().Distribution.from_wheel(wheel, canonical_name)


def get_metadata_distribution(
    metadata_path: str,
    filename: str,
    canonical_name: str,
) -> BaseDistribution:
    """Get the representation of the specified METADATA file.

    This returns a Distribution instance from the chosen backend based on the contents
    of the file at ``metadata_path``.

    :param metadata_path: Path to the METADATA file.
    :param filename: Filename for the dist this metadata represents.
    :param canonical_name: Normalized project name of the given dist.
    """
    return select_backend().Distribution.from_metadata_file(
        metadata_path,
        filename,
        canonical_name,
    )
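
A sketch of how a caller might use this new entry point once a PEP 658 metadata file has been fetched; the temporary-directory handling, file contents, and names below are illustrative assumptions, not code from this diff:

import pathlib
import tempfile

from pip._internal.metadata import get_metadata_distribution

# Assumed contents of a downloaded <wheel-url>.metadata response.
metadata_bytes = b"Metadata-Version: 2.1\nName: example\nVersion: 1.0\n"

with tempfile.TemporaryDirectory() as tmpdir:
    # Written under the conventional METADATA name so that backends which read
    # the file's parent directory (see the importlib backend below) can find it.
    metadata_path = pathlib.Path(tmpdir) / "METADATA"
    metadata_path.write_bytes(metadata_bytes)
    dist = get_metadata_distribution(
        str(metadata_path),
        "example-1.0-py3-none-any.whl",  # filename of the dist this metadata describes
        "example",  # canonical (normalized) project name
    )
    # Resolution can now inspect the dist without downloading the wheel.
    print(dist.canonical_name, dist.version)
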
15 changes: 15 additions & 0 deletions src/pip/_internal/metadata/base.py
@@ -113,6 +113,21 @@ def from_directory(cls, directory: str) -> "BaseDistribution":
        """
        raise NotImplementedError()

    @classmethod
    def from_metadata_file(
        cls,
        metadata_path: str,
        filename: str,
        project_name: str,
    ) -> "BaseDistribution":
        """Load the distribution from the contents of a METADATA file.

        :param metadata_path: The path to a METADATA file.
        :param filename: File name for the dist with this metadata.
        :param project_name: Name of the project this dist represents.
        """
        raise NotImplementedError()

    @classmethod
    def from_wheel(cls, wheel: "Wheel", name: str) -> "BaseDistribution":
        """Load the distribution from a given wheel.
11 changes: 11 additions & 0 deletions src/pip/_internal/metadata/importlib/_dists.py
@@ -109,6 +109,17 @@ def from_directory(cls, directory: str) -> BaseDistribution:
        dist = importlib.metadata.Distribution.at(info_location)
        return cls(dist, info_location, info_location.parent)

    @classmethod
    def from_metadata_file(
        cls,
        metadata_path: str,
        filename: str,
        project_name: str,
    ) -> BaseDistribution:
        metadata_location = pathlib.Path(metadata_path)
        dist = importlib.metadata.Distribution.at(metadata_location.parent)
        return cls(dist, metadata_location.parent, None)

    @classmethod
    def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
        try:
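
The implementation above relies on importlib.metadata.Distribution.at(), which treats the directory containing the METADATA file as if it were a .dist-info directory. A standalone sketch of that behaviour outside pip, with an invented METADATA payload:

import importlib.metadata
import pathlib
import tempfile

metadata_bytes = b"Metadata-Version: 2.1\nName: example\nVersion: 1.0\n"

with tempfile.TemporaryDirectory() as tmpdir:
    metadata_path = pathlib.Path(tmpdir) / "METADATA"
    metadata_path.write_bytes(metadata_bytes)
    # Distribution.at() resolves metadata files relative to the given path, so
    # a directory containing only METADATA behaves like a minimal .dist-info dir.
    dist = importlib.metadata.Distribution.at(metadata_path.parent)
    print(dist.metadata["Name"], dist.version)  # -> example 1.0
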
19 changes: 19 additions & 0 deletions src/pip/_internal/metadata/pkg_resources.py
@@ -92,6 +92,25 @@ def from_directory(cls, directory: str) -> BaseDistribution:
        dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata)
        return cls(dist)

    @classmethod
    def from_metadata_file(
        cls,
        metadata_path: str,
        filename: str,
        project_name: str,
    ) -> BaseDistribution:
        with open(metadata_path, "rb") as f:
            metadata = f.read()
        metadata_text = {
            "METADATA": metadata,
        }
        dist = pkg_resources.DistInfoDistribution(
            location=filename,
            metadata=WheelMetadata(metadata_text, filename),
            project_name=project_name,
        )
        return cls(dist)

    @classmethod
    def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution:
        try: