Skip to content

Commit

Permalink
only download metadata we need (instead of all metadata)
Browse files Browse the repository at this point in the history
  • Loading branch information
radoering committed Feb 4, 2024
1 parent d3d201f commit c286e84
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 92 deletions.
153 changes: 69 additions & 84 deletions src/poetry/repositories/http_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import functools
import hashlib

from collections import defaultdict
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING
Expand All @@ -16,7 +15,6 @@

from poetry.core.constraints.version import parse_constraint
from poetry.core.packages.dependency import Dependency
from poetry.core.packages.utils.link import Link
from poetry.core.utils.helpers import temporary_directory
from poetry.core.version.markers import parse_marker

Expand All @@ -38,6 +36,7 @@

if TYPE_CHECKING:
from packaging.utils import NormalizedName
from poetry.core.packages.utils.link import Link

from poetry.repositories.link_sources.base import LinkSource
from poetry.utils.authenticator import RepositoryCertificateConfig
Expand Down Expand Up @@ -110,10 +109,9 @@ def _cached_or_downloaded_file(
)
yield filepath

def _get_info_from_wheel(self, url: str) -> PackageInfo:
def _get_info_from_wheel(self, link: Link) -> PackageInfo:
from poetry.inspection.info import PackageInfo

link = Link(url)
netloc = link.netloc

# If "lazy-wheel" is enabled and the domain supports range requests
Expand Down Expand Up @@ -149,37 +147,73 @@ def _get_info_from_wheel(self, url: str) -> PackageInfo:
level="debug",
)
self._supports_range_requests[netloc] = True
return self._get_info_from_wheel(link.url)
return self._get_info_from_wheel(link)

def _get_info_from_sdist(self, url: str) -> PackageInfo:
def _get_info_from_sdist(self, link: Link) -> PackageInfo:
from poetry.inspection.info import PackageInfo

with self._cached_or_downloaded_file(Link(url)) as filepath:
with self._cached_or_downloaded_file(link) as filepath:
return PackageInfo.from_sdist(filepath)

@staticmethod
def _get_info_from_metadata(
url: str, metadata: dict[str, pkginfo.Distribution]
) -> PackageInfo | None:
if url in metadata:
dist = metadata[url]
return PackageInfo(
name=dist.name,
version=dist.version,
summary=dist.summary,
requires_dist=list(dist.requires_dist),
requires_python=dist.requires_python,
)
def _get_info_from_metadata(self, link: Link) -> PackageInfo | None:
    """Build package info from the standalone metadata file of ``link``.

    Only the metadata file advertised for this single link is requested,
    so callers can fetch metadata lazily for just the distribution they
    decided to inspect.

    Args:
        link: The distribution link whose metadata file should be read.

    Returns:
        A ``PackageInfo`` built from the parsed metadata, or ``None`` when
        the link advertises no metadata file, the request raises
        ``requests.HTTPError``, or the downloaded file does not match the
        hash advertised by the link. ``None`` lets callers fall back to
        inspecting the distribution file itself.
    """
    if not link.has_metadata:
        return None

    try:
        # has_metadata implies a URL; the assert narrows the Optional type.
        assert link.metadata_url is not None
        response = self.session.get(link.metadata_url)
    except requests.HTTPError:
        self._log(
            f"Failed to retrieve metadata at {link.metadata_url}",
            level="warning",
        )
        return None

    if link.metadata_hash_name is not None:
        # Hash the raw bytes exactly as served. Hashing
        # ``response.text.encode()`` would decode with a guessed charset and
        # re-encode as UTF-8, which changes the bytes (and so the digest)
        # for non-ASCII metadata served without a UTF-8 charset.
        metadata_hash = getattr(hashlib, link.metadata_hash_name)(
            response.content
        ).hexdigest()

        if metadata_hash != link.metadata_hash:
            self._log(
                f"Metadata file hash ({metadata_hash}) does not match"
                f" expected hash ({link.metadata_hash})."
                f" Metadata file for {link.filename} will be ignored.",
                level="warning",
            )
            return None

    # Parse the same bytes that were just verified.
    distribution = pkginfo.Distribution()
    distribution.parse(response.content)
    return PackageInfo(
        name=distribution.name,
        version=distribution.version,
        summary=distribution.summary,
        requires_dist=list(distribution.requires_dist),
        requires_python=distribution.requires_python,
    )

def _get_info_from_urls(
def _get_info_from_links(
self,
urls: dict[str, list[str]],
metadata: dict[str, pkginfo.Distribution] | None = None,
links: list[Link],
*,
ignore_yanked: bool = True,
) -> PackageInfo:
metadata = metadata or {}
# Sort links by distribution type
wheels: list[Link] = []
sdists: list[Link] = []
for link in links:
if link.yanked and ignore_yanked:
# drop yanked files unless the entire release is yanked
continue
if link.is_wheel:
wheels.append(link)
elif link.filename.endswith(
(".tar.gz", ".zip", ".bz2", ".xz", ".Z", ".tar")
):
sdists.append(link)

# Prefer to read data from wheels: this is faster and more reliable
if wheels := urls.get("bdist_wheel"):
if wheels:
# We ought just to be able to look at any of the available wheels to read
# metadata, they all should give the same answer.
#
Expand All @@ -194,8 +228,7 @@ def _get_info_from_urls(
universal_python3_wheel = None
platform_specific_wheels = []
for wheel in wheels:
link = Link(wheel)
m = wheel_file_re.match(link.filename)
m = wheel_file_re.match(wheel.filename)
if not m:
continue

Expand All @@ -216,17 +249,17 @@ def _get_info_from_urls(

if universal_wheel is not None:
return self._get_info_from_metadata(
universal_wheel, metadata
universal_wheel
) or self._get_info_from_wheel(universal_wheel)

info = None
if universal_python2_wheel and universal_python3_wheel:
info = self._get_info_from_metadata(
universal_python2_wheel, metadata
universal_python2_wheel
) or self._get_info_from_wheel(universal_python2_wheel)

py3_info = self._get_info_from_metadata(
universal_python3_wheel, metadata
universal_python3_wheel
) or self._get_info_from_wheel(universal_python3_wheel)

if info.requires_python or py3_info.requires_python:
Expand Down Expand Up @@ -278,71 +311,23 @@ def _get_info_from_urls(
# Prefer non platform specific wheels
if universal_python3_wheel:
return self._get_info_from_metadata(
universal_python3_wheel, metadata
universal_python3_wheel
) or self._get_info_from_wheel(universal_python3_wheel)

if universal_python2_wheel:
return self._get_info_from_metadata(
universal_python2_wheel, metadata
universal_python2_wheel
) or self._get_info_from_wheel(universal_python2_wheel)

if platform_specific_wheels:
first_wheel = platform_specific_wheels[0]
return self._get_info_from_metadata(
first_wheel, metadata
first_wheel
) or self._get_info_from_wheel(first_wheel)

return self._get_info_from_metadata(
urls["sdist"][0], metadata
) or self._get_info_from_sdist(urls["sdist"][0])

def _get_info_from_links(
self,
links: list[Link],
*,
ignore_yanked: bool = True,
) -> PackageInfo:
urls = defaultdict(list)
metadata: dict[str, pkginfo.Distribution] = {}
for link in links:
if link.yanked and ignore_yanked:
# drop yanked files unless the entire release is yanked
continue
if link.has_metadata:
try:
assert link.metadata_url is not None
response = self.session.get(link.metadata_url)
distribution = pkginfo.Distribution()
if link.metadata_hash_name is not None:
metadata_hash = getattr(hashlib, link.metadata_hash_name)(
response.text.encode()
).hexdigest()

if metadata_hash != link.metadata_hash:
self._log(
f"Metadata file hash ({metadata_hash}) does not match"
f" expected hash ({link.metadata_hash})."
f" Metadata file for {link.filename} will be ignored.",
level="warning",
)
continue

distribution.parse(response.content)
metadata[link.url] = distribution
except requests.HTTPError:
self._log(
f"Failed to retrieve metadata at {link.metadata_url}",
level="warning",
)

if link.is_wheel:
urls["bdist_wheel"].append(link.url)
elif link.filename.endswith(
(".tar.gz", ".zip", ".bz2", ".xz", ".Z", ".tar")
):
urls["sdist"].append(link.url)

return self._get_info_from_urls(urls, metadata)
return self._get_info_from_metadata(sdists[0]) or self._get_info_from_sdist(
sdists[0]
)

def _links_to_data(self, links: list[Link], data: PackageInfo) -> dict[str, Any]:
if not links:
Expand Down
17 changes: 9 additions & 8 deletions tests/repositories/test_http_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest

from packaging.metadata import parse_email
from poetry.core.packages.utils.link import Link

from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported
from poetry.repositories.http_repository import HTTPRepository
Expand Down Expand Up @@ -61,7 +62,7 @@ def test_get_info_from_wheel(
if lazy_wheel and supports_range_requests is not None:
repo._supports_range_requests[domain] = supports_range_requests

info = repo._get_info_from_wheel(url)
info = repo._get_info_from_wheel(Link(url))
assert info.name == "poetry-core"
assert info.version == "1.5.0"
assert info.requires_dist == [
Expand Down Expand Up @@ -110,45 +111,45 @@ def test_get_info_from_wheel_state_sequence(mocker: MockerFixture) -> None:

filename = "poetry_core-1.5.0-py3-none-any.whl"
domain = "foo.com"
url = f"https://{domain}/{filename}"
link = Link(f"https://{domain}/{filename}")
repo = MockRepository()

# 1. range request and download
mock_metadata_from_wheel_url.side_effect = HTTPRangeRequestUnsupported
repo._get_info_from_wheel(url)
repo._get_info_from_wheel(link)
assert mock_metadata_from_wheel_url.call_count == 1
assert mock_download.call_count == 1
assert mock_download.call_args[1]["raise_accepts_ranges"] is False

# 2. only download
repo._get_info_from_wheel(url)
repo._get_info_from_wheel(link)
assert mock_metadata_from_wheel_url.call_count == 1
assert mock_download.call_count == 2
assert mock_download.call_args[1]["raise_accepts_ranges"] is True

# 3. download and range request
mock_metadata_from_wheel_url.side_effect = None
mock_download.side_effect = HTTPRangeRequestSupported
repo._get_info_from_wheel(url)
repo._get_info_from_wheel(link)
assert mock_metadata_from_wheel_url.call_count == 2
assert mock_download.call_count == 3
assert mock_download.call_args[1]["raise_accepts_ranges"] is True

# 4. only range request
repo._get_info_from_wheel(url)
repo._get_info_from_wheel(link)
assert mock_metadata_from_wheel_url.call_count == 3
assert mock_download.call_count == 3

# 5. range request and download
mock_metadata_from_wheel_url.side_effect = HTTPRangeRequestUnsupported
mock_download.side_effect = None
repo._get_info_from_wheel(url)
repo._get_info_from_wheel(link)
assert mock_metadata_from_wheel_url.call_count == 4
assert mock_download.call_count == 4
assert mock_download.call_args[1]["raise_accepts_ranges"] is False

# 6. only range request
mock_metadata_from_wheel_url.side_effect = None
repo._get_info_from_wheel(url)
repo._get_info_from_wheel(link)
assert mock_metadata_from_wheel_url.call_count == 5
assert mock_download.call_count == 4

0 comments on commit c286e84

Please sign in to comment.