From 61877fa3d5670e4fd71b6d9c6650ae115a4cc217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Fri, 9 Sep 2022 07:18:18 +0200 Subject: [PATCH] fix: performance regression when parsing links from legacy repositories (#6442) Resolves: #6436 Measurements of `poetry lock` with warm cache with example pyproject.toml from #6436: |test case|time in s|peak memory usage in MB| |---|---|---| |legacy repository (before)|422|113| |legacy repository (after)|3|118| |pypi repository|1|92| `backports.cached-property` is used in order to support cached_property on Python 3.7. (cherry picked from commit c4b2253793cd6b41a99e25e479e40b776cca0a0e) Co-authored-by: Jarrod Moore Co-authored-by: Bjorn Neergaard --- poetry.lock | 14 ++++++++- pyproject.toml | 1 + src/poetry/repositories/link_sources/base.py | 33 +++++++++----------- src/poetry/repositories/link_sources/html.py | 16 +++++++--- src/poetry/utils/_compat.py | 16 +++++++++- tests/repositories/link_sources/test_base.py | 27 ++++++++++------ tests/repositories/link_sources/test_html.py | 3 +- tests/repositories/test_legacy_repository.py | 17 ++-------- 8 files changed, 77 insertions(+), 50 deletions(-) diff --git a/poetry.lock b/poetry.lock index b9c414bca8b..e771520ea46 100644 --- a/poetry.lock +++ b/poetry.lock @@ -12,6 +12,14 @@ docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] +[[package]] +name = "backports.cached-property" +version = "1.0.2" +description = "cached_property() - computed once per instance, cached as attribute" +category = "main" +optional = false +python-versions = ">=3.6.0" + [[package]] name = "CacheControl" version = "0.12.11" @@ -969,13 +977,17 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "853bf1ff15c31c912ce1f017cab05956ac09e0fe1a8262a40d77363192c5f934" +content-hash = "583302ecdf45ce586b5c640d3942765615bb6e0c4749be43230b1cf02fd7d16e" [metadata.files] attrs = [ {file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"}, {file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"}, ] +"backports.cached-property" = [ + {file = "backports.cached-property-1.0.2.tar.gz", hash = "sha256:9306f9eed6ec55fd156ace6bc1094e2c86fae5fb2bf07b6a9c00745c656e75dd"}, + {file = "backports.cached_property-1.0.2-py3-none-any.whl", hash = "sha256:baeb28e1cd619a3c9ab8941431fe34e8490861fb998c6c4590693d50171db0cc"}, +] CacheControl = [ {file = "CacheControl-0.12.11-py2.py3-none-any.whl", hash = "sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b"}, {file = "CacheControl-0.12.11.tar.gz", hash = "sha256:a5b9fcc986b184db101aa280b42ecdcdfc524892596f606858e0b7a8b4d9e144"}, diff --git a/pyproject.toml b/pyproject.toml index b5d44776e4a..a4260f8c017 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ python = "^3.7" poetry-core = "1.1.0" poetry-plugin-export = "^1.0.6" +"backports.cached-property" = { version = "^1.0.2", python = "<3.8" } cachecontrol = { version = "^0.12.9", extras = ["filecache"] } cachy = "^0.3.0" cleo = "^1.0.0a5" diff --git a/src/poetry/repositories/link_sources/base.py b/src/poetry/repositories/link_sources/base.py index 6e07de3cfdd..fb6dc67b6b3 100644 --- a/src/poetry/repositories/link_sources/base.py +++ b/src/poetry/repositories/link_sources/base.py @@ -3,13 +3,15 @@ import logging import re -from abc import abstractmethod from typing import TYPE_CHECKING +from typing import DefaultDict +from typing import List from packaging.utils import canonicalize_name from poetry.core.packages.package import Package from poetry.core.semver.version import Version +from poetry.utils._compat import cached_property from poetry.utils.patterns import sdist_file_re from poetry.utils.patterns import wheel_file_re @@ -20,6 +22,8 @@ from packaging.utils import NormalizedName from poetry.core.packages.utils.link import Link + LinkCache = DefaultDict[NormalizedName, DefaultDict[Version, List[Link]]] + logger = logging.getLogger(__name__) @@ -44,16 +48,8 @@ def __init__(self, url: str) -> None: def url(self) -> str: return self._url - def versions(self, name: str) -> Iterator[Version]: - name = canonicalize_name(name) - seen: set[Version] = set() - - for link in self.links: - pkg = self.link_package_data(link) - - if pkg and pkg.name == name and pkg.version not in seen: - seen.add(pkg.version) - yield pkg.version + def versions(self, name: NormalizedName) -> Iterator[Version]: + yield from self._link_cache[name] @property def packages(self) -> Iterator[Package]: @@ -64,9 +60,10 @@ def packages(self) -> Iterator[Package]: yield pkg @property - @abstractmethod def links(self) -> Iterator[Link]: - raise NotImplementedError() + for links_per_version in self._link_cache.values(): + for links in links_per_version.values(): + yield from links @classmethod def link_package_data(cls, link: Link) -> Package | None: @@ -102,11 +99,7 @@ def link_package_data(cls, link: Link) -> Package | None: def links_for_version( self, name: NormalizedName, version: Version ) -> Iterator[Link]: - for link in self.links: - pkg = self.link_package_data(link) - - if pkg and pkg.name == name and pkg.version == version: - yield link + yield from self._link_cache[name][version] def clean_link(self, url: str) -> str: """Makes sure a link is fully encoded. That is, if a ' ' shows up in @@ -127,3 +120,7 @@ def yanked(self, name: NormalizedName, version: Version) -> str | bool: if reasons: return "\n".join(sorted(reasons)) return True + + @cached_property + def _link_cache(self) -> LinkCache: + raise NotImplementedError() diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index d72a8e06ded..5ea4eff78d0 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -3,16 +3,19 @@ import urllib.parse import warnings +from collections import defaultdict from html import unescape from typing import TYPE_CHECKING from poetry.core.packages.utils.link import Link from poetry.repositories.link_sources.base import LinkSource +from poetry.utils._compat import cached_property if TYPE_CHECKING: - from collections.abc import Iterator + from poetry.repositories.link_sources.base import LinkCache + with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -25,8 +28,9 @@ def __init__(self, url: str, content: str) -> None: self._parsed = html5lib.parse(content, namespaceHTMLElements=False) - @property - def links(self) -> Iterator[Link]: + @cached_property + def _link_cache(self) -> LinkCache: + links: LinkCache = defaultdict(lambda: defaultdict(list)) for anchor in self._parsed.findall(".//a"): if anchor.get("href"): href = anchor.get("href") @@ -44,7 +48,11 @@ def links(self) -> Iterator[Link]: if link.ext not in self.SUPPORTED_FORMATS: continue - yield link + pkg = self.link_package_data(link) + if pkg: + links[pkg.name][pkg.version].append(link) + + return links class SimpleRepositoryPage(HTMLPage): diff --git a/src/poetry/utils/_compat.py b/src/poetry/utils/_compat.py index a1c81582366..9ee8875f061 100644 --- a/src/poetry/utils/_compat.py +++ b/src/poetry/utils/_compat.py @@ -13,6 +13,12 @@ else: from importlib import metadata +if sys.version_info < (3, 8): + # compatibility for python <3.8 + from backports.cached_property import cached_property +else: + from functools import cached_property + WINDOWS = sys.platform == "win32" @@ -53,4 +59,12 @@ def list_to_shell_command(cmd: list[str]) -> str: ) -__all__ = ["WINDOWS", "decode", "encode", "list_to_shell_command", "metadata", "to_str"] +__all__ = [ + "WINDOWS", + "cached_property", + "decode", + "encode", + "list_to_shell_command", + "metadata", + "to_str", +] diff --git a/tests/repositories/link_sources/test_base.py b/tests/repositories/link_sources/test_base.py index c949f90a5eb..ef400a716d7 100644 --- a/tests/repositories/link_sources/test_base.py +++ b/tests/repositories/link_sources/test_base.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import defaultdict from typing import TYPE_CHECKING from unittest.mock import PropertyMock @@ -24,16 +25,22 @@ def link_source(mocker: MockerFixture) -> LinkSource: url = "https://example.org" link_source = LinkSource(url) mocker.patch( - f"{LinkSource.__module__}.{LinkSource.__qualname__}.links", + f"{LinkSource.__module__}.{LinkSource.__qualname__}._link_cache", new_callable=PropertyMock, - return_value=iter( - [ - Link(f"{url}/demo-0.1.0.tar.gz"), - Link(f"{url}/demo-0.1.0_invalid.tar.gz"), - Link(f"{url}/invalid.tar.gz"), - Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"), - Link(f"{url}/demo-0.1.1.tar.gz"), - ] + return_value=defaultdict( + lambda: defaultdict(list), + { + canonicalize_name("demo"): defaultdict( + list, + { + Version.parse("0.1.0"): [ + Link(f"{url}/demo-0.1.0.tar.gz"), + Link(f"{url}/demo-0.1.0-py2.py3-none-any.whl"), + ], + Version.parse("0.1.1"): [Link(f"{url}/demo-0.1.1.tar.gz")], + }, + ), + }, ), ) return link_source @@ -63,7 +70,7 @@ def test_link_package_data(filename: str, expected: Package | None) -> None: ], ) def test_versions(name: str, expected: set[Version], link_source: LinkSource) -> None: - assert set(link_source.versions(name)) == expected + assert set(link_source.versions(canonicalize_name(name))) == expected def test_packages(link_source: LinkSource) -> None: diff --git a/tests/repositories/link_sources/test_html.py b/tests/repositories/link_sources/test_html.py index 35b9ebfaa88..780262d9ada 100644 --- a/tests/repositories/link_sources/test_html.py +++ b/tests/repositories/link_sources/test_html.py @@ -2,6 +2,7 @@ import pytest +from packaging.utils import canonicalize_name from poetry.core.packages.utils.link import Link from poetry.core.semver.version import Version @@ -90,4 +91,4 @@ def test_yanked(yanked_attrs: tuple[str, str], expected: bool | str) -> None: content = DEMO_TEMPLATE.format(anchors) page = HTMLPage("https://example.org", content) - assert page.yanked("demo", Version.parse("0.1")) == expected + assert page.yanked(canonicalize_name("demo"), Version.parse("0.1")) == expected diff --git a/tests/repositories/test_legacy_repository.py b/tests/repositories/test_legacy_repository.py index 061471409d9..042e8064a52 100644 --- a/tests/repositories/test_legacy_repository.py +++ b/tests/repositories/test_legacy_repository.py @@ -102,25 +102,12 @@ def test_page_invalid_version_link() -> None: assert page is not None links = list(page.links) - assert len(links) == 2 + assert len(links) == 1 - versions = list(page.versions("poetry")) + versions = list(page.versions(canonicalize_name("poetry"))) assert len(versions) == 1 assert versions[0].to_string() == "0.1.0" - invalid_link = None - - for link in links: - if link.filename.startswith("poetry-21"): - invalid_link = link - break - - links_010 = list(page.links_for_version(canonicalize_name("poetry"), versions[0])) - assert invalid_link not in links_010 - - assert invalid_link - assert not page.link_package_data(invalid_link) - packages = list(page.packages) assert len(packages) == 1 assert packages[0].name == "poetry"