Clean up metadata backfill leftovers (pypi#17465)
* Clean up metadata backfill leftovers

* Linting

* Add back task as no-op (see the sketch below)

* Linting
di authored Jan 21, 2025
1 parent 211456a commit 7ebe87a
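
The "Add back task as no-op" bullet refers to a hunk that did not load below (presumably the eighth changed file, warehouse/packaging/tasks.py). A minimal sketch of what a no-op task looks like in warehouse's Celery conventions, with the decorator arguments assumed from the project's other tasks:

from warehouse import tasks


@tasks.task(ignore_result=True, acks_late=True)
def sync_bigquery_release_files(request):
    # Deliberately a no-op: keeping the task name registered means
    # workers that still receive scheduled or queued invocations of it
    # don't error while the removal rolls out.
    pass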
Showing 8 changed files with 53 additions and 283 deletions.
14 changes: 1 addition & 13 deletions tests/unit/packaging/test_init.py
@@ -11,7 +11,6 @@
# limitations under the License.

import pretend
import pytest

from celery.schedules import crontab

@@ -29,14 +28,12 @@
from warehouse.packaging.services import project_service_factory
from warehouse.packaging.tasks import (
check_file_cache_tasks_outstanding,
sync_bigquery_release_files,
update_description_html,
)
from warehouse.rate_limiting import IRateLimiter, RateLimit


-@pytest.mark.parametrize("with_bq_sync", [True, False])
-def test_includeme(monkeypatch, with_bq_sync):
+def test_includeme(monkeypatch):
storage_class = pretend.stub(
create_service=pretend.call_recorder(lambda *a, **kw: pretend.stub())
)
@@ -55,8 +52,6 @@ def key_factory(keystring, iterate_on=None, if_attr_exists=None):
"warehouse.packaging.project_create_user_ratelimit_string": "20 per hour",
"warehouse.packaging.project_create_ip_ratelimit_string": "40 per hour",
}
if with_bq_sync:
settings["warehouse.release_files_table"] = "fizzbuzz"

config = pretend.stub(
maybe_dotted=lambda dotted: storage_class,
@@ -169,13 +164,6 @@ def key_factory(keystring, iterate_on=None, if_attr_exists=None):
),
]

if with_bq_sync:
assert (
pretend.call(crontab(minute="*"), sync_bigquery_release_files)
in config.add_periodic_task.calls
)
pass

assert (
pretend.call(crontab(minute="*/1"), check_file_cache_tasks_outstanding)
in config.add_periodic_task.calls
154 changes: 1 addition & 153 deletions tests/unit/packaging/test_tasks.py
@@ -23,12 +23,11 @@
import warehouse.packaging.tasks

from warehouse.accounts.models import WebAuthn
-from warehouse.packaging.models import Description, MissingDatasetFile
+from warehouse.packaging.models import Description
from warehouse.packaging.tasks import (
check_file_cache_tasks_outstanding,
compute_2fa_metrics,
compute_packaging_metrics,
sync_bigquery_release_files,
sync_file_to_cache,
update_bigquery_release_files,
update_description_html,
@@ -37,9 +36,7 @@
from warehouse.utils import readme
from warehouse.utils.row_counter import compute_row_counts

from ...common.db.classifiers import ClassifierFactory
from ...common.db.packaging import (
DependencyFactory,
DescriptionFactory,
FileFactory,
ProjectFactory,
@@ -675,155 +672,6 @@ def test_var_is_none(self):
update_bigquery_release_files(task, request, dist_metadata)


class TestSyncBigQueryMetadata:
@pytest.mark.filterwarnings(
"ignore:This collection has been invalidated.:sqlalchemy.exc.SAWarning"
)
@pytest.mark.parametrize(
("release_files_table", "expected_get_table_calls"),
[
(
"example.pypi.distributions",
[pretend.call("example.pypi.distributions")],
),
(
"example.pypi.distributions some.other.table",
[
pretend.call("example.pypi.distributions"),
pretend.call("some.other.table"),
],
),
],
)
@pytest.mark.parametrize("bq_schema", [bq_schema])
def test_sync_rows(
self,
db_request,
monkeypatch,
release_files_table,
expected_get_table_calls,
bq_schema,
):
project = ProjectFactory.create()
description = DescriptionFactory.create()
release = ReleaseFactory.create(
project=project,
description=description,
license_expression="Apache-2.0",
license_files=["LICENSE.APACHE"],
)
release.platform = "test_platform"
release_file = FileFactory.create(
release=release,
filename=f"{project.name}-{release.version}.tar.gz",
md5_digest="feca4238a0b923820dcc509a6f75849b",
packagetype="sdist",
)
release_file2 = FileFactory.create(
release=release,
filename=f"{project.name}-{release.version}-py3-none-any.whl",
md5_digest="fecasd342fb952820dcc509a6f75849b",
packagetype="bdist_wheel",
)
release._classifiers.append(ClassifierFactory.create(classifier="foo :: bar"))
release._classifiers.append(ClassifierFactory.create(classifier="foo :: baz"))
release._classifiers.append(ClassifierFactory.create(classifier="fiz :: buz"))
DependencyFactory.create(release=release, kind=1)
DependencyFactory.create(release=release, kind=1)
DependencyFactory.create(release=release, kind=2)
DependencyFactory.create(release=release, kind=3)
DependencyFactory.create(release=release, kind=4)
missing = MissingDatasetFile(file_id=release_file.id)
db_request.db.add(missing)

query = pretend.stub(
result=pretend.call_recorder(
lambda *a, **kw: [{"md5_digest": release_file2.md5_digest}]
)
)
get_table = pretend.stub(schema=bq_schema)
bigquery = pretend.stub(
get_table=pretend.call_recorder(lambda t: get_table),
insert_rows_json=pretend.call_recorder(lambda *a, **kw: None),
query=pretend.call_recorder(lambda q: query),
)

@pretend.call_recorder
def find_service(name=None):
if name == "gcloud.bigquery":
return bigquery
raise LookupError

db_request.find_service = find_service
db_request.registry.settings = {
"warehouse.release_files_table": release_files_table,
"sync_release_file_backfill.batch_size": 10,
}

sync_bigquery_release_files(db_request)

assert db_request.find_service.calls == [pretend.call(name="gcloud.bigquery")]
assert bigquery.get_table.calls == expected_get_table_calls
assert bigquery.query.calls == []
assert bigquery.insert_rows_json.calls == [
pretend.call(
table=table,
json_rows=[
{
"metadata_version": None,
"name": project.name,
"version": release.version,
"summary": release.summary,
"description": description.raw,
"description_content_type": description.content_type or None,
"author": release.author or None,
"author_email": release.author_email or None,
"maintainer": release.maintainer or None,
"maintainer_email": release.maintainer_email or None,
"license": release.license or None,
"license_expression": release.license_expression or None,
"license_files": release.license_files or [],
"keywords": release.keywords or None,
"classifiers": release.classifiers or [],
"platform": [release.platform] or [],
"home_page": release.home_page or None,
"download_url": release.download_url or None,
"requires_python": release.requires_python or None,
"requires": release.requires or [],
"provides": release.provides or [],
"obsoletes": release.obsoletes or [],
"requires_dist": release.requires_dist or [],
"provides_dist": release.provides_dist or [],
"obsoletes_dist": release.obsoletes_dist or [],
"requires_external": release.requires_external or [],
"project_urls": release.project_urls or [],
"uploaded_via": release_file.uploaded_via,
"upload_time": release_file.upload_time.isoformat(),
"filename": release_file.filename,
"size": release_file.size,
"path": release_file.path,
"python_version": release_file.python_version,
"packagetype": release_file.packagetype,
"comment_text": release_file.comment_text or None,
"has_signature": False,
"md5_digest": release_file.md5_digest,
"sha256_digest": release_file.sha256_digest,
"blake2_256_digest": release_file.blake2_256_digest,
},
],
)
for table in release_files_table.split()
]

assert missing.processed

def test_var_is_none(self):
request = pretend.stub(
registry=pretend.stub(settings={"warehouse.release_files_table": None})
)
sync_bigquery_release_files(request)
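
For reference, a hedged reconstruction of the control flow the deleted test exercised, inferred from its stubs rather than copied from the deleted task body (the row-building step is elided):

def sync_bigquery_release_files(request):
    release_files_table = request.registry.settings.get(
        "warehouse.release_files_table"
    )
    if release_files_table is None:
        # Matches test_var_is_none: an unset setting makes the task a no-op.
        return

    bigquery = request.find_service(name="gcloud.bigquery")
    # The setting may name several space-separated tables, per the
    # parametrized expected_get_table_calls cases above.
    for table_name in release_files_table.split():
        table = bigquery.get_table(table_name)
        json_rows = []  # rows built against table.schema, one per file
        bigquery.insert_rows_json(table=table, json_rows=json_rows)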


def test_compute_2fa_metrics(db_request, monkeypatch):
# A user without 2FA enabled
UserFactory.create(totp_secret=None, webauthn=[])
1 change: 0 additions & 1 deletion tests/unit/test_config.py
@@ -338,7 +338,6 @@ def __init__(self):
"warehouse.organizations.max_undecided_organization_applications": 3,
"reconcile_file_storages.batch_size": 100,
"metadata_backfill.batch_size": 500,
"sync_release_file_backfill.batch_size": 10,
"gcloud.service_account_info": {},
"warehouse.forklift.legacy.MAX_FILESIZE_MIB": 100,
"warehouse.forklift.legacy.MAX_PROJECT_SIZE_GIB": 10,
7 changes: 0 additions & 7 deletions warehouse/config.py
@@ -439,13 +439,6 @@ def configure(settings=None):
coercer=int,
default=500,
)
maybe_set(
settings,
"sync_release_file_backfill.batch_size",
"SYNC_RELEASE_FILE_BACKFILL_BATCH_SIZE",
coercer=int,
default=10,
)
maybe_set_compound(settings, "billing", "backend", "BILLING_BACKEND")
maybe_set_compound(settings, "files", "backend", "FILES_BACKEND")
maybe_set_compound(settings, "archive_files", "backend", "ARCHIVE_FILES_BACKEND")
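
The maybe_set helper used above reads a setting from an environment variable, applying an optional coercer and default. A minimal sketch of that behavior, assumed from its call sites here rather than copied from warehouse/config.py:

import os


def maybe_set(settings, name, envvar, coercer=None, default=None):
    # Prefer the environment variable, coercing it if requested;
    # otherwise fall back to the default. setdefault preserves any
    # value that was already configured explicitly.
    if envvar in os.environ:
        value = os.environ[envvar]
        if coercer is not None:
            value = coercer(value)
        settings.setdefault(name, value)
    elif default is not None:
        settings.setdefault(name, default)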
48 changes: 48 additions & 0 deletions (new file: database migration, revision 2a2c32c47a8f)
@@ -0,0 +1,48 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Remove MissingDatasetFile
Revision ID: 2a2c32c47a8f
Revises: 77d52a945a5f
Create Date: 2025-01-21 15:49:29.129691
"""

import sqlalchemy as sa

from alembic import op

revision = "2a2c32c47a8f"
down_revision = "77d52a945a5f"


def upgrade():
op.drop_table("missing_dataset_files")


def downgrade():
op.create_table(
"missing_dataset_files",
sa.Column("file_id", sa.UUID(), autoincrement=False, nullable=False),
sa.Column("processed", sa.BOOLEAN(), autoincrement=False, nullable=True),
sa.Column(
"id",
sa.UUID(),
server_default=sa.text("gen_random_uuid()"),
autoincrement=False,
nullable=False,
),
sa.ForeignKeyConstraint(
["file_id"], ["release_files.id"], name="missing_dataset_files_file_id_fkey"
),
sa.PrimaryKeyConstraint("id", name="missing_dataset_files_pkey"),
)
4 changes: 0 additions & 4 deletions warehouse/packaging/__init__.py
@@ -30,7 +30,6 @@
check_file_cache_tasks_outstanding,
compute_2fa_metrics,
compute_packaging_metrics,
sync_bigquery_release_files,
update_description_html,
)
from warehouse.rate_limiting import IRateLimiter, RateLimit
@@ -197,6 +196,3 @@ def includeme(config):

# Add a periodic task to generate general metrics
config.add_periodic_task(crontab(minute="*/5"), compute_packaging_metrics)

if config.get_settings().get("warehouse.release_files_table"):
config.add_periodic_task(crontab(minute="*"), sync_bigquery_release_files)
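
Removing a periodic task's registration while invocations of it may still be queued or scheduled would leave workers logging Celery's "unregistered task" error, which is plausibly why the commit keeps sync_bigquery_release_files around as a no-op (see the sketch near the top) instead of deleting it outright.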
8 changes: 0 additions & 8 deletions warehouse/packaging/models.py
@@ -1133,11 +1133,3 @@ class AlternateRepository(db.Model):
name: Mapped[str]
url: Mapped[str]
description: Mapped[str]


class MissingDatasetFile(db.Model):
__tablename__ = "missing_dataset_files"

file_id: Mapped[UUID] = mapped_column(ForeignKey("release_files.id"))
file: Mapped[File] = orm.relationship()
processed: Mapped[bool] = mapped_column(default=None, nullable=True)
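
The deleted model's nullable processed flag supported a batched backfill, sketched below under the assumption, drawn from the deleted test and the removed sync_release_file_backfill.batch_size setting, that unprocessed rows were fetched in batches and flagged once synced; the function name is hypothetical:

def process_missing_dataset_files(db, batch_size=10):
    # Hypothetical sketch; MissingDatasetFile (above) is removed by this commit.
    batch = (
        db.query(MissingDatasetFile)
        .filter(MissingDatasetFile.processed.is_(None))
        .limit(batch_size)
        .all()
    )
    for row in batch:
        # ... sync row.file's metadata to BigQuery here ...
        row.processed = True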