From 05a74197333aca9acadd79393f1365ebc0d7c5d6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 19:02:08 -0800 Subject: [PATCH 01/59] migration fixes: - use group instead of distinct in unique page stats, page filename migration - consolidate collection recompute stats into single function - bump to 1.14.0-beta.2 --- backend/btrixcloud/colls.py | 22 +++++-------------- backend/btrixcloud/main_bg.py | 3 +-- .../migrations/migration_0037_upload_pages.py | 3 +-- .../migration_0042_page_filenames.py | 21 +++++++++++++++--- backend/btrixcloud/pages.py | 13 ++++++++--- 5 files changed, 35 insertions(+), 27 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 2103c61e40..6d96dce8f4 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -631,14 +631,11 @@ async def download_collection(self, coll_id: UUID, org: Organization): resp, headers=headers, media_type="application/wacz+zip" ) - async def recalculate_org_collection_counts_tags(self, org: Organization): - """Recalculate counts and tags for collections in org""" - collections, _ = await self.list_collections( - org, - page_size=100_000, - ) - for coll in collections: - await self.update_collection_counts_and_tags(coll.id) + async def recalculate_org_collection_stats(self, org: Organization): + """recalculate counts, tags and dates for all collections in an org""" + async for coll in self.collections.find({"oid": org.id}, projection={"_id": 1}): + await self.update_collection_counts_and_tags(coll.get("_id")) + await self.update_collection_dates(coll.get("_id")) async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" @@ -693,15 +690,6 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): }, ) - async def recalculate_org_collection_dates(self, org: Organization): - """Recalculate earliest and latest dates for collections in org""" - collections, _ = await self.list_collections( - org, - page_size=100_000, - ) - for coll in collections: - await self.update_collection_dates(coll.id) - async def update_collection_dates(self, coll_id: UUID): """Update collection earliest and latest dates from page timestamps""" # pylint: disable=too-many-locals diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 024f61b45d..4a57f96f06 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -61,8 +61,7 @@ async def main(): if job_type == BgJobType.READD_ORG_PAGES: try: await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) - await coll_ops.recalculate_org_collection_dates(org) - await coll_ops.recalculate_org_collection_counts_tags(org) + await coll_ops.recalculate_org_collection_stats(org) return 0 # pylint: disable=broad-exception-caught except Exception: diff --git a/backend/btrixcloud/migrations/migration_0037_upload_pages.py b/backend/btrixcloud/migrations/migration_0037_upload_pages.py index 1c7e4a80a3..8f896c5c92 100644 --- a/backend/btrixcloud/migrations/migration_0037_upload_pages.py +++ b/backend/btrixcloud/migrations/migration_0037_upload_pages.py @@ -59,8 +59,7 @@ async def migrate_up(self): async for org_dict in mdb_orgs.find({}): org = Organization.from_dict(org_dict) try: - await self.coll_ops.recalculate_org_collection_dates(org) - await self.coll_ops.recalculate_org_collection_counts_tags(org) + await self.coll_ops.recalculate_org_collection_stats(org) # pylint: disable=broad-exception-caught except Exception as err: 
print( diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py index 5410d4b593..1d99e5676b 100644 --- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py +++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py @@ -32,12 +32,27 @@ async def migrate_up(self): ) return - crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None}) + # crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None}) + # crawl_count = len(crawl_ids_to_update) + aggregate = [ + {"$match": {"filename": None}}, + {"$group": {"_id": "$crawl_id"}}, + ] + total_agg = aggregate.copy() + total_agg.append({"$count": "count"}) + + res = pages_mdb.aggregate(total_agg) + res = await res.to_list(1) + crawl_count = res[0].get("count") + + print(f"Total crawls to update pages for: {crawl_count}") + + cursor = pages_mdb.aggregate(aggregate) - crawl_count = len(crawl_ids_to_update) current_index = 1 - for crawl_id in crawl_ids_to_update: + async for crawl in cursor: + crawl_id = crawl.get("_id") print(f"Migrating archived item {current_index}/{crawl_count}", flush=True) try: await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index b1ad1ee6d1..56e0cf0833 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -954,10 +954,17 @@ def get_crawl_type_from_pages_route(self, request: Request): async def get_unique_page_count(self, crawl_ids: List[str]) -> int: """Get count of unique page URLs across list of archived items""" - unique_pages = await self.pages.distinct( - "url", {"crawl_id": {"$in": crawl_ids}} + # unique_pages = await self.pages.distinct( + # "url", {"crawl_id": {"$in": crawl_ids}} + # ) + count = 0 + cursor = self.pages.aggregate( + [{"$match": {"crawl_id": {"$in": crawl_ids}}}, {"$group": {"_id": "$url"}}] ) - return len(unique_pages) or 0 + async for res in cursor: + count += 1 + + return count async def set_archived_item_page_counts(self, crawl_id: str): """Store archived item page and unique page counts in crawl document""" From 0ae4291580b1a3356f8937df1020b2a559462271 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 19:03:21 -0800 Subject: [PATCH 02/59] bump version to 1.14.0-beta.2 --- backend/btrixcloud/version.py | 2 +- chart/Chart.yaml | 2 +- chart/values.yaml | 4 ++-- frontend/package.json | 2 +- version.txt | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index fd88c44e5a..12b96af282 100644 --- a/backend/btrixcloud/version.py +++ b/backend/btrixcloud/version.py @@ -1,3 +1,3 @@ """current version""" -__version__ = "1.14.0-beta.1" +__version__ = "1.14.0-beta.2" diff --git a/chart/Chart.yaml b/chart/Chart.yaml index c6e88b1225..70dbbae1c6 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix and Chart Version -version: v1.14.0-beta.1 +version: v1.14.0-beta.2 dependencies: - name: btrix-admin-logging diff --git a/chart/values.yaml b/chart/values.yaml index 69a710f9c6..d4ec5f54e7 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -103,7 +103,7 @@ replica_deletion_delay_days: 0 # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.1" +backend_image: 
"docker.io/webrecorder/browsertrix-backend:1.14.0-beta.2" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" @@ -158,7 +158,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.1" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.2" frontend_pull_policy: "Always" frontend_cpu: "10m" diff --git a/frontend/package.json b/frontend/package.json index cac6f29c3c..16f6887821 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-frontend", - "version": "1.14.0-beta.1", + "version": "1.14.0-beta.2", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { diff --git a/version.txt b/version.txt index fd7dfd5d2c..36c001f1fd 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.14.0-beta.1 +1.14.0-beta.2 From feb4f095a3b8461b752055180831a92c3b179f17 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 19:30:15 -0800 Subject: [PATCH 03/59] bump to 1.14.0-beta.3 add timeout for backend gunicorn worker update --- backend/btrixcloud/pages.py | 2 +- backend/btrixcloud/version.py | 2 +- chart/Chart.yaml | 2 +- chart/templates/backend.yaml | 2 ++ chart/values.yaml | 7 +++++-- frontend/package.json | 2 +- version.txt | 2 +- 7 files changed, 12 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 56e0cf0833..7c5a40072f 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -961,7 +961,7 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int: cursor = self.pages.aggregate( [{"$match": {"crawl_id": {"$in": crawl_ids}}}, {"$group": {"_id": "$url"}}] ) - async for res in cursor: + async for _res in cursor: count += 1 return count diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index 12b96af282..0fd1c29877 100644 --- a/backend/btrixcloud/version.py +++ b/backend/btrixcloud/version.py @@ -1,3 +1,3 @@ """current version""" -__version__ = "1.14.0-beta.2" +__version__ = "1.14.0-beta.3" diff --git a/chart/Chart.yaml b/chart/Chart.yaml index 70dbbae1c6..f4641e6416 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix and Chart Version -version: v1.14.0-beta.2 +version: v1.14.0-beta.3 dependencies: - name: btrix-admin-logging diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index 3ce6dec50b..9f20eabb7e 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -71,6 +71,8 @@ spec: - "{{ .Values.backend_workers | default 1 }}" - --worker-class - uvicorn.workers.UvicornWorker + - --timeout + - {{ .Values.backend_worker_timeout }} envFrom: - configMapRef: diff --git a/chart/values.yaml b/chart/values.yaml index d4ec5f54e7..abd86aef27 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -103,7 +103,7 @@ replica_deletion_delay_days: 0 # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.2" +backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.3" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" @@ -111,6 +111,9 @@ backend_password_secret: "PASSWORD!" 
# number of workers per pod backend_workers: 1 +# for gunicorn --timeout +backend_worker_timeout: 60 + backend_cpu: "100m" backend_memory: "350Mi" @@ -158,7 +161,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.2" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.3" frontend_pull_policy: "Always" frontend_cpu: "10m" diff --git a/frontend/package.json b/frontend/package.json index 16f6887821..7dfbf529b3 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-frontend", - "version": "1.14.0-beta.2", + "version": "1.14.0-beta.3", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { diff --git a/version.txt b/version.txt index 36c001f1fd..841e7a8e4c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.14.0-beta.2 +1.14.0-beta.3 From 16dd7a4c35126da254009419a55c12222cb34ed0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 19:49:33 -0800 Subject: [PATCH 04/59] typo fix --- chart/templates/backend.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index 9f20eabb7e..8d87d1192e 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -72,7 +72,7 @@ spec: - --worker-class - uvicorn.workers.UvicornWorker - --timeout - - {{ .Values.backend_worker_timeout }} + - "{{ .Values.backend_worker_timeout }}" envFrom: - configMapRef: From 9ab4dacfb6e6d02b0bd9abf03df5d24dcd7f6c39 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 21:36:10 -0800 Subject: [PATCH 05/59] empty filenames migration: use existing bulk re_add method for faster migration --- backend/btrixcloud/migrations/migration_0042_page_filenames.py | 3 ++- backend/btrixcloud/pages.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py index 1d99e5676b..369510e991 100644 --- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py +++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py @@ -55,7 +55,8 @@ async def migrate_up(self): crawl_id = crawl.get("_id") print(f"Migrating archived item {current_index}/{crawl_count}", flush=True) try: - await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id) + await self.page_ops.re_add_crawl_pages(crawl_id) + # await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id) # pylint: disable=broad-exception-caught except Exception as err: print( diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 7c5a40072f..f8ddb9ae1f 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -784,7 +784,7 @@ async def list_collection_pages( return [PageOut.from_dict(data) for data in items], total - async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): + async def re_add_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): """Delete existing pages for crawl and re-add from WACZs.""" try: From 20b6a7350518f5e0c900826c8d4386b276590323 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 21:36:42 -0800 Subject: [PATCH 06/59] bump to 1.14.0-beta.4 --- backend/btrixcloud/version.py | 2 +- chart/Chart.yaml | 2 +- chart/values.yaml | 4 ++-- frontend/package.json | 2 +- version.txt | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index 0fd1c29877..de6e390544 100644 --- a/backend/btrixcloud/version.py +++ b/backend/btrixcloud/version.py @@ -1,3 +1,3 @@ """current version""" -__version__ = "1.14.0-beta.3" +__version__ = "1.14.0-beta.4" diff --git a/chart/Chart.yaml b/chart/Chart.yaml index f4641e6416..d074ec0492 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix and Chart Version -version: v1.14.0-beta.3 +version: v1.14.0-beta.4 dependencies: - name: btrix-admin-logging diff --git a/chart/values.yaml b/chart/values.yaml index abd86aef27..c3aefbb85c 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -103,7 +103,7 @@ replica_deletion_delay_days: 0 # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.3" +backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.4" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" @@ -161,7 +161,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.3" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.4" frontend_pull_policy: "Always" frontend_cpu: "10m" diff --git a/frontend/package.json b/frontend/package.json index 7dfbf529b3..67fb1506de 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-frontend", - "version": "1.14.0-beta.3", + "version": "1.14.0-beta.4", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { diff --git a/version.txt b/version.txt index 841e7a8e4c..377350eda3 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.14.0-beta.3 +1.14.0-beta.4 From b57e4fb53e23d7d1ebcd92c1688ea191ce139895 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 14:33:15 -0500 Subject: [PATCH 07/59] Add version to BaseCrawl, set to 2 for new crawls/uploads Also adds isMigrating, which we'll use in migration 0042 --- backend/btrixcloud/crawls.py | 1 + backend/btrixcloud/models.py | 7 +++++++ backend/btrixcloud/uploads.py | 1 + 3 files changed, 9 insertions(+) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 87ea7e3ae1..9c3f18695b 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -382,6 +382,7 @@ async def add_new_crawl( crawlerChannel=crawlconfig.crawlerChannel, proxyId=crawlconfig.proxyId, image=image, + version=2, ) try: diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 058b7927fc..282f5a65ce 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -805,6 +805,9 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): filePageCount: Optional[int] = 0 errorPageCount: Optional[int] = 0 + isMigrating: Optional[bool] = None + version: Optional[int] = None + # ============================================================================ class CollIdName(BaseModel): @@ -882,6 +885,10 @@ class CrawlOut(BaseMongoModel): filePageCount: Optional[int] = 0 errorPageCount: Optional[int] = 0 + # Set to older version by default, crawls with optimized + # pages will have this explicitly set to 2 + version: Optional[int] = 1 + # ============================================================================ class UpdateCrawl(BaseModel): diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 
3324efdd89..80771f3c17 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -178,6 +178,7 @@ async def _create_upload( fileSize=file_size, started=now, finished=now, + version=2, ) # result = await self.crawls.insert_one(uploaded.to_dict()) From e390ba311e78ed3655447234bf4c900672dd4bf0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 15:01:04 -0500 Subject: [PATCH 08/59] Use optimize pages background job in 0042 - Adds new optimize pages background job that updates crawl pages and sets version on updated crawls to 2 - New job uses a new migration_job.yaml template with parallelism set to 3 - Update BackgroundJob model and some ops methods to allow for creating and retrying a background job with no oid - Add new API endpoint to retry one specific background job that isn't tied to a specific org (superuser-only) --- backend/btrixcloud/background_jobs.py | 72 ++++++++++++++++++- backend/btrixcloud/crawlmanager.py | 41 ++++++++--- backend/btrixcloud/main_bg.py | 12 +++- .../migration_0042_page_filenames.py | 54 ++++---------- backend/btrixcloud/models.py | 12 +++- backend/btrixcloud/pages.py | 44 ++++++++++++ chart/app-templates/migration_job.yaml | 59 +++++++++++++++ 7 files changed, 242 insertions(+), 52 deletions(-) create mode 100644 chart/app-templates/migration_job.yaml diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 507949604e..08bd7a8b0b 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -23,6 +23,7 @@ DeleteOrgJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, + OptimizePagesJob, PaginatedBackgroundJobResponse, AnyJob, StorageRef, @@ -424,6 +425,49 @@ async def create_re_add_org_pages_job( print(f"warning: re-add org pages job could not be started: {exc}") return None + async def create_optimize_crawl_pages_job( + self, + crawl_type: Optional[str] = None, + existing_job_id: Optional[str] = None, + ): + """Create job to optimize crawl pages""" + + try: + job_id = await self.crawl_manager.run_optimize_pages_job( + crawl_type=crawl_type, + existing_job_id=existing_job_id, + ) + if existing_job_id: + optimize_pages_job = await self.get_background_job(existing_job_id) + previous_attempt = { + "started": optimize_pages_job.started, + "finished": optimize_pages_job.finished, + } + if optimize_pages_job.previousAttempts: + optimize_pages_job.previousAttempts.append(previous_attempt) + else: + optimize_pages_job.previousAttempts = [previous_attempt] + optimize_pages_job.started = dt_now() + optimize_pages_job.finished = None + optimize_pages_job.success = None + else: + optimize_pages_job = OptimizePagesJob( + id=job_id, + crawl_type=crawl_type, + started=dt_now(), + ) + + await self.jobs.find_one_and_update( + {"_id": job_id}, {"$set": optimize_pages_job.to_dict()}, upsert=True + ) + + return job_id + # pylint: disable=broad-exception-caught + except Exception as exc: + # pylint: disable=raise-missing-from + print(f"warning: optimize pages job could not be started: {exc}") + return None + async def job_finished( self, job_id: str, @@ -478,6 +522,7 @@ async def get_background_job( DeleteOrgJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, + OptimizePagesJob, ]: """Get background job""" query: dict[str, object] = {"_id": job_id} @@ -504,6 +549,9 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): if data["type"] == BgJobType.READD_ORG_PAGES: return ReAddOrgPagesJob.from_dict(data) + if data["type"] == BgJobType.OPTIMIZE_PAGES: + return 
OptimizePagesJob.from_dict(data) + return DeleteOrgJob.from_dict(data) async def list_background_jobs( @@ -590,7 +638,7 @@ async def get_replica_job_file( raise HTTPException(status_code=404, detail="file_not_found") async def retry_background_job( - self, job_id: str, org: Organization + self, job_id: str, org: Optional[Organization] = None ) -> Dict[str, Union[bool, Optional[str]]]: """Retry background job""" job = await self.get_background_job(job_id, org.id) @@ -652,6 +700,12 @@ async def retry_background_job( existing_job_id=job_id, ) + if job.type == BgJobType.OPTIMIZE_PAGES: + await self.create_optimize_crawl_pages_job( + job.crawl_type, + existing_job_id=job_id, + ) + return {"success": True} async def retry_failed_background_jobs( @@ -679,7 +733,9 @@ async def retry_all_failed_background_jobs( """ bg_tasks = set() async for job in self.jobs.find({"success": False}): - org = await self.org_ops.get_org_by_id(job["oid"]) + org = None + if job["oid"]: + org = await self.org_ops.get_org_by_id(job["oid"]) task = asyncio.create_task(self.retry_background_job(job["_id"], org)) bg_tasks.add(task) task.add_done_callback(bg_tasks.discard) @@ -722,7 +778,17 @@ async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep return await ops.get_background_job(job_id) - @router.post("/{job_id}/retry", response_model=SuccessResponse) + @app.post( + "/orgs/all/jobs/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"] + ) + async def retry_background_job_no_org(job_id: str, user: User = Depends(user_dep)): + """Retry backgound migration job""" + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + return await ops.retry_background_job(job_id) + + @router.post("/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"]) async def retry_background_job( job_id: str, org: Organization = Depends(org_crawl_dep), diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 2b5f194a4a..d77e547996 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -128,7 +128,7 @@ async def run_delete_org_job( job_id = f"delete-org-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, job_id, job_type=BgJobType.DELETE_ORG.value + job_id, job_type=BgJobType.DELETE_ORG.value, oid=oid ) async def run_recalculate_org_stats_job( @@ -144,9 +144,7 @@ async def run_recalculate_org_stats_job( job_id = f"org-stats-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, - job_id, - job_type=BgJobType.RECALCULATE_ORG_STATS.value, + job_id, job_type=BgJobType.RECALCULATE_ORG_STATS.value, oid=oid ) async def run_re_add_org_pages_job( @@ -163,27 +161,54 @@ async def run_re_add_org_pages_job( job_id = f"org-pages-{oid}-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - oid, job_id, job_type=BgJobType.READD_ORG_PAGES.value, + oid=oid, + crawl_type=crawl_type, + ) + + async def run_optimize_pages_job( + self, + crawl_type: Optional[str] = None, + existing_job_id: Optional[str] = None, + ) -> str: + """run job to recalculate storage stats for the org""" + + if existing_job_id: + job_id = existing_job_id + else: + job_id = f"optimize-pages-{secrets.token_hex(5)}" + + return await self._run_bg_job_with_ops_classes( + job_id, + job_type=BgJobType.OPTIMIZE_PAGES.value, + migration_job=True, crawl_type=crawl_type, ) async def _run_bg_job_with_ops_classes( - self, oid: str, job_id: str, job_type: str, **kwargs + self, + job_id: 
str, + job_type: str, + oid: Optional[str] = None, + migration_job: bool = False, + **kwargs, ) -> str: """run background job with access to ops classes""" params = { "id": job_id, - "oid": oid, "job_type": job_type, "backend_image": os.environ.get("BACKEND_IMAGE", ""), "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""), **kwargs, } + if oid: + params["oid"] = oid + + template = "migration_job.yaml" if migration_job else "background_job.yaml" - data = self.templates.env.get_template("background_job.yaml").render(params) + data = self.templates.env.get_template(template).render(params) await self.create_from_yaml(data, namespace=DEFAULT_NAMESPACE) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 4a57f96f06..2d33dd3e4a 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -30,6 +30,17 @@ async def main(): (org_ops, _, _, _, _, page_ops, coll_ops, _, _, _, _, user_manager) = init_ops() + # Run job (generic) + if job_type == BgJobType.OPTIMIZE_PAGES: + try: + await page_ops.optimize_crawl_pages(version=2, crawl_type=crawl_type) + return 0 + # pylint: disable=broad-exception-caught + except Exception: + traceback.print_exc() + return 1 + + # Run job (org-specific) if not oid: print("Org id missing, quitting") return 1 @@ -39,7 +50,6 @@ async def main(): print("Org id invalid, quitting") return 1 - # Run job if job_type == BgJobType.DELETE_ORG: try: await org_ops.delete_org_and_data(org, user_manager) diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py index 369510e991..5cb57f69f8 100644 --- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py +++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py @@ -15,52 +15,28 @@ class Migration(BaseMigration): def __init__(self, mdb, **kwargs): super().__init__(mdb, migration_version=MIGRATION_VERSION) - self.page_ops = kwargs.get("page_ops") + self.background_job_ops = kwargs.get("background_job_ops") async def migrate_up(self): """Perform migration up. - Add filename to all pages that don't currently have it stored, - iterating through each archived item and its WACZ files as necessary + Optimize crawl pages for optimized replay in background job by adding + filename, isSeed, depth, and favIconUrl as needed. 
""" - pages_mdb = self.mdb["pages"] - - if self.page_ops is None: + if self.background_job_ops is None: print( - "Unable to add filename and other fields to pages, missing page_ops", + "Unable to start background job to optimize pages, ops class missing", flush=True, ) return - # crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None}) - # crawl_count = len(crawl_ids_to_update) - aggregate = [ - {"$match": {"filename": None}}, - {"$group": {"_id": "$crawl_id"}}, - ] - total_agg = aggregate.copy() - total_agg.append({"$count": "count"}) - - res = pages_mdb.aggregate(total_agg) - res = await res.to_list(1) - crawl_count = res[0].get("count") - - print(f"Total crawls to update pages for: {crawl_count}") - - cursor = pages_mdb.aggregate(aggregate) - - current_index = 1 - - async for crawl in cursor: - crawl_id = crawl.get("_id") - print(f"Migrating archived item {current_index}/{crawl_count}", flush=True) - try: - await self.page_ops.re_add_crawl_pages(crawl_id) - # await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id) - # pylint: disable=broad-exception-caught - except Exception as err: - print( - f"Error adding filename and other fields to pages in item {crawl_id}: {err}", - flush=True, - ) - current_index += 1 + try: + await self.background_job_ops.create_optimize_crawl_pages_job( + crawl_type="crawl" + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to start background job to optimize pages: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 282f5a65ce..c5f7dcd9ee 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2540,6 +2540,7 @@ class BgJobType(str, Enum): DELETE_ORG = "delete-org" RECALCULATE_ORG_STATS = "recalculate-org-stats" READD_ORG_PAGES = "readd-org-pages" + OPTIMIZE_PAGES = "optimize-pages" # ============================================================================ @@ -2548,7 +2549,7 @@ class BackgroundJob(BaseMongoModel): id: str type: BgJobType - oid: UUID + oid: Optional[UUID] = None success: Optional[bool] = None started: datetime finished: Optional[datetime] = None @@ -2601,6 +2602,14 @@ class ReAddOrgPagesJob(BackgroundJob): crawl_type: Optional[str] = None +# ============================================================================ +class OptimizePagesJob(BackgroundJob): + """Model for tracking jobs to optimize pages across all orgs""" + + type: Literal[BgJobType.OPTIMIZE_PAGES] = BgJobType.OPTIMIZE_PAGES + crawl_type: Optional[str] = None + + # ============================================================================ # Union of all job types, for response model @@ -2612,6 +2621,7 @@ class ReAddOrgPagesJob(BackgroundJob): DeleteOrgJob, RecalculateOrgStatsJob, ReAddOrgPagesJob, + OptimizePagesJob, ] ] diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index f8ddb9ae1f..d1fc6b643a 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -5,6 +5,7 @@ import asyncio import os import re +import time import traceback import urllib.parse from datetime import datetime @@ -977,6 +978,49 @@ async def set_archived_item_page_counts(self, crawl_id: str): {"$set": {"uniquePageCount": unique_page_count, "pageCount": page_count}}, ) + async def optimize_crawl_pages( + self, version: int = 2, crawl_type: Optional[str] = None + ): + """Iterate through crawls, optimizing pages""" + while True: + # Pull new crawl + match_query = {"version": {"$ne": version}, "isMigrating": {"$ne": 
True}} + if crawl_type in ("crawl", "upload"): + match_query["type"] = crawl_type + + next_crawl = await self.crawls.find_one(match_query) + if next_crawl is None: + break + + crawl_id = next_crawl.get("_id") + + # Set isMigrating + await self.crawls.find_one_and_update( + {"_id": crawl_id}, {"$set": {"isMigrating": True}} + ) + + # Re-add crawl pages if at least one page doesn't have filename set + has_page_no_filename = await self.pages.find_one( + {"crawl_id": crawl_id, "filename": None} + ) + + if has_page_no_filename: + await self.re_add_crawl_pages(crawl_id) + + # Update crawl status + await self.crawls.find_one_and_update( + {"_id": crawl_id}, {"$set": {"version": version, "isMigrating": False}} + ) + + # Wait until all pods are fully done before returning. For k8s job + # parallelism to work as expected, pods must only return exit code 0 + # once the work in all pods is fully complete. + while True: + in_progress = await self.crawls.find_one({"isMigrating": True}) + if in_progress is None: + break + time.sleep(5) + # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme diff --git a/chart/app-templates/migration_job.yaml b/chart/app-templates/migration_job.yaml new file mode 100644 index 0000000000..d48fe5fac8 --- /dev/null +++ b/chart/app-templates/migration_job.yaml @@ -0,0 +1,59 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ id }}" + labels: + role: "background-job" + job_type: {{ job_type }} + +spec: + ttlSecondsAfterFinished: 90 + backoffLimit: 3 + template: + spec: + parallelism: 3 + restartPolicy: Never + priorityClassName: bg-job + podFailurePolicy: + rules: + - action: FailJob + onExitCodes: + containerName: btrixbgjob + operator: NotIn + values: [0] + + volumes: + - name: ops-configs + secret: + secretName: ops-configs + + containers: + - name: btrixbgjob + image: {{ backend_image }} + imagePullPolicy: {{ pull_policy }} + env: + - name: BG_JOB_TYPE + value: {{ job_type }} + + - name: CRAWL_TYPE + value: {{ crawl_type }} + + envFrom: + - configMapRef: + name: backend-env-config + - secretRef: + name: mongo-auth + + volumeMounts: + - name: ops-configs + mountPath: /ops-configs/ + + command: ["python3", "-m", "btrixcloud.main_bg"] + + resources: + limits: + memory: "500Mi" + + requests: + memory: "250Mi" + cpu: "200m" From 4ea162f932068aa2d49a1e944e0d5ef63fbd3b2a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 15:34:15 -0500 Subject: [PATCH 09/59] Hacky workaroud: Pass default org to retry_background_job Now that we have background jobs that aren't tied to a specific org, much of the background_jobs module needs to be reworked to account for that. That will take some time, so for now, so that we can test the migration, we just pass the default org to retry_background_job if the job doesn't have an oid set. 
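Roughly, the interim org-resolution pattern is the following (a sketch of
what retry_all_failed_background_jobs does below; get_default_org() is the
existing org_ops helper used there, and the asyncio task scheduling around
the retry call is unchanged):

    async for job in self.jobs.find({"success": False}):
        if job["oid"]:
            org = await self.org_ops.get_org_by_id(job["oid"])
        else:
            # temporary: org-less jobs (e.g. optimize-pages) are retried
            # against the default org until retry_background_job is reworked
            org = await self.org_ops.get_default_org()
        await self.retry_background_job(job["_id"], org)

The new superuser retry endpoint applies the same default-org fallback for
optimize-pages jobs that have no oid.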
--- backend/btrixcloud/background_jobs.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 08bd7a8b0b..b45c02e3d3 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -638,10 +638,10 @@ async def get_replica_job_file( raise HTTPException(status_code=404, detail="file_not_found") async def retry_background_job( - self, job_id: str, org: Optional[Organization] = None + self, job_id: str, org: Organization ) -> Dict[str, Union[bool, Optional[str]]]: """Retry background job""" - job = await self.get_background_job(job_id, org.id) + job = await self.get_background_job(job_id) if not job: raise HTTPException(status_code=404, detail="job_not_found") @@ -733,9 +733,11 @@ async def retry_all_failed_background_jobs( """ bg_tasks = set() async for job in self.jobs.find({"success": False}): - org = None if job["oid"]: org = await self.org_ops.get_org_by_id(job["oid"]) + else: + # Hacky workaround until we rework retry_background_job + org = await self.org_ops.get_default_org() task = asyncio.create_task(self.retry_background_job(job["_id"], org)) bg_tasks.add(task) task.add_done_callback(bg_tasks.discard) @@ -786,7 +788,18 @@ async def retry_background_job_no_org(job_id: str, user: User = Depends(user_dep if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.retry_background_job(job_id) + job = await ops.get_background_job(job_id) + + if job.oid: + org = await ops.org_ops.get_org_by_id(job.oid) + # Use default org for org-less jobs without oid for now, until we + # can rework retry_background_job + elif job.type == BgJobType.OPTIMIZE_PAGES: + org = await ops.org_ops.get_default_org() + else: + return HTTPException(status_code=404, detail="job_not_found") + + return await ops.retry_background_job(job_id, org) @router.post("/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"]) async def retry_background_job( From b67e2c7f2ef255a4dd5e928fd7ed7c171c59ee4d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 15:51:26 -0500 Subject: [PATCH 10/59] Make sure job failure emails are sent even if no oid --- backend/btrixcloud/background_jobs.py | 10 ++++++---- backend/btrixcloud/emailsender.py | 2 +- backend/btrixcloud/operator/bgjobs.py | 8 +++++++- chart/email-templates/failed_bg_job | 3 ++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index b45c02e3d3..043b394dd9 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -472,14 +472,14 @@ async def job_finished( self, job_id: str, job_type: str, - oid: UUID, success: bool, finished: datetime, + oid: Optional[UUID] = None, ) -> None: """Update job as finished, including job-specific task handling""" - job = await self.get_background_job(job_id, oid) + job = await self.get_background_job(job_id) if job.finished: return @@ -499,14 +499,16 @@ async def job_finished( flush=True, ) superuser = await self.user_manager.get_superuser() - org = await self.org_ops.get_org_by_id(job.oid) + org = None + if job.oid: + org = await self.org_ops.get_org_by_id(job.oid) await asyncio.get_event_loop().run_in_executor( None, self.email.send_background_job_failed, job, - org, finished, superuser.email, + org, ) await self.jobs.find_one_and_update( diff --git a/backend/btrixcloud/emailsender.py 
b/backend/btrixcloud/emailsender.py index e710f99ce9..7651e8dc3f 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -154,9 +154,9 @@ def send_user_forgot_password(self, receiver_email, token, headers=None): def send_background_job_failed( self, job: Union[CreateReplicaJob, DeleteReplicaJob], - org: Organization, finished: datetime, receiver_email: str, + org: Optional[Organization] = None, ): """Send background job failed email to superuser""" self._send_encrypted( diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py index 4538582c08..3bb1c8d593 100644 --- a/backend/btrixcloud/operator/bgjobs.py +++ b/backend/btrixcloud/operator/bgjobs.py @@ -49,9 +49,15 @@ async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict: if not finished: finished = dt_now() + try: + oid = UUID(oid) + # pylint: disable=broad-except + except Exception: + oid = None + try: await self.background_job_ops.job_finished( - job_id, job_type, UUID(oid), success=success, finished=finished + job_id, job_type, success=success, finished=finished, oid=oid ) # print( # f"{job_type} background job completed: success: {success}, {job_id}", diff --git a/chart/email-templates/failed_bg_job b/chart/email-templates/failed_bg_job index 27e1136314..528cfc33f3 100644 --- a/chart/email-templates/failed_bg_job +++ b/chart/email-templates/failed_bg_job @@ -2,8 +2,9 @@ Failed Background Job ~~~ Failed Background Job --------------------- - +{% if org %} Organization: {{ org.name }} ({{ job.oid }}) +{% endif %} Job type: {{ job.type }} Job ID: {{ job.id }} From bd2e2c254d4874f99d4fc5dd9caab740e77c327c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 15:54:43 -0500 Subject: [PATCH 11/59] Fix typing error --- backend/btrixcloud/operator/bgjobs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py index 3bb1c8d593..dc7cb63d77 100644 --- a/backend/btrixcloud/operator/bgjobs.py +++ b/backend/btrixcloud/operator/bgjobs.py @@ -50,14 +50,14 @@ async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict: finished = dt_now() try: - oid = UUID(oid) + org_id = UUID(oid) # pylint: disable=broad-except except Exception: - oid = None + org_id = None try: await self.background_job_ops.job_finished( - job_id, job_type, success=success, finished=finished, oid=oid + job_id, job_type, success=success, finished=finished, oid=org_id ) # print( # f"{job_type} background job completed: success: {success}, {job_id}", From eb455a1e2df7a7e300b36572a499c98de60ef53e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 16:07:40 -0500 Subject: [PATCH 12/59] Only include replay optimization fields if all pages are optimized Only include initialPages, pagesQueryUrl, and preloadResources in replay.json responses for crawls and collections if all of the relevant crawls have version set to 2. 
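In rough terms, the per-item gate looks like this (a sketch of the check
applied in get_crawl_out below; items that predate the version field are
treated as version 1 and therefore unoptimized):

    if res.get("version", 1) == 2:
        # only expose the paged-pages fields for items already stamped
        # as optimized (version 2) by new crawls/uploads or the migration
        res["initialPages"], _ = await self.page_ops.list_pages(
            crawlid, is_seed=True, page_size=25
        )
        oid = res.get("oid")
        if oid:
            res["pagesQueryUrl"] = (
                get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
            )

For collections, get_collection_crawl_resources() now also reports whether
every member crawl is at version 2, and initialPages and pagesQueryUrl are
only added to the collection response when that flag is true.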
--- backend/btrixcloud/basecrawls.py | 17 +++++++++-------- backend/btrixcloud/colls.py | 30 ++++++++++++++++++------------ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index b5bb52a33f..ed71afe081 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -170,16 +170,17 @@ async def get_crawl_out( if coll_ids: res["collections"] = await self.colls.get_collection_names(coll_ids) - res["initialPages"], _ = await self.page_ops.list_pages( - crawlid, is_seed=True, page_size=25 - ) - - oid = res.get("oid") - if oid: - res["pagesQueryUrl"] = ( - get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages" + if res.get("version", 1) == 2: + res["initialPages"], _ = await self.page_ops.list_pages( + crawlid, is_seed=True, page_size=25 ) + oid = res.get("oid") + if oid: + res["pagesQueryUrl"] = ( + get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages" + ) + crawl = CrawlOutWithResources.from_dict(res) if not skip_resources: diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 6d96dce8f4..91c54687c3 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -347,21 +347,24 @@ async def get_collection_out( result = await self.get_collection_raw(coll_id, public_or_unlisted_only) if resources: - result["resources"], result["preloadResources"] = ( + result["resources"], result["preloadResources"], pages_optimized = ( await self.get_collection_crawl_resources( coll_id, include_preloads=True ) ) - result["initialPages"], result["totalPages"] = ( + initial_pages, result["totalPages"] = ( await self.page_ops.list_collection_pages(coll_id, page_size=25) ) public = "public/" if public_or_unlisted_only else "" - result["pagesQueryUrl"] = ( - get_origin(headers) - + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages" - ) + + if pages_optimized: + result["initialPages"] = initial_pages + result["pagesQueryUrl"] = ( + get_origin(headers) + + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages" + ) thumbnail = result.get("thumbnail") if thumbnail: @@ -388,7 +391,7 @@ async def get_public_collection_out( if result.get("access") not in allowed_access: raise HTTPException(status_code=404, detail="collection_not_found") - result["resources"], _ = await self.get_collection_crawl_resources(coll_id) + result["resources"], _, _ = await self.get_collection_crawl_resources(coll_id) thumbnail = result.get("thumbnail") if thumbnail: @@ -487,7 +490,7 @@ async def list_collections( collections: List[Union[CollOut, PublicCollOut]] = [] for res in items: - res["resources"], res["preloadResources"] = ( + res["resources"], res["preloadResources"], _ = ( await self.get_collection_crawl_resources( res["_id"], include_preloads=not public_colls_out ) @@ -521,6 +524,8 @@ async def get_collection_crawl_resources( _ = await self.get_collection_raw(coll_id) resources = [] + preload_resources: List[PreloadResource] = [] + pages_optimized = True crawls, _ = await self.crawl_ops.list_all_base_crawls( collection_id=coll_id, @@ -532,15 +537,16 @@ async def get_collection_crawl_resources( for crawl in crawls: if crawl.resources: resources.extend(crawl.resources) - - preload_resources: List[PreloadResource] = [] + if crawl.version != 2: + include_preloads = False + pages_optimized = False if include_preloads: no_page_items = await self.get_collection_resources_with_no_pages(crawls) for item in no_page_items: preload_resources.append(item) - return resources, 
preload_resources + return resources, preload_resources, pages_optimized async def get_collection_resources_with_no_pages( self, crawls: List[CrawlOutWithResources] @@ -1056,7 +1062,7 @@ async def get_collection_all(org: Organization = Depends(org_viewer_dep)): try: all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: - results[collection.name], _ = ( + results[collection.name], _, _ = ( await colls.get_collection_crawl_resources(collection.id) ) except Exception as exc: From bef7a18228a4a3b9b5123da92a65e6058eb2a4df Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 16:14:46 -0500 Subject: [PATCH 13/59] Set isMigrating at same time as pulling next crawl --- backend/btrixcloud/pages.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index d1fc6b643a..7c50b263bb 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -983,31 +983,27 @@ async def optimize_crawl_pages( ): """Iterate through crawls, optimizing pages""" while True: - # Pull new crawl + # Pull new crawl and set isMigrating match_query = {"version": {"$ne": version}, "isMigrating": {"$ne": True}} if crawl_type in ("crawl", "upload"): match_query["type"] = crawl_type - next_crawl = await self.crawls.find_one(match_query) + next_crawl = await self.crawls.find_one_and_update( + match_query, {"$set": {"isMigrating": True}} + ) if next_crawl is None: break crawl_id = next_crawl.get("_id") - # Set isMigrating - await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$set": {"isMigrating": True}} - ) - # Re-add crawl pages if at least one page doesn't have filename set has_page_no_filename = await self.pages.find_one( {"crawl_id": crawl_id, "filename": None} ) - if has_page_no_filename: await self.re_add_crawl_pages(crawl_id) - # Update crawl status + # Update crawl version and unset isMigrating await self.crawls.find_one_and_update( {"_id": crawl_id}, {"$set": {"version": version, "isMigrating": False}} ) From 4f905556683e66b16d33e6aaa06b0ab01e12e841 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 16:29:31 -0500 Subject: [PATCH 14/59] Redo background_jobs typing, add superuser non-org list endpoint --- backend/btrixcloud/background_jobs.py | 96 +++++++++++++++++++-------- 1 file changed, 67 insertions(+), 29 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 043b394dd9..6f00c6ef2b 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -558,7 +558,7 @@ def _get_job_by_type_from_data(self, data: dict[str, object]): async def list_background_jobs( self, - org: Organization, + org: Optional[Organization] = None, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, success: Optional[bool] = None, @@ -572,7 +572,10 @@ async def list_background_jobs( page = page - 1 skip = page_size * page - query: dict[str, object] = {"oid": org.id} + query: dict[str, object] = {} + + if org: + query["oid"] = org.id if success in (True, False): query["success"] = success @@ -640,8 +643,8 @@ async def get_replica_job_file( raise HTTPException(status_code=404, detail="file_not_found") async def retry_background_job( - self, job_id: str, org: Organization - ) -> Dict[str, Union[bool, Optional[str]]]: + self, job_id: str, org: Optional[Organization] = None + ): """Retry background job""" job = await self.get_background_job(job_id) if not job: @@ -653,6 +656,22 
@@ async def retry_background_job( if job.success: raise HTTPException(status_code=400, detail="job_already_succeeded") + if org: + return await self.retry_org_background_job(job, org) + + if job.type == BgJobType.OPTIMIZE_PAGES: + await self.create_optimize_crawl_pages_job( + job.crawl_type, + existing_job_id=job_id, + ) + return {"success": True} + + return {"success": False} + + async def retry_org_background_job( + self, job: BackgroundJob, org: Organization + ) -> Dict[str, Union[bool, Optional[str]]]: + """Retry background job specific to one org""" if job.type == BgJobType.CREATE_REPLICA: file = await self.get_replica_job_file(job, org) primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage) @@ -670,6 +689,7 @@ async def retry_background_job( primary_endpoint, existing_job_id=job_id, ) + return {"success": True} if job.type == BgJobType.DELETE_REPLICA: file = await self.get_replica_job_file(job, org) @@ -682,18 +702,21 @@ async def retry_background_job( force_start_immediately=True, existing_job_id=job_id, ) + return {"success": True} if job.type == BgJobType.DELETE_ORG: await self.create_delete_org_job( org, existing_job_id=job_id, ) + return {"success": True} if job.type == BgJobType.RECALCULATE_ORG_STATS: await self.create_recalculate_org_stats_job( org, existing_job_id=job_id, ) + return {"success": True} if job.type == BgJobType.READD_ORG_PAGES: await self.create_re_add_org_pages_job( @@ -701,16 +724,11 @@ async def retry_background_job( job.crawl_type, existing_job_id=job_id, ) + return {"success": True} - if job.type == BgJobType.OPTIMIZE_PAGES: - await self.create_optimize_crawl_pages_job( - job.crawl_type, - existing_job_id=job_id, - ) - - return {"success": True} + return {"success": False} - async def retry_failed_background_jobs( + async def retry_failed_org_background_jobs( self, org: Organization ) -> Dict[str, Union[bool, Optional[str]]]: """Retry all failed background jobs in an org @@ -735,11 +753,9 @@ async def retry_all_failed_background_jobs( """ bg_tasks = set() async for job in self.jobs.find({"success": False}): + org = None if job["oid"]: org = await self.org_ops.get_org_by_id(job["oid"]) - else: - # Hacky workaround until we rework retry_background_job - org = await self.org_ops.get_default_org() task = asyncio.create_task(self.retry_background_job(job["_id"], org)) bg_tasks.add(task) task.add_done_callback(bg_tasks.discard) @@ -767,7 +783,7 @@ def init_background_jobs_api( "/{job_id}", response_model=AnyJob, ) - async def get_background_job( + async def get_org_background_job( job_id: str, org: Organization = Depends(org_crawl_dep), ): @@ -786,25 +802,20 @@ async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep "/orgs/all/jobs/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"] ) async def retry_background_job_no_org(job_id: str, user: User = Depends(user_dep)): - """Retry backgound migration job""" + """Retry backgound job that doesn't belong to an org, e.g. 
migration job""" if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") job = await ops.get_background_job(job_id) + org = None if job.oid: org = await ops.org_ops.get_org_by_id(job.oid) - # Use default org for org-less jobs without oid for now, until we - # can rework retry_background_job - elif job.type == BgJobType.OPTIMIZE_PAGES: - org = await ops.org_ops.get_default_org() - else: - return HTTPException(status_code=404, detail="job_not_found") return await ops.retry_background_job(job_id, org) @router.post("/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"]) - async def retry_background_job( + async def retry_org_background_job( job_id: str, org: Organization = Depends(org_crawl_dep), ): @@ -821,14 +832,41 @@ async def retry_all_failed_background_jobs(user: User = Depends(user_dep)): return await ops.retry_all_failed_background_jobs() - @router.post("/retryFailed", response_model=SuccessResponse) - async def retry_failed_background_jobs( + @router.post("/retryFailed", response_model=SuccessResponse, tags=["jobs"]) + async def retry_failed_org_background_jobs( org: Organization = Depends(org_crawl_dep), ): """Retry failed background jobs""" - return await ops.retry_failed_background_jobs(org) + return await ops.retry_failed_org_background_jobs(org) + + @app.get( + "/orgs/all/jobs", response_model=PaginatedBackgroundJobResponse, tags=["jobs"] + ) + async def list_all_background_jobs( + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + success: Optional[bool] = None, + jobType: Optional[str] = None, + sortBy: Optional[str] = None, + sortDirection: Optional[int] = -1, + user: User = Depends(user_dep), + ): + """Retrieve paginated list of background jobs""" + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + jobs, total = await ops.list_background_jobs( + org=None, + page_size=pageSize, + page=page, + success=success, + job_type=jobType, + sort_by=sortBy, + sort_direction=sortDirection, + ) + return paginated_format(jobs, total, page, pageSize) - @router.get("", response_model=PaginatedBackgroundJobResponse) + @router.get("", response_model=PaginatedBackgroundJobResponse, tags=["jobs"]) async def list_background_jobs( org: Organization = Depends(org_crawl_dep), pageSize: int = DEFAULT_PAGE_SIZE, @@ -840,7 +878,7 @@ async def list_background_jobs( ): """Retrieve paginated list of background jobs""" jobs, total = await ops.list_background_jobs( - org, + org=org, page_size=pageSize, page=page, success=success, From 25c824414ec6da8b1a5b084cb134a98266c039bf Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 16:36:29 -0500 Subject: [PATCH 15/59] Remove crawl_type from optimize_pages bg job --- backend/btrixcloud/background_jobs.py | 4 ---- backend/btrixcloud/crawlmanager.py | 2 -- backend/btrixcloud/main_bg.py | 2 +- .../btrixcloud/migrations/migration_0042_page_filenames.py | 4 +--- backend/btrixcloud/models.py | 1 - backend/btrixcloud/pages.py | 6 +----- 6 files changed, 3 insertions(+), 16 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 6f00c6ef2b..690d46d893 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -427,14 +427,12 @@ async def create_re_add_org_pages_job( async def create_optimize_crawl_pages_job( self, - crawl_type: Optional[str] = None, existing_job_id: Optional[str] = None, ): """Create job to optimize crawl pages""" try: job_id = await 
self.crawl_manager.run_optimize_pages_job( - crawl_type=crawl_type, existing_job_id=existing_job_id, ) if existing_job_id: @@ -453,7 +451,6 @@ async def create_optimize_crawl_pages_job( else: optimize_pages_job = OptimizePagesJob( id=job_id, - crawl_type=crawl_type, started=dt_now(), ) @@ -661,7 +658,6 @@ async def retry_background_job( if job.type == BgJobType.OPTIMIZE_PAGES: await self.create_optimize_crawl_pages_job( - job.crawl_type, existing_job_id=job_id, ) return {"success": True} diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index d77e547996..4ec32c4606 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -169,7 +169,6 @@ async def run_re_add_org_pages_job( async def run_optimize_pages_job( self, - crawl_type: Optional[str] = None, existing_job_id: Optional[str] = None, ) -> str: """run job to recalculate storage stats for the org""" @@ -183,7 +182,6 @@ async def run_optimize_pages_job( job_id, job_type=BgJobType.OPTIMIZE_PAGES.value, migration_job=True, - crawl_type=crawl_type, ) async def _run_bg_job_with_ops_classes( diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 2d33dd3e4a..205d2bb60f 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -33,7 +33,7 @@ async def main(): # Run job (generic) if job_type == BgJobType.OPTIMIZE_PAGES: try: - await page_ops.optimize_crawl_pages(version=2, crawl_type=crawl_type) + await page_ops.optimize_crawl_pages(version=2) return 0 # pylint: disable=broad-exception-caught except Exception: diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py index 5cb57f69f8..3a5b5723cc 100644 --- a/backend/btrixcloud/migrations/migration_0042_page_filenames.py +++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py @@ -31,9 +31,7 @@ async def migrate_up(self): return try: - await self.background_job_ops.create_optimize_crawl_pages_job( - crawl_type="crawl" - ) + await self.background_job_ops.create_optimize_crawl_pages_job() # pylint: disable=broad-exception-caught except Exception as err: print( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index c5f7dcd9ee..33d4109012 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2607,7 +2607,6 @@ class OptimizePagesJob(BackgroundJob): """Model for tracking jobs to optimize pages across all orgs""" type: Literal[BgJobType.OPTIMIZE_PAGES] = BgJobType.OPTIMIZE_PAGES - crawl_type: Optional[str] = None # ============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 7c50b263bb..e93c057a58 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -978,15 +978,11 @@ async def set_archived_item_page_counts(self, crawl_id: str): {"$set": {"uniquePageCount": unique_page_count, "pageCount": page_count}}, ) - async def optimize_crawl_pages( - self, version: int = 2, crawl_type: Optional[str] = None - ): + async def optimize_crawl_pages(self, version: int = 2): """Iterate through crawls, optimizing pages""" while True: # Pull new crawl and set isMigrating match_query = {"version": {"$ne": version}, "isMigrating": {"$ne": True}} - if crawl_type in ("crawl", "upload"): - match_query["type"] = crawl_type next_crawl = await self.crawls.find_one_and_update( match_query, {"$set": {"isMigrating": True}} From 
1891ef2b43384b0ddd3edb8d360200c97601b646 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 17:08:01 -0500 Subject: [PATCH 16/59] Handle running crawls in page optimization --- backend/btrixcloud/pages.py | 64 ++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index e93c057a58..0ffe26c53c 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -980,29 +980,55 @@ async def set_archived_item_page_counts(self, crawl_id: str): async def optimize_crawl_pages(self, version: int = 2): """Iterate through crawls, optimizing pages""" - while True: - # Pull new crawl and set isMigrating - match_query = {"version": {"$ne": version}, "isMigrating": {"$ne": True}} - next_crawl = await self.crawls.find_one_and_update( - match_query, {"$set": {"isMigrating": True}} - ) - if next_crawl is None: - break + async def process_finished_crawls(): + while True: + # Pull new finished crawl and set isMigrating + match_query = { + "version": {"$ne": version}, + "isMigrating": {"$ne": True}, + "finished": {"$ne": None}, + } - crawl_id = next_crawl.get("_id") + next_crawl = await self.crawls.find_one_and_update( + match_query, {"$set": {"isMigrating": True}} + ) + if next_crawl is None: + break - # Re-add crawl pages if at least one page doesn't have filename set - has_page_no_filename = await self.pages.find_one( - {"crawl_id": crawl_id, "filename": None} - ) - if has_page_no_filename: - await self.re_add_crawl_pages(crawl_id) + crawl_id = next_crawl.get("_id") - # Update crawl version and unset isMigrating - await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$set": {"version": version, "isMigrating": False}} - ) + # Re-add crawl pages if at least one page doesn't have filename set + has_page_no_filename = await self.pages.find_one( + {"crawl_id": crawl_id, "filename": None} + ) + if has_page_no_filename: + await self.re_add_crawl_pages(crawl_id) + + # Update crawl version and unset isMigrating + await self.crawls.find_one_and_update( + {"_id": crawl_id}, + {"$set": {"version": version, "isMigrating": False}}, + ) + + await process_finished_crawls() + + # Wait for running crawls from before migration to finish, and then process + # again when they're done to make sure everything's been handled + while True: + match_query = { + "version": {"$ne": version}, + "isMigrating": {"$ne": True}, + "finished": None, + } + running_crawl = await self.crawls.find_one(match_query) + + if not running_crawl: + break + + time.sleep(30) + + await process_finished_crawls() # Wait until all pods are fully done before returning. 
For k8s job # parallelism to work as expected, pods must only return exit code 0 From b2d97f0c531939c140117029490e39c7aec4797c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 17:10:17 -0500 Subject: [PATCH 17/59] Remove isMigrating filter from running crawl check --- backend/btrixcloud/pages.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 0ffe26c53c..d8405562af 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -1018,7 +1018,6 @@ async def process_finished_crawls(): while True: match_query = { "version": {"$ne": version}, - "isMigrating": {"$ne": True}, "finished": None, } running_crawl = await self.crawls.find_one(match_query) From ccd7f1e5b56e56db386783297dc17635cd741936 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 17:13:06 -0500 Subject: [PATCH 18/59] Fix bg job retry --- backend/btrixcloud/background_jobs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 690d46d893..3b6a4fc2bb 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -683,7 +683,7 @@ async def retry_org_background_job( job.replica_storage, primary_file_path, primary_endpoint, - existing_job_id=job_id, + existing_job_id=job.id, ) return {"success": True} @@ -696,21 +696,21 @@ async def retry_org_background_job( job.object_type, job.replica_storage, force_start_immediately=True, - existing_job_id=job_id, + existing_job_id=job.id, ) return {"success": True} if job.type == BgJobType.DELETE_ORG: await self.create_delete_org_job( org, - existing_job_id=job_id, + existing_job_id=job.id, ) return {"success": True} if job.type == BgJobType.RECALCULATE_ORG_STATS: await self.create_recalculate_org_stats_job( org, - existing_job_id=job_id, + existing_job_id=job.id, ) return {"success": True} @@ -718,7 +718,7 @@ async def retry_org_background_job( await self.create_re_add_org_pages_job( org.id, job.crawl_type, - existing_job_id=job_id, + existing_job_id=job.id, ) return {"success": True} From 19d2547fe78dec3c076ebb79da4083ec3996fe89 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 17:19:14 -0500 Subject: [PATCH 19/59] Cast job to right type in retry --- backend/btrixcloud/background_jobs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 3b6a4fc2bb..52e1b62f73 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -669,6 +669,7 @@ async def retry_org_background_job( ) -> Dict[str, Union[bool, Optional[str]]]: """Retry background job specific to one org""" if job.type == BgJobType.CREATE_REPLICA: + job = cast(CreateReplicaJob, job) file = await self.get_replica_job_file(job, org) primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage) primary_endpoint, bucket_suffix = self.strip_bucket( @@ -688,6 +689,7 @@ async def retry_org_background_job( return {"success": True} if job.type == BgJobType.DELETE_REPLICA: + job = cast(DeleteReplicaJob, job) file = await self.get_replica_job_file(job, org) await self.create_delete_replica_job( org, @@ -701,6 +703,7 @@ async def retry_org_background_job( return {"success": True} if job.type == BgJobType.DELETE_ORG: + job = cast(DeleteOrgJob, job) await self.create_delete_org_job( org, existing_job_id=job.id, @@ -708,6 +711,7 @@ async def 
retry_org_background_job( return {"success": True} if job.type == BgJobType.RECALCULATE_ORG_STATS: + job = cast(RecalculateOrgStatsJob, job) await self.create_recalculate_org_stats_job( org, existing_job_id=job.id, @@ -715,6 +719,7 @@ async def retry_org_background_job( return {"success": True} if job.type == BgJobType.READD_ORG_PAGES: + job = cast(ReAddOrgPagesJob, job) await self.create_re_add_org_pages_job( org.id, job.crawl_type, From 7bc33974bbd927cb4a50ac73f29c52e66286f533 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 17:27:34 -0500 Subject: [PATCH 20/59] Add API endpoint to launch migrate crawls job --- backend/btrixcloud/background_jobs.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 52e1b62f73..e87a7fd82a 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -29,6 +29,7 @@ StorageRef, User, SuccessResponse, + SuccessResponseId, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now @@ -815,6 +816,18 @@ async def retry_background_job_no_org(job_id: str, user: User = Depends(user_dep return await ops.retry_background_job(job_id, org) + @app.post( + "/orgs/all/jobs/migrateCrawls", response_model=SuccessResponse, tags=["jobs"] + ) + async def create_migrate_crawls_job(job_id: str, user: User = Depends(user_dep)): + """Launch background job to migrate all crawls to v2 with optimized pages""" + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + job_id = await ops.create_optimize_crawl_pages_job() + + return {"sucess": True, "id": job_id} + @router.post("/{job_id}/retry", response_model=SuccessResponse, tags=["jobs"]) async def retry_org_background_job( job_id: str, From 538d4a3eb5ceb495091ac74c12b2bcbe912abd16 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 17:34:37 -0500 Subject: [PATCH 21/59] Use SuccessResponseId model to include job id in response --- backend/btrixcloud/background_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index e87a7fd82a..52239a0188 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -817,7 +817,7 @@ async def retry_background_job_no_org(job_id: str, user: User = Depends(user_dep return await ops.retry_background_job(job_id, org) @app.post( - "/orgs/all/jobs/migrateCrawls", response_model=SuccessResponse, tags=["jobs"] + "/orgs/all/jobs/migrateCrawls", response_model=SuccessResponseId, tags=["jobs"] ) async def create_migrate_crawls_job(job_id: str, user: User = Depends(user_dep)): """Launch background job to migrate all crawls to v2 with optimized pages""" From 7d27feb0288bd894d7aab31cf2ed05bd61cffd51 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Feb 2025 18:05:16 -0500 Subject: [PATCH 22/59] Add first draft of upgrade notes in docs --- .../docs/docs/deploy/admin/upgrade-notes.md | 19 +++++++++++++++++++ frontend/docs/mkdocs.yml | 1 + 2 files changed, 20 insertions(+) create mode 100644 frontend/docs/docs/deploy/admin/upgrade-notes.md diff --git a/frontend/docs/docs/deploy/admin/upgrade-notes.md b/frontend/docs/docs/deploy/admin/upgrade-notes.md new file mode 100644 index 0000000000..21a422dc29 --- /dev/null +++ b/frontend/docs/docs/deploy/admin/upgrade-notes.md @@ -0,0 +1,19 @@ +# Upgrade Notes + +Some Browsertrix releases include long-running 
data migrations that may need to be monitored. This guide covers important information for such releases. + +## Browsertrix 1.14 + +Browsertrix 1.14, which introduces public collections, has several data migrations which affect crawl and upload objects as well as their pages. + +Migration 0042 in particular annotates all crawl pages in the database with information which is used to optimize loading times for crawl and collection replay. Because it must iterate through all crawl pages, this process can take a long time in deployments with many crawls and pages. + +In order to keep this optimization from blocking deployment, migration 0042 starts a parallelized background job that migrates the important data. + +If this background job fails for any reason, the superadmin will receive a background job failure notification. The status of the background job can also be checked, and the job retried as needed, at any time using these superadmin-only background job API endpoints: + +- List all background jobs: `GET /orgs/all/jobs` +- Get background job: `GET /orgs/all/jobs/{job_id}` +- Retry background job: `POST /orgs/all/jobs/{job_id}/retry` + +For more details on these and other available API endpoints, consult the [Browsertrix API documentation](/api). diff --git a/frontend/docs/mkdocs.yml b/frontend/docs/mkdocs.yml index 008d9a49a0..5ee2a33173 100644 --- a/frontend/docs/mkdocs.yml +++ b/frontend/docs/mkdocs.yml @@ -83,6 +83,7 @@ nav: - deploy/ansible/microk8s.md - deploy/ansible/k3s.md - Administration: + - deploy/admin/upgrade-notes.md - deploy/admin/org-import-export.md - Development: - develop/index.md From 821c02d0d989d5fb916799d38efb6e2748358e10 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 15:36:14 -0800 Subject: [PATCH 23/59] cleanup: - simplify group query for unique pages - remove unused code - convert array index to getter --- backend/btrixcloud/background_jobs.py | 2 +- backend/btrixcloud/pages.py | 63 +++------------------------ 2 files changed, 8 insertions(+), 57 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 52239a0188..c71df5e156 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -756,7 +756,7 @@ async def retry_all_failed_background_jobs( bg_tasks = set() async for job in self.jobs.find({"success": False}): org = None - if job["oid"]: + if job.get("oid"): org = await self.org_ops.get_org_by_id(job["oid"]) task = asyncio.create_task(self.retry_background_job(job["_id"], org)) bg_tasks.add(task) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index d8405562af..70f07d8c44 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -3,7 +3,6 @@ # pylint: disable=too-many-lines import asyncio -import os import re import time import traceback @@ -127,53 +126,6 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): traceback.print_exc() print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) - async def add_crawl_wacz_filename_to_pages(self, crawl_id: str): - """Add WACZ filename and additional fields to existing pages in crawl if not already set""" - try: - crawl = await self.crawl_ops.get_crawl_out(crawl_id) - if not crawl.resources: - return - - for wacz_file in crawl.resources: - # Strip oid directory from filename - filename = os.path.basename(wacz_file.name) - - stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file]) - for page_dict in stream: - if not
page_dict.get("url"): - continue - - page_id = page_dict.get("id") - - if not page_id: - continue - - if page_id: - try: - page_id = UUID(page_id) - # pylint: disable=broad-exception-caught - except Exception: - continue - - await self.pages.find_one_and_update( - {"_id": page_id}, - { - "$set": { - "filename": filename, - "depth": page_dict.get("depth"), - "isSeed": page_dict.get("seed", False), - "favIconUrl": page_dict.get("favIconUrl"), - } - }, - ) - # pylint: disable=broad-exception-caught, raise-missing-from - except Exception as err: - traceback.print_exc() - print( - f"Error adding filename to pages from item {crawl_id} to db: {err}", - flush=True, - ) - def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool ) -> Page: @@ -955,17 +907,16 @@ def get_crawl_type_from_pages_route(self, request: Request): async def get_unique_page_count(self, crawl_ids: List[str]) -> int: """Get count of unique page URLs across list of archived items""" - # unique_pages = await self.pages.distinct( - # "url", {"crawl_id": {"$in": crawl_ids}} - # ) - count = 0 cursor = self.pages.aggregate( - [{"$match": {"crawl_id": {"$in": crawl_ids}}}, {"$group": {"_id": "$url"}}] + [ + {"$match": {"crawl_id": {"$in": crawl_ids}}}, + {"$group": {"_id": "$url"}}, + {"$count": "urls"}, + ] ) - async for _res in cursor: - count += 1 + res = await cursor.to_list(1) - return count + return res[0].get("urls") async def set_archived_item_page_counts(self, crawl_id: str): """Store archived item page and unique page counts in crawl document""" From d311692bfbc1e00e8162a34de891969af3c2b328 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 15:42:44 -0800 Subject: [PATCH 24/59] add logging to optimize pages job --- backend/btrixcloud/pages.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 70f07d8c44..8cadfee55c 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -945,16 +945,21 @@ async def process_finished_crawls(): match_query, {"$set": {"isMigrating": True}} ) if next_crawl is None: + print("No more done crawls to migrate") break crawl_id = next_crawl.get("_id") + print("Processing crawl: " + crawl_id) # Re-add crawl pages if at least one page doesn't have filename set has_page_no_filename = await self.pages.find_one( {"crawl_id": crawl_id, "filename": None} ) if has_page_no_filename: + print("Re-importing pages to migrate to v2") await self.re_add_crawl_pages(crawl_id) + else: + print("Pages already have filename, set to v2") # Update crawl version and unset isMigrating await self.crawls.find_one_and_update( @@ -976,6 +981,7 @@ async def process_finished_crawls(): if not running_crawl: break + print("Running crawls remain, waiting for them to finish") time.sleep(30) await process_finished_crawls() @@ -987,6 +993,7 @@ async def process_finished_crawls(): in_progress = await self.crawls.find_one({"isMigrating": True}) if in_progress is None: break + print("Unmigrated crawls remain, finishing job") time.sleep(5) From 07d3c2543af19937084073402a9126dd4cf9bf07 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 15:45:10 -0800 Subject: [PATCH 25/59] logging --- backend/btrixcloud/pages.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 8cadfee55c..84abcc60c6 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -945,7 +945,7 @@ async def 
process_finished_crawls(): match_query, {"$set": {"isMigrating": True}} ) if next_crawl is None: - print("No more done crawls to migrate") + print("No more finished crawls to migrate") break crawl_id = next_crawl.get("_id") @@ -979,6 +979,7 @@ async def process_finished_crawls(): running_crawl = await self.crawls.find_one(match_query) if not running_crawl: + print("No running crawls remain") break print("Running crawls remain, waiting for them to finish") From 9fd26bf374da5bdb2df85c80d2e27cf0633d834f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 16:07:54 -0800 Subject: [PATCH 26/59] check for empty result --- backend/btrixcloud/pages.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 84abcc60c6..d0060ee48a 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -915,8 +915,7 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int: ] ) res = await cursor.to_list(1) - - return res[0].get("urls") + return res[0].get("urls") if res else 0 async def set_archived_item_page_counts(self, crawl_id: str): """Store archived item page and unique page counts in crawl document""" From 39ab851fd1669f7707f31bb3504862376399d686 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 16:08:19 -0800 Subject: [PATCH 27/59] version: bump to 1.14.0-beta.5 --- backend/btrixcloud/version.py | 2 +- chart/Chart.yaml | 2 +- chart/values.yaml | 4 ++-- frontend/package.json | 2 +- version.txt | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py index de6e390544..9644fe3fe2 100644 --- a/backend/btrixcloud/version.py +++ b/backend/btrixcloud/version.py @@ -1,3 +1,3 @@ """current version""" -__version__ = "1.14.0-beta.4" +__version__ = "1.14.0-beta.5" diff --git a/chart/Chart.yaml b/chart/Chart.yaml index d074ec0492..f749517563 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -5,7 +5,7 @@ type: application icon: https://webrecorder.net/assets/icon.png # Browsertrix and Chart Version -version: v1.14.0-beta.4 +version: v1.14.0-beta.5 dependencies: - name: btrix-admin-logging diff --git a/chart/values.yaml b/chart/values.yaml index c3aefbb85c..f006ec7937 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -103,7 +103,7 @@ replica_deletion_delay_days: 0 # API Image # ========================================= -backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.4" +backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.5" backend_pull_policy: "Always" backend_password_secret: "PASSWORD!" 
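The unique-page count introduced in the cleanup patch above reduces to a three-stage aggregation: $match on the crawl ids, $group by url, then $count. Below is a minimal standalone sketch of the same pattern with the motor async driver, including the empty-result guard added in the "check for empty result" patch; the connection string, database name, and collection name are illustrative assumptions, not taken from the patches.

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient

    async def unique_page_count(pages, crawl_ids):
        # $count emits no document at all when nothing matches, hence the guard below
        cursor = pages.aggregate(
            [
                {"$match": {"crawl_id": {"$in": crawl_ids}}},
                {"$group": {"_id": "$url"}},
                {"$count": "urls"},
            ]
        )
        res = await cursor.to_list(1)
        return res[0].get("urls") if res else 0

    async def main():
        # assumed local MongoDB and collection layout, for illustration only
        client = AsyncIOMotorClient("mongodb://localhost:27017")
        pages = client["btrix"]["pages"]
        print(await unique_page_count(pages, ["crawl-1", "crawl-2"]))

    if __name__ == "__main__":
        asyncio.run(main())

Grouping before counting keeps the distinct URLs on the server rather than materializing them in application memory, which is the motivation for replacing distinct() in these patches.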
@@ -161,7 +161,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= -frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.4" +frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.5" frontend_pull_policy: "Always" frontend_cpu: "10m" diff --git a/frontend/package.json b/frontend/package.json index 67fb1506de..0dfaaa4901 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-frontend", - "version": "1.14.0-beta.4", + "version": "1.14.0-beta.5", "main": "index.ts", "license": "AGPL-3.0-or-later", "dependencies": { diff --git a/version.txt b/version.txt index 377350eda3..33f1d187ca 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.14.0-beta.4 +1.14.0-beta.5 From f71a89ecb623a4942f1ba7d7b754f67c8fa4d899 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 16:13:17 -0800 Subject: [PATCH 28/59] fix response model --- backend/btrixcloud/background_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index c71df5e156..279f534ea3 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -792,7 +792,7 @@ async def get_org_background_job( """Retrieve information for background job""" return await ops.get_background_job(job_id, org.id) - @app.get("/orgs/all/jobs/{job_id}", response_model=SuccessResponse, tags=["jobs"]) + @app.get("/orgs/all/jobs/{job_id}", response_model=AnyJob, tags=["jobs"]) async def get_background_job_all_orgs(job_id: str, user: User = Depends(user_dep)): """Get background job from any org""" if not user.is_superuser: From 533f78c22842215efd38563d2a82ca978aa3c078 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 17:09:25 -0800 Subject: [PATCH 29/59] add retries for add_crawl_pages_to_db_from_wacz() add typing --- backend/btrixcloud/colls.py | 2 +- backend/btrixcloud/pages.py | 98 +++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 91c54687c3..de557555c7 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -518,7 +518,7 @@ async def list_collections( async def get_collection_crawl_resources( self, coll_id: UUID, include_preloads=False - ): + ) -> tuple[List[CrawlFileOut], List[PreloadResource], bool]: """Return pre-signed resources for all collection crawl files.""" # Ensure collection exists _ = await self.get_collection_raw(coll_id) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index d0060ee48a..7098fd4355 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -4,13 +4,14 @@ import asyncio import re -import time import traceback import urllib.parse from datetime import datetime from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 +from remotezip import RemoteIOError + from fastapi import Depends, HTTPException, Request, Response import pymongo @@ -81,50 +82,75 @@ async def set_ops(self, background_job_ops: BackgroundJobOps): """Set ops classes as needed""" self.background_job_ops = background_job_ops - async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): + async def add_crawl_pages_to_db_from_wacz( + self, crawl_id: str, batch_size=100, num_retries=5 + ): """Add pages to database from WACZ files""" pages_buffer: 
List[Page] = [] - try: - crawl = await self.crawl_ops.get_crawl_out(crawl_id) - stream = await self.storage_ops.sync_stream_wacz_pages( - crawl.resources or [] - ) - new_uuid = crawl.type == "upload" - seed_count = 0 - non_seed_count = 0 - for page_dict in stream: - if not page_dict.get("url"): - continue + retry = 0 + while True: + try: + crawl = await self.crawl_ops.get_crawl_out(crawl_id) + stream = await self.storage_ops.sync_stream_wacz_pages( + crawl.resources or [] + ) + new_uuid = crawl.type == "upload" + seed_count = 0 + non_seed_count = 0 + for page_dict in stream: + if not page_dict.get("url"): + continue + + page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get( + "seed" + ) - page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed") + if page_dict.get("isSeed"): + seed_count += 1 + else: + non_seed_count += 1 - if page_dict.get("isSeed"): - seed_count += 1 - else: - non_seed_count += 1 + if len(pages_buffer) > batch_size: + await self._add_pages_to_db(crawl_id, pages_buffer) + pages_buffer = [] + + pages_buffer.append( + self._get_page_from_dict( + page_dict, crawl_id, crawl.oid, new_uuid + ) + ) - if len(pages_buffer) > batch_size: + # Add any remaining pages in buffer to db + if pages_buffer: await self._add_pages_to_db(crawl_id, pages_buffer) - pages_buffer = [] - pages_buffer.append( - self._get_page_from_dict(page_dict, crawl_id, crawl.oid, new_uuid) + await self.set_archived_item_page_counts(crawl_id) + + print( + f"Added pages for crawl {crawl_id}: " + + f"{seed_count} Seed, {non_seed_count} Non-Seed", + flush=True, ) - # Add any remaining pages in buffer to db - if pages_buffer: - await self._add_pages_to_db(crawl_id, pages_buffer) + except RemoteIOError as rio: + msg = str(rio) + if msg.startswith("503") or msg.startswith("429"): + if retry < num_retries: + retry += 1 + print(f"Retrying, {retry} of {num_retries}, {msg}") + await asyncio.sleep(5) + continue - await self.set_archived_item_page_counts(crawl_id) + print(f"No more retries, {msg}") - print( - f"Added pages for crawl {crawl_id}: {seed_count} Seed, {non_seed_count} Non-Seed", - flush=True, - ) - # pylint: disable=broad-exception-caught, raise-missing-from - except Exception as err: - traceback.print_exc() - print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) + # pylint: disable=broad-exception-caught, raise-missing-from + except Exception as err: + traceback.print_exc() + print( + f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True + ) + + break def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool @@ -982,7 +1008,7 @@ async def process_finished_crawls(): break print("Running crawls remain, waiting for them to finish") - time.sleep(30) + await asyncio.sleep(30) await process_finished_crawls() @@ -994,7 +1020,7 @@ async def process_finished_crawls(): if in_progress is None: break print("Unmigrated crawls remain, finishing job") - time.sleep(5) + await asyncio.sleep(5) # ============================================================================ From 25c6e3a3311359d68dcbe632468320fb921d7bc1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 18:58:37 -0800 Subject: [PATCH 30/59] back to single background_job.yaml, just pass scale as param --- backend/btrixcloud/crawlmanager.py | 12 ++--- chart/app-templates/background_job.yaml | 8 +++- chart/app-templates/migration_job.yaml | 59 ------------------------- 3 files changed, 10 insertions(+), 69 deletions(-) delete mode 100644 
chart/app-templates/migration_job.yaml diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 4ec32c4606..8e55c543c1 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -168,8 +168,7 @@ async def run_re_add_org_pages_job( ) async def run_optimize_pages_job( - self, - existing_job_id: Optional[str] = None, + self, existing_job_id: Optional[str] = None, scale=3 ) -> str: """run job to recalculate storage stats for the org""" @@ -179,9 +178,7 @@ async def run_optimize_pages_job( job_id = f"optimize-pages-{secrets.token_hex(5)}" return await self._run_bg_job_with_ops_classes( - job_id, - job_type=BgJobType.OPTIMIZE_PAGES.value, - migration_job=True, + job_id, job_type=BgJobType.OPTIMIZE_PAGES.value, scale=scale ) async def _run_bg_job_with_ops_classes( @@ -189,7 +186,6 @@ async def _run_bg_job_with_ops_classes( job_id: str, job_type: str, oid: Optional[str] = None, - migration_job: bool = False, **kwargs, ) -> str: """run background job with access to ops classes""" @@ -204,9 +200,7 @@ async def _run_bg_job_with_ops_classes( if oid: params["oid"] = oid - template = "migration_job.yaml" if migration_job else "background_job.yaml" - - data = self.templates.env.get_template(template).render(params) + data = self.templates.env.get_template("background_job.yaml").render(params) await self.create_from_yaml(data, namespace=DEFAULT_NAMESPACE) diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 8c02f21091..45a269addc 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -5,13 +5,18 @@ metadata: labels: role: "background-job" job_type: {{ job_type }} +{% if oid %} btrix.org: {{ oid }} +{% endif %} spec: ttlSecondsAfterFinished: 90 backoffLimit: 3 template: spec: + {% if scale %} + parallelism: {{ scale }} + {% endif %} restartPolicy: Never priorityClassName: bg-job podFailurePolicy: @@ -35,9 +40,10 @@ spec: - name: BG_JOB_TYPE value: {{ job_type }} +{% if oid %} - name: OID value: {{ oid }} - +{% endif %} - name: CRAWL_TYPE value: {{ crawl_type }} diff --git a/chart/app-templates/migration_job.yaml b/chart/app-templates/migration_job.yaml deleted file mode 100644 index d48fe5fac8..0000000000 --- a/chart/app-templates/migration_job.yaml +++ /dev/null @@ -1,59 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: "{{ id }}" - labels: - role: "background-job" - job_type: {{ job_type }} - -spec: - ttlSecondsAfterFinished: 90 - backoffLimit: 3 - template: - spec: - parallelism: 3 - restartPolicy: Never - priorityClassName: bg-job - podFailurePolicy: - rules: - - action: FailJob - onExitCodes: - containerName: btrixbgjob - operator: NotIn - values: [0] - - volumes: - - name: ops-configs - secret: - secretName: ops-configs - - containers: - - name: btrixbgjob - image: {{ backend_image }} - imagePullPolicy: {{ pull_policy }} - env: - - name: BG_JOB_TYPE - value: {{ job_type }} - - - name: CRAWL_TYPE - value: {{ crawl_type }} - - envFrom: - - configMapRef: - name: backend-env-config - - secretRef: - name: mongo-auth - - volumeMounts: - - name: ops-configs - mountPath: /ops-configs/ - - command: ["python3", "-m", "btrixcloud.main_bg"] - - resources: - limits: - memory: "500Mi" - - requests: - memory: "250Mi" - cpu: "200m" From b4dcfd4e9ee469b356ea4996f9952b031424e346 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 19:59:54 -0800 Subject: [PATCH 31/59] fix parallelism setting --- chart/app-templates/background_job.yaml 
| 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 45a269addc..7472e139e0 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -12,11 +12,11 @@ metadata: spec: ttlSecondsAfterFinished: 90 backoffLimit: 3 - template: - spec: {% if scale %} - parallelism: {{ scale }} + parallelism: {{ scale }} {% endif %} + template: + spec: restartPolicy: Never priorityClassName: bg-job podFailurePolicy: From 35b788362e63bc964d4893d9f12782f67e6a375e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 21:07:06 -0800 Subject: [PATCH 32/59] retry at the wacz level, not crawl --- backend/btrixcloud/pages.py | 89 +++++++++++++--------------------- backend/btrixcloud/storages.py | 59 ++++++++++++++-------- 2 files changed, 72 insertions(+), 76 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 7098fd4355..0bd16de91e 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -10,8 +10,6 @@ from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 -from remotezip import RemoteIOError - from fastapi import Depends, HTTPException, Request, Response import pymongo @@ -87,70 +85,49 @@ async def add_crawl_pages_to_db_from_wacz( ): """Add pages to database from WACZ files""" pages_buffer: List[Page] = [] - retry = 0 - while True: - try: - crawl = await self.crawl_ops.get_crawl_out(crawl_id) - stream = await self.storage_ops.sync_stream_wacz_pages( - crawl.resources or [] - ) - new_uuid = crawl.type == "upload" - seed_count = 0 - non_seed_count = 0 - for page_dict in stream: - if not page_dict.get("url"): - continue - - page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get( - "seed" - ) - - if page_dict.get("isSeed"): - seed_count += 1 - else: - non_seed_count += 1 + crawl = await self.crawl_ops.get_crawl_out(crawl_id) + try: + stream = await self.storage_ops.sync_stream_wacz_pages( + crawl.resources or [], num_retries + ) + new_uuid = crawl.type == "upload" + seed_count = 0 + non_seed_count = 0 + for page_dict in stream: + if not page_dict.get("url"): + continue - if len(pages_buffer) > batch_size: - await self._add_pages_to_db(crawl_id, pages_buffer) - pages_buffer = [] + page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed") - pages_buffer.append( - self._get_page_from_dict( - page_dict, crawl_id, crawl.oid, new_uuid - ) - ) + if page_dict.get("isSeed"): + seed_count += 1 + else: + non_seed_count += 1 - # Add any remaining pages in buffer to db - if pages_buffer: + if len(pages_buffer) > batch_size: await self._add_pages_to_db(crawl_id, pages_buffer) + pages_buffer = [] - await self.set_archived_item_page_counts(crawl_id) - - print( - f"Added pages for crawl {crawl_id}: " - + f"{seed_count} Seed, {non_seed_count} Non-Seed", - flush=True, + pages_buffer.append( + self._get_page_from_dict(page_dict, crawl_id, crawl.oid, new_uuid) ) - except RemoteIOError as rio: - msg = str(rio) - if msg.startswith("503") or msg.startswith("429"): - if retry < num_retries: - retry += 1 - print(f"Retrying, {retry} of {num_retries}, {msg}") - await asyncio.sleep(5) - continue + # Add any remaining pages in buffer to db + if pages_buffer: + await self._add_pages_to_db(crawl_id, pages_buffer) - print(f"No more retries, {msg}") + await self.set_archived_item_page_counts(crawl_id) - # pylint: disable=broad-exception-caught, raise-missing-from - 
except Exception as err: - traceback.print_exc() - print( - f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True - ) + print( + f"Added pages for crawl {crawl_id}: " + + f"{seed_count} Seed, {non_seed_count} Non-Seed", + flush=True, + ) - break + # pylint: disable=broad-exception-caught, raise-missing-from + except Exception as err: + traceback.print_exc() + print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) def _get_page_from_dict( self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 620c4b293c..1e63af983d 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -18,6 +18,7 @@ from itertools import chain import asyncio +import time import heapq import zlib import json @@ -495,12 +496,14 @@ async def _delete_file( return status_code == 204 async def sync_stream_wacz_pages( - self, wacz_files: List[CrawlFileOut] + self, wacz_files: List[CrawlFileOut], num_retries=5 ) -> Iterator[Dict[Any, Any]]: """Return stream of pages specified WACZ""" loop = asyncio.get_event_loop() - resp = await loop.run_in_executor(None, self._sync_get_pages, wacz_files) + resp = await loop.run_in_executor( + None, self._sync_get_pages, wacz_files, num_retries + ) return resp @@ -600,8 +603,7 @@ def organize_based_on_instance_number( return stream_json_lines(heap_iter, log_levels, contexts) def _sync_get_pages( - self, - wacz_files: List[CrawlFileOut], + self, wacz_files: List[CrawlFileOut], num_retries=5 ) -> Iterator[Dict[Any, Any]]: """Generate stream of page dicts from specified WACZs""" @@ -631,22 +633,39 @@ def stream_page_lines( for wacz_file in wacz_files: wacz_url = self.resolve_internal_access_path(wacz_file.path) - with RemoteZip(wacz_url) as remote_zip: - page_files: List[ZipInfo] = [ - f - for f in remote_zip.infolist() - if f.filename.startswith("pages/") - and f.filename.endswith(".jsonl") - and not f.is_dir() - ] - for pagefile_zipinfo in page_files: - page_generators.append( - stream_page_lines( - pagefile_zipinfo, - wacz_url, - wacz_file.name, - ) - ) + + retry = 0 + + while True: + try: + with RemoteZip(wacz_url) as remote_zip: + page_files: List[ZipInfo] = [ + f + for f in remote_zip.infolist() + if f.filename.startswith("pages/") + and f.filename.endswith(".jsonl") + and not f.is_dir() + ] + for pagefile_zipinfo in page_files: + page_generators.append( + stream_page_lines( + pagefile_zipinfo, + wacz_url, + wacz_file.name, + ) + ) + except Exception as exc: + msg = str(exc) + if msg.startswith("503") or msg.startswith("429"): + if retry < num_retries: + retry += 1 + print(f"Retrying, {retry} of {num_retries}, {msg}") + time.sleep(30) + continue + + print(f"No more retries for error: {msg}, skipping {wacz_url}") + + break return chain.from_iterable(page_generators) From ed66a26af796c3e30b491729161af7bf5b04c99c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 21:35:27 -0800 Subject: [PATCH 33/59] bg jobs: make single crawl readd a background job as well, if crawl_id is provided, process single crawl --- backend/btrixcloud/background_jobs.py | 4 ++++ backend/btrixcloud/crawlmanager.py | 2 ++ backend/btrixcloud/main_bg.py | 8 +++++++- backend/btrixcloud/models.py | 3 ++- backend/btrixcloud/pages.py | 13 +++++++------ backend/btrixcloud/storages.py | 11 +++++------ chart/app-templates/background_job.yaml | 5 +++++ 7 files changed, 32 insertions(+), 14 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py 
b/backend/btrixcloud/background_jobs.py index 279f534ea3..afbb25e871 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -384,6 +384,7 @@ async def create_re_add_org_pages_job( self, oid: UUID, crawl_type: Optional[str] = None, + crawl_id: Optional[str] = None, existing_job_id: Optional[str] = None, ): """Create job to (re)add all pages in an org, optionally filtered by crawl type""" @@ -392,6 +393,7 @@ async def create_re_add_org_pages_job( job_id = await self.crawl_manager.run_re_add_org_pages_job( oid=str(oid), crawl_type=crawl_type, + crawl_id=crawl_id, existing_job_id=existing_job_id, ) if existing_job_id: @@ -412,6 +414,7 @@ async def create_re_add_org_pages_job( id=job_id, oid=oid, crawl_type=crawl_type, + crawl_id=crawl_id, started=dt_now(), ) @@ -724,6 +727,7 @@ async def retry_org_background_job( await self.create_re_add_org_pages_job( org.id, job.crawl_type, + job.crawl_id, existing_job_id=job.id, ) return {"success": True} diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 8e55c543c1..a14f87afb6 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -151,6 +151,7 @@ async def run_re_add_org_pages_job( self, oid: str, crawl_type: Optional[str] = None, + crawl_id: Optional[str] = None, existing_job_id: Optional[str] = None, ) -> str: """run job to recalculate storage stats for the org""" @@ -165,6 +166,7 @@ async def run_re_add_org_pages_job( job_type=BgJobType.READD_ORG_PAGES.value, oid=oid, crawl_type=crawl_type, + crawl_id=crawl_id, ) async def run_optimize_pages_job( diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index 205d2bb60f..dc1541b3c9 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -13,10 +13,12 @@ job_type = os.environ.get("BG_JOB_TYPE") oid = os.environ.get("OID") crawl_type = os.environ.get("CRAWL_TYPE") +crawl_id = os.environ.get("CRAWL_ID") # ============================================================================ # pylint: disable=too-many-function-args, duplicate-code, too-many-locals, too-many-return-statements +# pylint: disable=too-many-branches async def main(): """run background job with access to ops classes""" @@ -70,7 +72,11 @@ async def main(): if job_type == BgJobType.READD_ORG_PAGES: try: - await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) + if not crawl_id: + await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) + else: + await page_ops.add_crawl_pages_to_db_from_wacz(crawl_id=crawl_id) + await coll_ops.recalculate_org_collection_stats(org) return 0 # pylint: disable=broad-exception-caught diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 33d4109012..cd6b94b30b 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2596,10 +2596,11 @@ class RecalculateOrgStatsJob(BackgroundJob): # ============================================================================ class ReAddOrgPagesJob(BackgroundJob): - """Model for tracking jobs to readd an org's pages""" + """Model for tracking jobs to readd pages for an org or single crawl""" type: Literal[BgJobType.READD_ORG_PAGES] = BgJobType.READD_ORG_PAGES crawl_type: Optional[str] = None + crawl_id: Optional[str] = None # ============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 0bd16de91e..68c8d6af3a 100644 --- a/backend/btrixcloud/pages.py +++ 
b/backend/btrixcloud/pages.py @@ -29,7 +29,6 @@ PageNoteDelete, QARunBucketStats, StartedResponse, - StartedResponseBool, UpdatedResponse, DeletedResponse, PageNoteAddedResponse, @@ -1047,25 +1046,27 @@ async def re_add_all_crawl_pages( @app.post( "/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", tags=["pages", "crawls"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) @app.post( "/orgs/{oid}/uploads/{crawl_id}/pages/reAdd", tags=["pages", "uploads"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) @app.post( "/orgs/{oid}/all-crawls/{crawl_id}/pages/reAdd", tags=["pages", "all-crawls"], - response_model=StartedResponseBool, + response_model=StartedResponse, ) async def re_add_crawl_pages( crawl_id: str, org: Organization = Depends(org_crawl_dep), ): """Re-add pages for crawl (may delete page QA data!)""" - asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id)) - return {"started": True} + job_id = await ops.background_job_ops.create_re_add_org_pages_job( + org.id, crawl_id=crawl_id + ) + return {"started": job_id or ""} @app.get( "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}", diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 1e63af983d..f3f45ad793 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -656,12 +656,11 @@ def stream_page_lines( ) except Exception as exc: msg = str(exc) - if msg.startswith("503") or msg.startswith("429"): - if retry < num_retries: - retry += 1 - print(f"Retrying, {retry} of {num_retries}, {msg}") - time.sleep(30) - continue + if retry < num_retries: + retry += 1 + print(f"Retrying, {retry} of {num_retries}, {msg}") + time.sleep(30) + continue print(f"No more retries for error: {msg}, skipping {wacz_url}") diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml index 7472e139e0..b26c723b94 100644 --- a/chart/app-templates/background_job.yaml +++ b/chart/app-templates/background_job.yaml @@ -47,6 +47,11 @@ spec: - name: CRAWL_TYPE value: {{ crawl_type }} +{% if crawl_id %} + - name: CRAWL_ID + value: {{ crawl_id }} +{% endif %} + envFrom: - configMapRef: name: backend-env-config From 358bb511651e2ecdf8011effae4dfd0dfc755754 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 21:39:30 -0800 Subject: [PATCH 34/59] add sort --- backend/btrixcloud/pages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 68c8d6af3a..7d9a90840f 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -943,7 +943,9 @@ async def process_finished_crawls(): } next_crawl = await self.crawls.find_one_and_update( - match_query, {"$set": {"isMigrating": True}} + match_query, + {"$set": {"isMigrating": True}}, + sort=[("finished", -1)], ) if next_crawl is None: print("No more finished crawls to migrate") From e9f070b8abaa2a6411ddff6f1b7486d033581a5b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 21:42:46 -0800 Subject: [PATCH 35/59] better call --- backend/btrixcloud/main_bg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/main_bg.py b/backend/btrixcloud/main_bg.py index dc1541b3c9..ff80fb4bef 100644 --- a/backend/btrixcloud/main_bg.py +++ b/backend/btrixcloud/main_bg.py @@ -75,7 +75,7 @@ async def main(): if not crawl_id: await page_ops.re_add_all_crawl_pages(org, crawl_type=crawl_type) else: - await page_ops.add_crawl_pages_to_db_from_wacz(crawl_id=crawl_id) + 
await page_ops.re_add_crawl_pages(crawl_id=crawl_id, oid=org.id) await coll_ops.recalculate_org_collection_stats(org) return 0 From e94f8bedf48c455618788d5d6831c371ae472ab6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 21:43:59 -0800 Subject: [PATCH 36/59] logging --- backend/btrixcloud/storages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index f3f45ad793..5c0a241609 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -636,6 +636,8 @@ def stream_page_lines( retry = 0 + print(f" Processing WACZ {wacz_url}") + while True: try: with RemoteZip(wacz_url) as remote_zip: From 01431d7bdb24d1928257f0ab9ac676c9c9fa4181 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 22:11:00 -0800 Subject: [PATCH 37/59] ensure pages are streamed from each wacz on demand --- backend/btrixcloud/storages.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 5c0a241609..49a01f03e2 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -629,8 +629,6 @@ def stream_page_lines( page_json["seed"] = True yield page_json - page_generators: List[Iterator[Dict[Any, Any]]] = [] - for wacz_file in wacz_files: wacz_url = self.resolve_internal_access_path(wacz_file.path) @@ -649,12 +647,10 @@ def stream_page_lines( and not f.is_dir() ] for pagefile_zipinfo in page_files: - page_generators.append( - stream_page_lines( - pagefile_zipinfo, - wacz_url, - wacz_file.name, - ) + yield from stream_page_lines( + pagefile_zipinfo, + wacz_url, + wacz_file.name, ) except Exception as exc: msg = str(exc) @@ -668,8 +664,6 @@ def stream_page_lines( break - return chain.from_iterable(page_generators) - def _sync_get_filestream(self, wacz_url: str, filename: str) -> Iterator[bytes]: """Return iterator of lines in remote file as bytes""" with RemoteZip(wacz_url) as remote_zip: From a486571eda90f679c1d47f1e2fe7b30a9c5f85f7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 22:26:35 -0800 Subject: [PATCH 38/59] ignore dupes on insertMany --- backend/btrixcloud/pages.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 7d9a90840f..4795a665f5 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -104,7 +104,7 @@ async def add_crawl_pages_to_db_from_wacz( non_seed_count += 1 if len(pages_buffer) > batch_size: - await self._add_pages_to_db(crawl_id, pages_buffer) + await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) pages_buffer = [] pages_buffer.append( @@ -113,7 +113,7 @@ async def add_crawl_pages_to_db_from_wacz( # Add any remaining pages in buffer to db if pages_buffer: - await self._add_pages_to_db(crawl_id, pages_buffer) + await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) await self.set_archived_item_page_counts(crawl_id) @@ -162,7 +162,7 @@ def _get_page_from_dict( p.compute_page_type() return p - async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]): + async def _add_pages_to_db(self, crawl_id: str, pages: List[Page], ordered=True): """Add batch of pages to db in one insert""" result = await self.pages.insert_many( [ @@ -170,7 +170,8 @@ async def _add_pages_to_db(self, crawl_id: str, pages: List[Page]): exclude_unset=True, exclude_none=True, exclude_defaults=True ) for page in pages - ] + ], + 
ordered=ordered, ) if not result.inserted_ids: # pylint: disable=broad-exception-raised From d0c274e0c3f68dd40d1d3c86c8bb20f02995edab Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 22:39:59 -0800 Subject: [PATCH 39/59] attempt dedup within wacz --- backend/btrixcloud/storages.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 49a01f03e2..f6a2dc024b 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -622,9 +622,20 @@ def stream_page_lines( ) line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename) + + dupe_set = set() + for line in line_iter: page_json = _parse_json(line.decode("utf-8", errors="ignore")) + + _id = page_json.get("id") + if _id and _id in dupe_set: + continue + + dupe_set.add(_id) + page_json["filename"] = os.path.basename(wacz_filename) + if filename == "pages/pages.jsonl": page_json["seed"] = True yield page_json From 5d949aa6dbbd7979f1ff4c123aca8d6fbced860c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 Feb 2025 22:51:35 -0800 Subject: [PATCH 40/59] ignore dupe errors? --- backend/btrixcloud/pages.py | 12 ++++++++++-- backend/btrixcloud/storages.py | 11 ----------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 4795a665f5..5328f0143a 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -104,7 +104,12 @@ async def add_crawl_pages_to_db_from_wacz( non_seed_count += 1 if len(pages_buffer) > batch_size: - await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) + try: + await self._add_pages_to_db( + crawl_id, pages_buffer, ordered=False + ) + except Exception as e: + print("Error inserting, probably dupe", e) pages_buffer = [] pages_buffer.append( @@ -113,7 +118,10 @@ async def add_crawl_pages_to_db_from_wacz( # Add any remaining pages in buffer to db if pages_buffer: - await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) + try: + await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) + except Exception as e: + print("Error inserting, probably dupe", e) await self.set_archived_item_page_counts(crawl_id) diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index f6a2dc024b..49a01f03e2 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -622,20 +622,9 @@ def stream_page_lines( ) line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename) - - dupe_set = set() - for line in line_iter: page_json = _parse_json(line.decode("utf-8", errors="ignore")) - - _id = page_json.get("id") - if _id and _id in dupe_set: - continue - - dupe_set.add(_id) - page_json["filename"] = os.path.basename(wacz_filename) - if filename == "pages/pages.jsonl": page_json["seed"] = True yield page_json From 18301e36556f94e55fdd2271126fde25d327f171 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 00:35:09 -0800 Subject: [PATCH 41/59] update logging, add crawl_id + url index --- backend/btrixcloud/pages.py | 3 +++ backend/btrixcloud/storages.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 5328f0143a..13c055e8ca 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -74,6 +74,9 @@ def __init__( async def init_index(self): """init index for pages db collection""" await self.pages.create_index([("crawl_id", 
pymongo.HASHED)]) + await self.pages.create_index( + [("crawl_id", pymongo.HASHED), ("url", pymongo.ASCENDING)] + ) async def set_ops(self, background_job_ops: BackgroundJobOps): """Set ops classes as needed""" diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 49a01f03e2..768e255d7e 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -629,12 +629,16 @@ def stream_page_lines( page_json["seed"] = True yield page_json + count = 0 + total = len(wacz_files) + for wacz_file in wacz_files: wacz_url = self.resolve_internal_access_path(wacz_file.path) retry = 0 + count += 1 - print(f" Processing WACZ {wacz_url}") + print(f" Processing {count} of {total} WACZ {wacz_url}") while True: try: From 0936ccfacc9eed6750271d09b5c600f7c503c8e5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 00:39:25 -0800 Subject: [PATCH 42/59] lint fix --- backend/btrixcloud/pages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 13c055e8ca..6bb9776af2 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -111,6 +111,7 @@ async def add_crawl_pages_to_db_from_wacz( await self._add_pages_to_db( crawl_id, pages_buffer, ordered=False ) + # pylint: disable=broad-exception-caught except Exception as e: print("Error inserting, probably dupe", e) pages_buffer = [] @@ -123,6 +124,7 @@ async def add_crawl_pages_to_db_from_wacz( if pages_buffer: try: await self._add_pages_to_db(crawl_id, pages_buffer, ordered=False) + # pylint: disable=broad-exception-caught except Exception as e: print("Error inserting, probably dupe", e) From 8742657f32873992c0df5a7f99f54aed0db37fe4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 01:32:39 -0800 Subject: [PATCH 43/59] optimize pages: don't use $facet, don't compute actual total --- backend/btrixcloud/pages.py | 46 ++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 6bb9776af2..5f054007c4 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -700,7 +700,7 @@ async def list_collection_pages( if isinstance(depth, int): query["depth"] = depth - aggregate = [{"$match": query}] + aggregate: list[dict[str, object]] = [{"$match": query}] if sort_by: # Sorting options to add: @@ -726,30 +726,34 @@ async def list_collection_pages( # default sort: seeds first, then by timestamp aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}]) - aggregate.extend( - [ - { - "$facet": { - "items": [ - {"$skip": skip}, - {"$limit": page_size}, - ], - "total": [{"$count": "count"}], - } - }, - ] - ) + # aggregate.extend( + # [ + # { + # "$facet": { + # "items": [ + # {"$skip": skip}, + # {"$limit": page_size}, + # ], + # "total": [{"$count": "count"}], + # } + # }, + # ] + # ) + aggregate.extend([{"$skip": skip}, {"$limit": page_size}]) # Get total cursor = self.pages.aggregate(aggregate) - results = await cursor.to_list(length=1) - result = results[0] - items = result["items"] + results = await cursor.to_list(length=page_size) + items = results + # result = results[0] + # items = result["items"] - try: - total = int(result["total"][0]["count"]) - except (IndexError, ValueError): - total = 0 + # try: + # total = int(result["total"][0]["count"]) + # except (IndexError, ValueError): + # total = 0 + + total = await self.pages.estimated_document_count() return [PageOut.from_dict(data) for data in items], 
total From 0c805f10a049e76a2344fb3a9f207cd9871dd31c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 01:38:12 -0800 Subject: [PATCH 44/59] add index for default sort --- backend/btrixcloud/pages.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 5f054007c4..4709425e0d 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -77,6 +77,13 @@ async def init_index(self): await self.pages.create_index( [("crawl_id", pymongo.HASHED), ("url", pymongo.ASCENDING)] ) + await self.pages.create_index( + [ + ("crawl_id", pymongo.HASHED), + ("isSeed", pymongo.DESCENDING), + ("ts", pymongo.ASCENDING), + ] + ) async def set_ops(self, background_job_ops: BackgroundJobOps): """Set ops classes as needed""" @@ -739,7 +746,9 @@ async def list_collection_pages( # }, # ] # ) - aggregate.extend([{"$skip": skip}, {"$limit": page_size}]) + if skip: + aggregate.append({"$skip": skip}) + aggregate.append({"$limit": page_size}) # Get total cursor = self.pages.aggregate(aggregate) From d379570a057a8ef1f3c02ff7d2030d580d5b2515 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 11:07:19 -0800 Subject: [PATCH 45/59] remove unique page counts from org metrics --- backend/btrixcloud/orgs.py | 9 --------- backend/test/test_org.py | 3 --- 2 files changed, 12 deletions(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 7c9f7558c0..3279231f06 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -968,12 +968,6 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: all_archived_item_ids = crawl_ids + upload_ids - unique_page_count = await self.page_ops.get_unique_page_count( - all_archived_item_ids - ) - crawl_unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids) - upload_unique_page_count = await self.page_ops.get_unique_page_count(upload_ids) - profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( {"oid": org.id, "state": {"$in": RUNNING_STATES}} @@ -998,9 +992,6 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: "pageCount": page_count, "crawlPageCount": crawl_page_count, "uploadPageCount": upload_page_count, - "uniquePageCount": unique_page_count, - "crawlUniquePageCount": crawl_unique_page_count, - "uploadUniquePageCount": upload_unique_page_count, "profileCount": profile_count, "workflowsRunningCount": workflows_running_count, "maxConcurrentCrawls": max_concurrent_crawls, diff --git a/backend/test/test_org.py b/backend/test/test_org.py index a5e3a4cf48..57c0b8fcce 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -569,9 +569,6 @@ def test_org_metrics(crawler_auth_headers, default_org_id): assert data["uploadCount"] >= 0 assert data["archivedItemCount"] == data["crawlCount"] + data["uploadCount"] assert data["pageCount"] > 0 - assert data["uniquePageCount"] > 0 - assert data["crawlUniquePageCount"] > 0 - assert data["uploadUniquePageCount"] >= 0 assert data["profileCount"] >= 0 assert data["workflowsRunningCount"] >= 0 assert data["workflowsQueuedCount"] >= 0 From 6ad27e188de584d02d3855e53ac4a433188809c9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 11:16:32 -0800 Subject: [PATCH 46/59] update model --- backend/btrixcloud/models.py | 3 --- backend/btrixcloud/orgs.py | 7 ------- 2 files changed, 10 deletions(-) diff --git
a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index cd6b94b30b..13f2cab4fe 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2117,9 +2117,6 @@ class OrgMetrics(BaseModel): pageCount: int crawlPageCount: int uploadPageCount: int - uniquePageCount: int - crawlUniquePageCount: int - uploadUniquePageCount: int profileCount: int workflowsRunningCount: int maxConcurrentCrawls: int diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 3279231f06..ad915e9734 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -947,9 +947,6 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: crawl_page_count = 0 upload_page_count = 0 - crawl_ids = [] - upload_ids = [] - async for item_data in self.crawls_db.find({"oid": org.id}): item = BaseCrawl.from_dict(item_data) if item.state not in SUCCESSFUL_STATES: @@ -958,16 +955,12 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: if item.type == "crawl": crawl_count += 1 crawl_page_count += item.pageCount or 0 - crawl_ids.append(item.id) if item.type == "upload": upload_count += 1 upload_page_count += item.pageCount or 0 - upload_ids.append(item.id) if item.pageCount: page_count += item.pageCount - all_archived_item_ids = crawl_ids + upload_ids - profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( {"oid": org.id, "state": {"$in": RUNNING_STATES}} From b299941bb248d36e9fdf22f4d8008174b69a1070 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 11:37:31 -0800 Subject: [PATCH 47/59] remove resources from collection list --- backend/btrixcloud/colls.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index de557555c7..64438f89c5 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -490,12 +490,6 @@ async def list_collections( collections: List[Union[CollOut, PublicCollOut]] = [] for res in items: - res["resources"], res["preloadResources"], _ = ( - await self.get_collection_crawl_resources( - res["_id"], include_preloads=not public_colls_out - ) - ) - thumbnail = res.get("thumbnail") if thumbnail: image_file = ImageFile(**thumbnail) From 06e193b270772a9378284a0d469817d16d8ebbde Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 12:51:47 -0800 Subject: [PATCH 48/59] optimize coll /replay.json - move preloadResources to be precomputed in update_collection_counts_and_tags() - only query list of crawls once, reuse ids - remove facet from list_collection_pages(), support passing in crawlIds --- backend/btrixcloud/colls.py | 65 +++++++++++++++--------------------- backend/btrixcloud/models.py | 1 - backend/btrixcloud/pages.py | 55 ++++++++++-------------------- 3 files changed, 43 insertions(+), 78 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 64438f89c5..052e9bb930 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -45,6 +45,7 @@ PublicOrgDetails, CollAccessType, PageUrlCount, + PageOut, PageIdTimestamp, PaginatedPageUrlCountResponse, UpdateCollHomeUrl, @@ -53,7 +54,6 @@ ImageFilePreparer, MIN_UPLOAD_PART_SIZE, PublicCollOut, - PreloadResource, ) from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin @@ -347,14 +347,14 @@ async def get_collection_out( result = await self.get_collection_raw(coll_id, public_or_unlisted_only) if resources: - 
result["resources"], result["preloadResources"], pages_optimized = ( - await self.get_collection_crawl_resources( - coll_id, include_preloads=True - ) + result["resources"], crawl_ids, pages_optimized = ( + await self.get_collection_crawl_resources(coll_id) ) - initial_pages, result["totalPages"] = ( - await self.page_ops.list_collection_pages(coll_id, page_size=25) + initial_pages: List[PageOut] = await self.page_ops.list_collection_pages( + coll_id, + crawl_ids=crawl_ids, + page_size=25, ) public = "public/" if public_or_unlisted_only else "" @@ -511,14 +511,13 @@ async def list_collections( return collections, total async def get_collection_crawl_resources( - self, coll_id: UUID, include_preloads=False - ) -> tuple[List[CrawlFileOut], List[PreloadResource], bool]: + self, coll_id: UUID + ) -> tuple[List[CrawlFileOut], List[str], bool]: """Return pre-signed resources for all collection crawl files.""" # Ensure collection exists _ = await self.get_collection_raw(coll_id) resources = [] - preload_resources: List[PreloadResource] = [] pages_optimized = True crawls, _ = await self.crawl_ops.list_all_base_crawls( @@ -528,39 +527,16 @@ async def get_collection_crawl_resources( cls_type=CrawlOutWithResources, ) + crawl_ids = [] + for crawl in crawls: + crawl_ids.append(crawl.id) if crawl.resources: resources.extend(crawl.resources) if crawl.version != 2: - include_preloads = False pages_optimized = False - if include_preloads: - no_page_items = await self.get_collection_resources_with_no_pages(crawls) - for item in no_page_items: - preload_resources.append(item) - - return resources, preload_resources, pages_optimized - - async def get_collection_resources_with_no_pages( - self, crawls: List[CrawlOutWithResources] - ) -> List[PreloadResource]: - """Return wacz files in collection that have no pages""" - resources_no_pages: List[PreloadResource] = [] - - for crawl in crawls: - _, page_count = await self.page_ops.list_pages(crawl.id) - if page_count == 0 and crawl.resources: - for resource in crawl.resources: - resources_no_pages.append( - PreloadResource( - name=os.path.basename(resource.name), - crawlId=crawl.id, - hasPages=False, - ) - ) - - return resources_no_pages + return resources, crawl_ids, pages_optimized async def get_collection_names(self, uuids: List[UUID]): """return object of {_id, names} given list of collection ids""" @@ -646,6 +622,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): tags = [] crawl_ids = [] + preload_resources = [] coll = await self.get_collection(collection_id) org = await self.orgs.get_org_by_id(coll.oid) @@ -663,7 +640,16 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): _, crawl_page_count = await self.page_ops.list_pages( crawl.id, org, page_size=1_000_000 ) - page_count += crawl_page_count + if crawl_page_count == 0: + for file in files: + preload_resources.append( + { + "name": os.path.basename(file.filename), + "crawlId": crawl.id, + } + ) + else: + page_count += crawl_page_count # pylint: disable=broad-exception-caught except Exception: pass @@ -686,6 +672,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): "uniquePageCount": unique_page_count, "totalSize": total_size, "tags": sorted_tags, + "preloadResources": preload_resources, } }, ) @@ -1056,7 +1043,7 @@ async def get_collection_all(org: Organization = Depends(org_viewer_dep)): try: all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: - results[collection.name], _, _ = ( 
+ results[collection.name], _ = ( await colls.get_collection_crawl_resources(collection.id) ) except Exception as exc: diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 13f2cab4fe..13de0f1ec4 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1418,7 +1418,6 @@ class PreloadResource(BaseModel): name: str crawlId: str - hasPages: bool # ============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 4709425e0d..0162d2319a 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -654,6 +654,7 @@ async def list_pages( async def list_collection_pages( self, coll_id: UUID, + crawl_ids=None, org: Optional[Organization] = None, search: Optional[str] = None, url: Optional[str] = None, @@ -666,16 +667,17 @@ async def list_collection_pages( sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, public_or_unlisted_only=False, - ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]: - """List all pages in collection, with optional filtering""" + ) -> List[PageOut]: + """Query pages in collection, with filtering sorting. No total returned for optimization""" # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements # Zero-index page for query page = page - 1 skip = page_size * page - crawl_ids = await self.coll_ops.get_collection_crawl_ids( - coll_id, public_or_unlisted_only - ) + if crawl_ids is not None: + crawl_ids = await self.coll_ops.get_collection_crawl_ids( + coll_id, public_or_unlisted_only + ) query: dict[str, object] = { "crawl_id": {"$in": crawl_ids}, @@ -733,38 +735,15 @@ async def list_collection_pages( # default sort: seeds first, then by timestamp aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}]) - # aggregate.extend( - # [ - # { - # "$facet": { - # "items": [ - # {"$skip": skip}, - # {"$limit": page_size}, - # ], - # "total": [{"$count": "count"}], - # } - # }, - # ] - # ) if skip: aggregate.append({"$skip": skip}) aggregate.append({"$limit": page_size}) - # Get total cursor = self.pages.aggregate(aggregate) - results = await cursor.to_list(length=page_size) - items = results - # result = results[0] - # items = result["items"] - - # try: - # total = int(result["total"][0]["count"]) - # except (IndexError, ValueError): - # total = 0 - total = await self.pages.estimated_document_count() + results = await cursor.to_list(length=page_size) - return [PageOut.from_dict(data) for data in items], total + return [PageOut.from_dict(data) for data in results] async def re_add_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): """Delete existing pages for crawl and re-add from WACZs.""" @@ -1254,7 +1233,7 @@ async def get_crawl_pages_list( @app.get( "/orgs/{oid}/collections/{coll_id}/public/pages", tags=["pages", "collections"], - response_model=PaginatedPageOutResponse, + response_model=List[PageOut], ) async def get_public_collection_pages_list( coll_id: UUID, @@ -1270,9 +1249,9 @@ async def get_public_collection_pages_list( page: int = 1, sortBy: Optional[str] = None, sortDirection: Optional[int] = -1, - ): + ) -> List[PageOut]: """Retrieve paginated list of pages in collection""" - pages, total = await ops.list_collection_pages( + pages = await ops.list_collection_pages( coll_id=coll_id, org=org, search=search, @@ -1290,7 +1269,7 @@ async def get_public_collection_pages_list( response.headers["Access-Control-Allow-Origin"] = "*" 
response.headers["Access-Control-Allow-Headers"] = "*" - return paginated_format(pages, total, page, pageSize) + return pages @app.options( "/orgs/{oid}/collections/{coll_id}/pages", @@ -1311,7 +1290,7 @@ async def get_replay_preflight(response: Response): @app.get( "/orgs/{oid}/collections/{coll_id}/pages", tags=["pages", "collections"], - response_model=PaginatedPageOutResponse, + response_model=List[PageOut], ) async def get_collection_pages_list( coll_id: UUID, @@ -1327,9 +1306,9 @@ async def get_collection_pages_list( page: int = 1, sortBy: Optional[str] = None, sortDirection: Optional[int] = -1, - ): + ) -> List[PageOut]: """Retrieve paginated list of pages in collection""" - pages, total = await ops.list_collection_pages( + pages = await ops.list_collection_pages( coll_id=coll_id, org=org, search=search, @@ -1345,7 +1324,7 @@ async def get_collection_pages_list( ) response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" - return paginated_format(pages, total, page, pageSize) + return pages @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages", From 1dafa5ba823630658b5efa20e2cab7879e78c708 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 13:22:36 -0800 Subject: [PATCH 49/59] optimize page snapshot query: - rename /urls -> /pageSnapshots - remove facet, just return results - frontend: update to new model, don't check total, just look at results list length --- backend/btrixcloud/colls.py | 51 +++++++------------ .../collections/select-collection-page.ts | 22 ++++---- 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 052e9bb930..5c9df8e878 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -780,15 +780,15 @@ async def get_org_public_collections( return OrgPublicCollections(org=public_org_details, collections=collections) - async def list_urls_in_collection( + async def list_page_snapshots_in_collection( self, coll_id: UUID, oid: UUID, url_prefix: Optional[str] = None, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, - ) -> Tuple[List[PageUrlCount], int]: - """List all URLs in collection sorted desc by snapshot count unless prefix is specified""" + ) -> List[PageUrlCount]: + """List all page URLs in collection sorted desc by snapshot count unless prefix is specified""" # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements # Zero-index page for query page = page - 1 @@ -797,13 +797,13 @@ async def list_urls_in_collection( crawl_ids = await self.get_collection_crawl_ids(coll_id) match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}} - sort_query: dict[str, int] = {"count": -1, "_id": 1} + sort_query: dict[str, int] = {"count": -1, "ts": 1} if url_prefix: url_prefix = urllib.parse.unquote(url_prefix) regex_pattern = f"^{re.escape(url_prefix)}" match_query["url"] = {"$regex": regex_pattern, "$options": "i"} - sort_query = {"_id": 1} + sort_query = {"ts": 1} aggregate: List[Dict[str, Union[int, object]]] = [{"$match": match_query}] @@ -818,28 +818,15 @@ async def list_urls_in_collection( }, {"$sort": sort_query}, {"$set": {"url": "$_id"}}, - { - "$facet": { - "items": [ - {"$skip": skip}, - {"$limit": page_size}, - ], - "total": [{"$count": "count"}], - } - }, ] ) + if skip: + aggregate.append({"$skip": skip}) + aggregate.append({"$limit": page_size}) # Get total cursor = self.pages.aggregate(aggregate) - results = await cursor.to_list(length=1) - 
result = results[0] - items = result["items"] - - try: - total = int(result["total"][0]["count"]) - except (IndexError, ValueError): - total = 0 + results = await cursor.to_list(length=page_size) return [ PageUrlCount( @@ -852,8 +839,8 @@ async def list_urls_in_collection( for p in data.get("pages", []) ], ) - for data in items - ], total + for data in results + ] async def set_home_url( self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization @@ -986,11 +973,11 @@ async def delete_thumbnail(self, coll_id: UUID, org: Organization): # ============================================================================ # pylint: disable=too-many-locals -def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep): +def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep) -> CollectionOps: """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments - colls = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) + colls : CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) org_crawl_dep = orgs.org_crawl_dep org_viewer_dep = orgs.org_viewer_dep @@ -1043,7 +1030,7 @@ async def get_collection_all(org: Organization = Depends(org_viewer_dep)): try: all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: - results[collection.name], _ = ( + results[collection.name], _, _ = ( await colls.get_collection_crawl_resources(collection.id) ) except Exception as exc: @@ -1244,9 +1231,9 @@ async def download_public_collection( return await colls.download_collection(coll.id, org) @app.get( - "/orgs/{oid}/collections/{coll_id}/urls", + "/orgs/{oid}/collections/{coll_id}/pageSnapshots", tags=["collections"], - response_model=PaginatedPageUrlCountResponse, + response_model=List[PageUrlCount] ) async def get_collection_url_list( coll_id: UUID, @@ -1254,16 +1241,16 @@ async def get_collection_url_list( urlPrefix: Optional[str] = None, pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, - ): + ) -> List[PageUrlCount]: """Retrieve paginated list of urls in collection sorted by snapshot count""" - pages, total = await colls.list_urls_in_collection( + pages = await colls.list_page_snapshots_in_collection( coll_id=coll_id, oid=oid, url_prefix=urlPrefix, page_size=pageSize, page=page, ) - return paginated_format(pages, total, page, pageSize) + return pages @app.post( "/orgs/{oid}/collections/{coll_id}/home-url", diff --git a/frontend/src/features/collections/select-collection-page.ts b/frontend/src/features/collections/select-collection-page.ts index 6ff78849d1..6385005a66 100644 --- a/frontend/src/features/collections/select-collection-page.ts +++ b/frontend/src/features/collections/select-collection-page.ts @@ -18,7 +18,7 @@ import queryString from "query-string"; import { BtrixElement } from "@/classes/BtrixElement"; import type { Combobox } from "@/components/ui/combobox"; -import type { APIPaginatedList, APIPaginationQuery } from "@/types/api"; +import type { APIPaginationQuery } from "@/types/api"; import type { Collection } from "@/types/collection"; import type { UnderlyingFunction } from "@/types/utils"; import { tw } from "@/utils/tailwind"; @@ -144,11 +144,11 @@ export class SelectCollectionPage extends BtrixElement { pageSize: 1, }); - if (!pageUrls.total) { + if (!pageUrls.length) { return; } - const startPage = pageUrls.items[0]; + const startPage = pageUrls[0]; if (this.input) { this.input.value = this.url ?? 
startPage.url; @@ -337,7 +337,7 @@ export class SelectCollectionPage extends BtrixElement { if (!results) return; - if (results.total === 0) { + if (results.length === 0) { if (this.input) { this.pageUrlError = msg( "Page not found in collection. Please check the URL and try again", @@ -348,9 +348,9 @@ export class SelectCollectionPage extends BtrixElement { // Clear selection this.selectedPage = undefined; this.selectedSnapshot = undefined; - } else if (results.total === 1) { + } else if (results.length === 1) { // Choose only option, e.g. for copy-paste - this.selectedPage = this.formatPage(this.searchResults.value.items[0]); + this.selectedPage = this.formatPage(this.searchResults.value[0]); this.selectedSnapshot = this.selectedPage.snapshots[0]; } }; @@ -371,9 +371,7 @@ export class SelectCollectionPage extends BtrixElement { ) => { if (!results) return; - const { items } = results; - - if (!items.length) { + if (!results.length) { return html` ${msg("No matching page found.")} @@ -382,7 +380,7 @@ export class SelectCollectionPage extends BtrixElement { } return html` - ${items.map((item: Page) => { + ${results.map((item: Page) => { return html` >( - `/orgs/${this.orgId}/collections/${id}/urls?${query}`, + return this.api.fetch( + `/orgs/${this.orgId}/collections/${id}/pageSnapshots?${query}`, { signal }, ); } From c95022aeffec918fc9103bda84710052f468fb74 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 13:27:49 -0800 Subject: [PATCH 50/59] crawl /replay.json optimization: - rename list_collection_pages -> list_replay_query_pages to be used for replay page querying for collection or single crawl, no page totals - use list_replay_query_pages in crawl /replay.json --- backend/btrixcloud/basecrawls.py | 4 ++-- backend/btrixcloud/colls.py | 10 ++++++---- backend/btrixcloud/pages.py | 11 +++++++---- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index ed71afe081..2777ebbcd3 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -171,8 +171,8 @@ async def get_crawl_out( res["collections"] = await self.colls.get_collection_names(coll_ids) if res.get("version", 1) == 2: - res["initialPages"], _ = await self.page_ops.list_pages( - crawlid, is_seed=True, page_size=25 + res["initialPages"], _ = await self.page_ops.list_replay_query_pages( + crawl_ids=[crawlid], is_seed=True, page_size=25 ) oid = res.get("oid") diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 5c9df8e878..41601f37ce 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -351,7 +351,7 @@ async def get_collection_out( await self.get_collection_crawl_resources(coll_id) ) - initial_pages: List[PageOut] = await self.page_ops.list_collection_pages( + initial_pages: List[PageOut] = await self.page_ops.list_replay_query_pages( coll_id, crawl_ids=crawl_ids, page_size=25, @@ -973,11 +973,13 @@ async def delete_thumbnail(self, coll_id: UUID, org: Organization): # ============================================================================ # pylint: disable=too-many-locals -def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep) -> CollectionOps: +def init_collections_api( + app, mdb, orgs, storage_ops, event_webhook_ops, user_dep +) -> CollectionOps: """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments - colls : CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) + 
colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) org_crawl_dep = orgs.org_crawl_dep org_viewer_dep = orgs.org_viewer_dep @@ -1233,7 +1235,7 @@ async def download_public_collection( @app.get( "/orgs/{oid}/collections/{coll_id}/pageSnapshots", tags=["collections"], - response_model=List[PageUrlCount] + response_model=List[PageUrlCount], ) async def get_collection_url_list( coll_id: UUID, diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 0162d2319a..178fec6db2 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -651,10 +651,10 @@ async def list_pages( return [PageOut.from_dict(data) for data in items], total - async def list_collection_pages( + async def list_replay_query_pages( self, - coll_id: UUID, - crawl_ids=None, + coll_id: Optional[UUID] = None, + crawl_ids: Optional[List[str]] = None, org: Optional[Organization] = None, search: Optional[str] = None, url: Optional[str] = None, @@ -674,7 +674,10 @@ async def list_collection_pages( page = page - 1 skip = page_size * page - if crawl_ids is not None: + if crawl_ids is None and coll_id is None: + raise Exception("either crawl_ids or coll_id must be provided") + + if coll_id and crawl_ids is None: crawl_ids = await self.coll_ops.get_collection_crawl_ids( coll_id, public_or_unlisted_only ) From e3e36513dc9efceaa418b2d0a92af25785f20bc4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 13:32:27 -0800 Subject: [PATCH 51/59] cleanup --- backend/btrixcloud/colls.py | 6 +++--- backend/btrixcloud/pages.py | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 41601f37ce..116ccb3c3d 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -6,7 +6,7 @@ from datetime import datetime from collections import Counter from uuid import UUID, uuid4 -from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union +from typing import Optional, List, TYPE_CHECKING, cast, Dict, Any, Union import os import re import urllib.parse @@ -47,7 +47,6 @@ PageUrlCount, PageOut, PageIdTimestamp, - PaginatedPageUrlCountResponse, UpdateCollHomeUrl, User, ImageFile, @@ -788,7 +787,8 @@ async def list_page_snapshots_in_collection( page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, ) -> List[PageUrlCount]: - """List all page URLs in collection sorted desc by snapshot count unless prefix is specified""" + """List all page URLs in collection sorted desc by snapshot count + unless prefix is specified""" # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements # Zero-index page for query page = page - 1 diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 178fec6db2..35cbc20f54 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -675,7 +675,9 @@ async def list_replay_query_pages( skip = page_size * page if crawl_ids is None and coll_id is None: - raise Exception("either crawl_ids or coll_id must be provided") + raise HTTPException( + status_code=400, detail="either crawl_ids or coll_id must be provided" + ) if coll_id and crawl_ids is None: crawl_ids = await self.coll_ops.get_collection_crawl_ids( @@ -1014,7 +1016,7 @@ async def process_finished_crawls(): # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme def init_pages_api( app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep -): +) -> PageOps: """init pages API""" # pylint: 
disable=invalid-name @@ -1254,7 +1256,7 @@ async def get_public_collection_pages_list( sortDirection: Optional[int] = -1, ) -> List[PageOut]: """Retrieve paginated list of pages in collection""" - pages = await ops.list_collection_pages( + pages = await ops.list_replay_query_pages( coll_id=coll_id, org=org, search=search, @@ -1311,7 +1313,7 @@ async def get_collection_pages_list( sortDirection: Optional[int] = -1, ) -> List[PageOut]: """Retrieve paginated list of pages in collection""" - pages = await ops.list_collection_pages( + pages = await ops.list_replay_query_pages( coll_id=coll_id, org=org, search=search, From 0efe8224ac774e14ed385ec4f32198da60b00956 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 14:24:39 -0800 Subject: [PATCH 52/59] fix typo --- backend/btrixcloud/basecrawls.py | 2 +- backend/btrixcloud/colls.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 2777ebbcd3..ee852bf336 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -171,7 +171,7 @@ async def get_crawl_out( res["collections"] = await self.colls.get_collection_names(coll_ids) if res.get("version", 1) == 2: - res["initialPages"], _ = await self.page_ops.list_replay_query_pages( + res["initialPages"] = await self.page_ops.list_replay_query_pages( crawl_ids=[crawlid], is_seed=True, page_size=25 ) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 116ccb3c3d..d88c63d994 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -822,6 +822,7 @@ async def list_page_snapshots_in_collection( ) if skip: aggregate.append({"$skip": skip}) + aggregate.append({"$limit": page_size}) # Get total From 4d6b21c37667931745b664b9ca7f3243f44f0d08 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 14:27:19 -0800 Subject: [PATCH 53/59] lint --- backend/btrixcloud/crawlmanager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index a14f87afb6..fdc45db94a 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -21,6 +21,7 @@ # ============================================================================ +# pylint: disable=too-many-public-methods class CrawlManager(K8sAPI): """abstract crawl manager""" From 77caa4064d06a2abeff9e9024ad5f02157fe8013 Mon Sep 17 00:00:00 2001 From: Emma Segal-Grossman Date: Wed, 19 Feb 2025 18:12:11 -0500 Subject: [PATCH 54/59] Delay rendering components doing data fetching in collection settings dialog until open (#2414) Delays rendering contents of the collection settings dialog until actually needed. Various fetches that internal components were running were causing slowdowns in other parts of the app, so this should resolve some of that. --- .../collections/collection-edit-dialog.ts | 188 +++++++++--------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/frontend/src/features/collections/collection-edit-dialog.ts b/frontend/src/features/collections/collection-edit-dialog.ts index 13350e832e..fed4f33913 100644 --- a/frontend/src/features/collections/collection-edit-dialog.ts +++ b/frontend/src/features/collections/collection-edit-dialog.ts @@ -215,101 +215,106 @@ export class CollectionEdit extends BtrixElement { if (this.dirty) e.preventDefault(); }} class="h-full [--width:var(--btrix-screen-desktop)]" - > - ${this.collection + >${this.isDialogVisible ? html` -
{ - void this.checkChanged(); - }} - @sl-input=${() => { - void this.checkChanged(); - }} - @sl-change=${() => { - void this.checkChanged(); - }} - > - ) => { - this.tab = e.detail; + ${this.collection + ? html` + { + void this.checkChanged(); + }} + @sl-input=${() => { + void this.checkChanged(); + }} + @sl-change=${() => { + void this.checkChanged(); + }} + > + ) => { + this.tab = e.detail; + }} + class="part-[content]:pt-4" + > + ${this.renderTab({ + panel: "general", + icon: "easel3-fill", + string: msg("Presentation"), + })} + ${this.renderTab({ + panel: "sharing", + icon: "globe2", + string: msg("Sharing"), + })} + + + ${renderPresentation.bind(this)()} + + + + + + + + + ` + : html` +
+ +
+ `} +
+ { + // Using reset method instead of type="reset" fixes + // incorrect getRootNode in Chrome + (await this.form).reset(); }} - class="part-[content]:pt-4" + >${this.dirty + ? msg("Discard Changes") + : msg("Cancel")} + ${this.dirty + ? html`${msg("Unsaved changes.")}` + : nothing} + ${this.errorTab !== null + ? html`${msg("Please review issues with your changes.")}` + : nothing} + { + // Using submit method instead of type="submit" fixes + // incorrect getRootNode in Chrome + const form = await this.form; + const submitInput = form.querySelector( + 'input[type="submit"]', + ); + form.requestSubmit(submitInput); + }} + >${msg("Save")} - ${this.renderTab({ - panel: "general", - icon: "easel3-fill", - string: msg("Presentation"), - })} - ${this.renderTab({ - panel: "sharing", - icon: "globe2", - string: msg("Sharing"), - })} - - - ${renderPresentation.bind(this)()} - - - - - - - - - ` - : html` -
-
- `} -
- { - // Using reset method instead of type="reset" fixes - // incorrect getRootNode in Chrome - (await this.form).reset(); - }} - >${this.dirty ? msg("Discard Changes") : msg("Cancel")} - ${this.dirty - ? html`${msg("Unsaved changes.")}` - : nothing} - ${this.errorTab !== null - ? html`${msg("Please review issues with your changes.")}` - : nothing} - { - // Using submit method instead of type="submit" fixes - // incorrect getRootNode in Chrome - const form = await this.form; - const submitInput = form.querySelector( - 'input[type="submit"]', - ); - form.requestSubmit(submitInput); - }} - >${msg("Save")} -
+ ` + : nothing} ${this.renderReplay()}`; } @@ -317,6 +322,7 @@ export class CollectionEdit extends BtrixElement { private renderReplay() { if (this.replayWebPage) return; if (!this.collection) return; + if (!this.isDialogVisible) return; if (!this.collection.crawlCount) return; const replaySource = `/api/orgs/${this.orgId}/collections/${this.collectionId}/replay.json`; From d6e76c4186a7736974ed2c3905ce98d2c429a014 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 15:41:02 -0800 Subject: [PATCH 55/59] optimize page snapshots, add non-group alternative --- backend/btrixcloud/colls.py | 57 +++++++++++++++++++++++++++++++++++-- backend/btrixcloud/pages.py | 7 ++++- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index d88c63d994..ddd82e5171 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -780,6 +780,61 @@ async def get_org_public_collections( return OrgPublicCollections(org=public_org_details, collections=collections) async def list_page_snapshots_in_collection( + self, + coll_id: UUID, + url_prefix: Optional[str] = None, + page_size: int = DEFAULT_PAGE_SIZE, + ) -> List[PageUrlCount]: + """List all page URLs in collection sorted desc by snapshot count + unless prefix is specified""" + # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements + # Zero-index page for query + + crawl_ids = await self.get_collection_crawl_ids(coll_id) + + match_query: dict[str, object] = {"crawl_id": {"$in": crawl_ids}} + sort_query: dict[str, int] = {"isSeed": -1, "url": 1, "ts": 1} + + if url_prefix: + url_prefix = urllib.parse.unquote(url_prefix) + regex_pattern = f"^{re.escape(url_prefix)}" + match_query["url"] = {"$regex": regex_pattern, "$options": "i"} + # sort_query = {"ts": 1} + + aggregate: List[Dict[str, Union[int, object]]] = [ + {"$match": match_query}, + {"$sort": sort_query}, + ] + + aggregate.append({"$limit": page_size * len(crawl_ids)}) + + # Get total + print(aggregate) + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=page_size * len(crawl_ids)) + + url_counts: dict[str, PageUrlCount] = {} + for result in results: + url = result.get("url") + count = url_counts.get(url) + if not count: + count = PageUrlCount(url=url, snapshots=[], count=0) + url_counts[url] = count + count.snapshots.append( + PageIdTimestamp( + pageId=result.get("_id"), + ts=result.get("ts"), + status=result.get("status", 200), + ) + ) + count.count += 1 + + if len(url_counts) >= page_size: + break + + return list(url_counts.values()) + + async def list_page_snapshots_in_collection_old( self, coll_id: UUID, oid: UUID, @@ -1248,10 +1303,8 @@ async def get_collection_url_list( """Retrieve paginated list of urls in collection sorted by snapshot count""" pages = await colls.list_page_snapshots_in_collection( coll_id=coll_id, - oid=oid, url_prefix=urlPrefix, page_size=pageSize, - page=page, ) return pages diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 35cbc20f54..6a17fd135b 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -75,7 +75,12 @@ async def init_index(self): """init index for pages db collection""" await self.pages.create_index([("crawl_id", pymongo.HASHED)]) await self.pages.create_index( - [("crawl_id", pymongo.HASHED), ("url", pymongo.ASCENDING)] + [ + ("crawl_id", pymongo.HASHED), + ("isSeed", pymongo.DESCENDING), + ("url", pymongo.ASCENDING), + ("ts", pymongo.ASCENDING), + ] ) 
await self.pages.create_index( [ From 85d4c6f143d88f04a40445c7472a9759bae73eb0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 16:09:04 -0800 Subject: [PATCH 56/59] fix page_count update? --- backend/btrixcloud/colls.py | 5 +++-- backend/btrixcloud/models.py | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index ddd82e5171..a44a8eadf3 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -636,9 +636,10 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size += file.size try: - _, crawl_page_count = await self.page_ops.list_pages( - crawl.id, org, page_size=1_000_000 + crawl_page_count = await self.pages.count_documents( + {"crawl_id": crawl.id} ) + if crawl_page_count == 0: for file in files: preload_resources.append( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 13de0f1ec4..a080443033 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1385,7 +1385,6 @@ class CrawlOutWithResources(CrawlOut): collections: Optional[List[CollIdName]] = [] initialPages: List[PageOut] = [] - totalPages: Optional[int] = None pagesQueryUrl: str = "" @@ -1514,7 +1513,6 @@ class CollOut(BaseMongoModel): allowPublicDownload: bool = True initialPages: List[PageOut] = [] - totalPages: Optional[int] = None preloadResources: List[PreloadResource] = [] pagesQueryUrl: str = "" From bf6f24b9d02f0fa74ba7d95120e4e1914aa38d2d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2025 16:40:43 -0800 Subject: [PATCH 57/59] tweak page snapshot sort order --- backend/btrixcloud/colls.py | 3 +-- backend/btrixcloud/pages.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index a44a8eadf3..54de17d981 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -794,13 +794,12 @@ async def list_page_snapshots_in_collection( crawl_ids = await self.get_collection_crawl_ids(coll_id) match_query: dict[str, object] = {"crawl_id": {"$in": crawl_ids}} - sort_query: dict[str, int] = {"isSeed": -1, "url": 1, "ts": 1} + sort_query: dict[str, int] = {"isSeed": -1, "ts": 1, "url": 1} if url_prefix: url_prefix = urllib.parse.unquote(url_prefix) regex_pattern = f"^{re.escape(url_prefix)}" match_query["url"] = {"$regex": regex_pattern, "$options": "i"} - # sort_query = {"ts": 1} aggregate: List[Dict[str, Union[int, object]]] = [ {"$match": match_query}, diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 6a17fd135b..df52275fed 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -78,8 +78,8 @@ async def init_index(self): [ ("crawl_id", pymongo.HASHED), ("isSeed", pymongo.DESCENDING), - ("url", pymongo.ASCENDING), ("ts", pymongo.ASCENDING), + ("url", pymongo.ASCENDING), ] ) await self.pages.create_index( From 96c715aad0356bd9fd5e8d712b66789f9b430297 Mon Sep 17 00:00:00 2001 From: emma Date: Thu, 20 Feb 2025 01:33:55 -0500 Subject: [PATCH 58/59] wip --- backend/btrixcloud/colls.py | 4 +- backend/btrixcloud/models.py | 1 + frontend/src/components/ui/combobox.ts | 26 +++++++++-- frontend/src/components/ui/index.ts | 1 + .../components/ui/menu-item-without-focus.ts | 22 +++++++++ frontend/src/context/popup-boundary.ts | 8 ++++ .../collections/collection-edit-dialog.ts | 6 +++ .../collections/select-collection-page.ts | 46 +++++++++++++------ frontend/src/utils/css.ts | 13 +++--- 9 files 
changed, 100 insertions(+), 27 deletions(-) create mode 100644 frontend/src/components/ui/menu-item-without-focus.ts create mode 100644 frontend/src/context/popup-boundary.ts diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 54de17d981..493a583a38 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -818,7 +818,9 @@ async def list_page_snapshots_in_collection( url = result.get("url") count = url_counts.get(url) if not count: - count = PageUrlCount(url=url, snapshots=[], count=0) + count = PageUrlCount( + url=url, title=result.get("title"), snapshots=[], count=0 + ) url_counts[url] = count count.snapshots.append( PageIdTimestamp( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index a080443033..346dab22f1 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1373,6 +1373,7 @@ class PageUrlCount(BaseModel): """Model for counting pages by URL""" url: AnyHttpUrl + title: Optional[str] = None count: int = 0 snapshots: List[PageIdTimestamp] = [] diff --git a/frontend/src/components/ui/combobox.ts b/frontend/src/components/ui/combobox.ts index a8de1bfa80..b45dc79b0e 100644 --- a/frontend/src/components/ui/combobox.ts +++ b/frontend/src/components/ui/combobox.ts @@ -1,5 +1,6 @@ +import { consume } from "@lit/context"; import type { SlMenu, SlMenuItem, SlPopup } from "@shoelace-style/shoelace"; -import { css, html, LitElement, type PropertyValues } from "lit"; +import { css, html, type PropertyValues } from "lit"; import { customElement, property, @@ -8,6 +9,8 @@ import { state, } from "lit/decorators.js"; +import { TailwindElement } from "@/classes/TailwindElement"; +import { popupBoundary } from "@/context/popup-boundary"; import { dropdown } from "@/utils/css"; /** @@ -20,7 +23,7 @@ import { dropdown } from "@/utils/css"; * @event request-close */ @customElement("btrix-combobox") -export class Combobox extends LitElement { +export class Combobox extends TailwindElement { static styles = [ dropdown, css` @@ -34,6 +37,13 @@ export class Combobox extends LitElement { @property({ type: Boolean }) open = false; + @property({ type: Boolean }) + loading = false; + + @consume({ context: popupBoundary }) + @state() + autoSizeBoundary?: Element | Element[] | undefined; + @state() isActive = true; @@ -69,22 +79,25 @@ export class Combobox extends LitElement { } render() { + console.log(this.autoSizeBoundary); return html` -
+
diff --git a/frontend/src/components/ui/index.ts b/frontend/src/components/ui/index.ts index a2e674a1ff..53a6cf838f 100644 --- a/frontend/src/components/ui/index.ts +++ b/frontend/src/components/ui/index.ts @@ -39,3 +39,4 @@ import("./tag-input"); import("./tag"); import("./time-input"); import("./user-language-select"); +import("./menu-item-without-focus"); diff --git a/frontend/src/components/ui/menu-item-without-focus.ts b/frontend/src/components/ui/menu-item-without-focus.ts new file mode 100644 index 0000000000..1984e7ec39 --- /dev/null +++ b/frontend/src/components/ui/menu-item-without-focus.ts @@ -0,0 +1,22 @@ +/** A version of that doesn't steal focus on mouseover */ + +import { SlMenuItem } from "@shoelace-style/shoelace"; +import { customElement } from "lit/decorators.js"; + +@customElement("btrix-menu-item") +// @ts-expect-error this shouldn't be allowed, but idk of an easier way without +// forking the whole component +export class BtrixMenuItem extends SlMenuItem { + private readonly handleMouseOver = (event: MouseEvent) => { + // NOT doing this.focus(); + event.stopPropagation(); + }; + + connectedCallback() { + super.connectedCallback(); + } + + disconnectedCallback() { + super.disconnectedCallback(); + } +} diff --git a/frontend/src/context/popup-boundary.ts b/frontend/src/context/popup-boundary.ts new file mode 100644 index 0000000000..e395e63296 --- /dev/null +++ b/frontend/src/context/popup-boundary.ts @@ -0,0 +1,8 @@ +import { createContext } from "@lit/context"; + +/** + * Boundary for custom instances to use, e.g. when inside a dialog + */ +export const popupBoundary = createContext( + "popup-boundary", +); diff --git a/frontend/src/features/collections/collection-edit-dialog.ts b/frontend/src/features/collections/collection-edit-dialog.ts index fed4f33913..1de4ad23a2 100644 --- a/frontend/src/features/collections/collection-edit-dialog.ts +++ b/frontend/src/features/collections/collection-edit-dialog.ts @@ -1,3 +1,4 @@ +import { provide } from "@lit/context"; import { localized, msg, str } from "@lit/localize"; import { Task, TaskStatus } from "@lit/task"; import { type SlRequestCloseEvent } from "@shoelace-style/shoelace"; @@ -22,6 +23,7 @@ import { type SelectCollectionPage } from "./select-collection-page"; import { BtrixElement } from "@/classes/BtrixElement"; import type { Dialog } from "@/components/ui/dialog"; import { type TabGroupPanel } from "@/components/ui/tab-group/tab-panel"; +import { popupBoundary } from "@/context/popup-boundary"; import { type Collection, type CollectionThumbnailSource, @@ -118,6 +120,9 @@ export class CollectionEdit extends BtrixElement { @query("btrix-collection-snapshot-preview") public readonly thumbnailPreview?: CollectionSnapshotPreview | null; + @provide({ context: popupBoundary }) + private popupBoundary: Element | Element[] | undefined; + protected willUpdate(changedProperties: PropertyValues): void { if (changedProperties.has("collectionId") && this.collectionId) { void this.fetchCollection(this.collectionId); @@ -135,6 +140,7 @@ export class CollectionEdit extends BtrixElement { null; this.selectedSnapshot = this.collection?.thumbnailSource ?? 
null; } + this.popupBoundary = this.dialog; } readonly checkChanged = checkChanged.bind(this); diff --git a/frontend/src/features/collections/select-collection-page.ts b/frontend/src/features/collections/select-collection-page.ts index 6385005a66..45a9f391a0 100644 --- a/frontend/src/features/collections/select-collection-page.ts +++ b/frontend/src/features/collections/select-collection-page.ts @@ -22,6 +22,8 @@ import type { APIPaginationQuery } from "@/types/api"; import type { Collection } from "@/types/collection"; import type { UnderlyingFunction } from "@/types/utils"; import { tw } from "@/utils/tailwind"; +import { timeoutCache } from "@/utils/timeoutCache"; +import { cached } from "@/utils/weakCache"; type Snapshot = { pageId: string; @@ -31,6 +33,7 @@ type Snapshot = { type Page = { url: string; + title?: string; count: number; snapshots: Snapshot[]; }; @@ -175,17 +178,20 @@ export class SelectCollectionPage extends BtrixElement { } private readonly searchResults = new Task(this, { - task: async ([searchValue], { signal }) => { - const pageUrls = await this.getPageUrls( - { - id: this.collectionId!, - urlPrefix: searchValue, - }, - signal, - ); + task: cached( + async ([searchValue], { signal }) => { + const pageUrls = await this.getPageUrls( + { + id: this.collectionId!, + urlPrefix: searchValue, + }, + signal, + ); - return pageUrls; - }, + return pageUrls; + }, + { cacheConstructor: timeoutCache(300) }, + ), args: () => [this.searchQuery] as const, }); @@ -361,6 +367,7 @@ export class SelectCollectionPage extends BtrixElement { this.renderItems( // Render previous value so that dropdown doesn't shift while typing this.searchResults.value, + true, ), complete: this.renderItems, }); @@ -368,10 +375,11 @@ export class SelectCollectionPage extends BtrixElement { private readonly renderItems = ( results: SelectCollectionPage["searchResults"]["value"], + loading = false, ) => { if (!results) return; - if (!results.length) { + if (!loading && !results.length) { return html` ${msg("No matching page found.")} @@ -382,7 +390,7 @@ export class SelectCollectionPage extends BtrixElement { return html` ${results.map((item: Page) => { return html` - { if (this.input) { @@ -395,8 +403,18 @@ export class SelectCollectionPage extends BtrixElement { this.selectedSnapshot = this.selectedPage.snapshots[0]; }} - >${item.url} - + >${item.title + ? html`
${item.title}
+
+ ${item.url} +
` + : html`
+ ${msg("No page title")} +
+
+ ${item.url} +
`} + `; })} `; diff --git a/frontend/src/utils/css.ts b/frontend/src/utils/css.ts index 3420c9a820..9a1c201730 100644 --- a/frontend/src/utils/css.ts +++ b/frontend/src/utils/css.ts @@ -102,7 +102,6 @@ export const animatePulse = css` export const dropdown = css` .dropdown { contain: content; - transform-origin: top left; box-shadow: var(--sl-shadow-medium); } @@ -111,34 +110,34 @@ export const dropdown = css` } .animateShow { - animation: dropdownShow 100ms ease forwards; + animation: dropdownShow 150ms cubic-bezier(0, 0, 0.2, 1) forwards; } .animateHide { - animation: dropdownHide 100ms ease forwards; + animation: dropdownHide 150ms cubic-bezier(0.4, 0, 1, 1) forwards; } @keyframes dropdownShow { from { opacity: 0; - transform: scale(0.9); + transform: translateY(-8px); } to { opacity: 1; - transform: scale(1); + transform: translateY(0); } } @keyframes dropdownHide { from { opacity: 1; - transform: scale(1); + transform: translateY(0); } to { opacity: 0; - transform: scale(0.9); + transform: translateY(-8px); } } `; From faebb64994fb52a7b5ff3cbeee306b26bea01a25 Mon Sep 17 00:00:00 2001 From: emma Date: Thu, 20 Feb 2025 14:54:27 -0500 Subject: [PATCH 59/59] fix mis-sized spinner --- .../features/collections/edit-dialog/presentation-section.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/features/collections/edit-dialog/presentation-section.ts b/frontend/src/features/collections/edit-dialog/presentation-section.ts index f4d913a00e..63be9383b7 100644 --- a/frontend/src/features/collections/edit-dialog/presentation-section.ts +++ b/frontend/src/features/collections/edit-dialog/presentation-section.ts @@ -100,7 +100,7 @@ export default function renderPresentation(this: CollectionEdit) { ` : this.thumbnailPreview?.blobTask.status === TaskStatus.PENDING && !this.blobIsLoaded - ? html`` + ? html`` : nothing}
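
For reference, below is a minimal standalone sketch of the aggregation pattern these patches converge on: counting unique page URLs with a $group stage instead of distinct(), and paging query results with $skip/$limit instead of a $facet total. It assumes a Motor (async MongoDB) client; the connection URI, database name, collection name, and crawl id used in the example are illustrative placeholders, not values taken from the patches.

# Sketch only: mirrors the $group / $skip-$limit pattern from the pages.py changes above.
# The URI, database, collection, and crawl id below are assumptions for the example.
import asyncio
from typing import List

from motor.motor_asyncio import AsyncIOMotorClient


async def unique_page_count(pages, crawl_ids: List[str]) -> int:
    """Count distinct page URLs by grouping, without loading every URL into memory."""
    cursor = pages.aggregate(
        [
            {"$match": {"crawl_id": {"$in": crawl_ids}}},
            {"$group": {"_id": "$url"}},
        ]
    )
    count = 0
    async for _ in cursor:
        count += 1
    return count


async def list_pages_page(pages, crawl_ids: List[str], skip: int = 0, page_size: int = 25):
    """Return one page of results, seeds first then by timestamp, with no $facet total."""
    pipeline = [
        {"$match": {"crawl_id": {"$in": crawl_ids}}},
        {"$sort": {"isSeed": -1, "ts": 1}},
    ]
    if skip:
        pipeline.append({"$skip": skip})
    pipeline.append({"$limit": page_size})
    return await pages.aggregate(pipeline).to_list(length=page_size)


async def main() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # assumed URI
    pages = client["btrix"]["pages"]  # assumed database/collection names
    print(await unique_page_count(pages, ["example-crawl-id"]))
    print(await list_pages_page(pages, ["example-crawl-id"]))


if __name__ == "__main__":
    asyncio.run(main())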