From e5b1e672c51735c2aba6fb127b02b0c5e1937151 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Nov 2025 13:15:04 -0500 Subject: [PATCH 1/8] Fix bug where all crawls are added to workflow as successful even if failed --- backend/btrixcloud/basecrawls.py | 12 +++++++++++- backend/btrixcloud/crawlconfigs.py | 6 ++++-- backend/btrixcloud/crawls.py | 9 +++++++-- backend/btrixcloud/operator/crawls.py | 8 ++++---- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 44693ddaf2..a98709a0e2 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -372,14 +372,21 @@ async def delete_crawls( size += crawl_size cid = crawl.cid + successful = crawl.state in SUCCESSFUL_STATES if cid: if cids_to_update.get(cid): cids_to_update[cid]["inc"] += 1 cids_to_update[cid]["size"] += crawl_size + if successful: + cids_to_update[cid]["successful"] += 1 else: cids_to_update[cid] = {} cids_to_update[cid]["inc"] = 1 cids_to_update[cid]["size"] = crawl_size + if successful: + cids_to_update[cid]["successful"] = 1 + else: + cids_to_update[cid]["successful"] = 0 if type_ == "crawl": asyncio.create_task( @@ -890,7 +897,10 @@ async def delete_crawls_all_types( for cid, cid_dict in cids_to_update.items(): cid_size = cid_dict["size"] cid_inc = cid_dict["inc"] - await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc) + cid_successful = cid_dict["successful"] + await self.crawl_configs.stats_recompute_last( + cid, -cid_size, -cid_inc, -cid_successful + ) if uploads_length: upload_delete_list = DeleteCrawlList(crawl_ids=uploads) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 1dcfabc615..ab2c70aef9 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -948,7 +948,9 @@ async def get_last_successful_crawl_out( return None - async def stats_recompute_last(self, cid: UUID, size: int, 
inc_crawls: int = 1): + async def stats_recompute_last( + self, cid: UUID, size: int, inc_crawls: int = 1, inc_successful: int = 1 + ): """recompute stats by incrementing size counter and number of crawls""" update_query: dict[str, object] = {} @@ -1005,7 +1007,7 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1): "$inc": { "totalSize": size, "crawlCount": inc_crawls, - "crawlSuccessfulCount": inc_crawls, + "crawlSuccessfulCount": inc_successful, }, }, ) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index eb273b5c7e..48303a9bda 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -398,7 +398,10 @@ async def delete_crawls( for cid, cid_dict in cids_to_update.items(): cid_size = cid_dict["size"] cid_inc = cid_dict["inc"] - await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc) + cid_successful = cid_dict["successful"] + await self.crawl_configs.stats_recompute_last( + cid, -cid_size, -cid_inc, -cid_successful + ) return count, cids_to_update, quota_reached @@ -896,7 +899,9 @@ async def shutdown_crawl( if not graceful: await self.update_crawl_state(crawl_id, "canceled") crawl = await self.get_crawl(crawl_id, org) - if not await self.crawl_configs.stats_recompute_last(crawl.cid, 0, -1): + if not await self.crawl_configs.stats_recompute_last( + crawl.cid, 0, -1, 0 + ): raise HTTPException( status_code=404, detail=f"crawl_config_not_found: {crawl.cid}", diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 2518f8de97..06436143d6 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1753,11 +1753,10 @@ async def do_crawl_finished_tasks( stats: Optional[OpCrawlStats], ) -> None: """Run tasks after crawl completes in asyncio.task coroutine.""" - await self.crawl_config_ops.stats_recompute_last( - crawl.cid, status.filesAddedSize, 1 - ) - if state in SUCCESSFUL_STATES and crawl.oid: + 
await self.crawl_config_ops.stats_recompute_last( + crawl.cid, status.filesAddedSize, 1, 1 + ) await self.page_ops.set_archived_item_page_counts(crawl.id) await self.org_ops.set_last_crawl_finished(crawl.oid) await self.coll_ops.add_successful_crawl_to_collections( @@ -1774,6 +1773,7 @@ async def do_crawl_finished_tasks( ) if state in FAILED_STATES: + await self.crawl_config_ops.stats_recompute_last(crawl.cid, 0, 1, 0) await self.crawl_ops.delete_failed_crawl_files(crawl.id, crawl.oid) await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid) From 9aa8444cfc2e712b235856489e71f4de8ac7b4c6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 1 Dec 2025 14:45:53 -0500 Subject: [PATCH 2/8] Explicitly check for successful states --- backend/btrixcloud/crawlconfigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index ab2c70aef9..0f5337932b 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1571,7 +1571,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID): total_size += crawl_size - if res["state"] not in FAILED_STATES: + if res["state"] in SUCCESSFUL_STATES: successful_count += 1 last_crawl = res From 3517329a0a25f2498d93e9d540c103eb00ea19cc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 1 Dec 2025 16:34:15 -0500 Subject: [PATCH 3/8] Add migration to recompute workflow stats --- backend/btrixcloud/db.py | 2 +- ...ration_0055_recompute_crawlconfig_stats.py | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 7f0bfbd9b2..9dc79e3f87 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -38,7 +38,7 @@ ) = object -CURR_DB_VERSION = "0054" +CURR_DB_VERSION = "0055" # 
============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py b/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py
new file mode 100644
index 0000000000..33b4411f0f
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py
@@ -0,0 +1,35 @@
+"""
+Migration 0055 - Recompute workflow crawl stats
+"""
+
+from btrixcloud.crawlconfigs import stats_recompute_all
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0055"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recompute crawl workflow stats to fix issue with failed crawls
+        being added to crawlSuccessfulCount and workflow size totals.
+        """
+        # pylint: disable=duplicate-code
+        crawl_configs = self.mdb["crawl_configs"]
+        crawls = self.mdb["crawls"]
+
+        async for config in crawl_configs.find({"inactive": {"$ne": True}}):
+            config_id = config["_id"]
+            try:
+                await stats_recompute_all(crawl_configs, crawls, config_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(f"Unable to update workflow {config_id}: {err}", flush=True)
From ad1a1ea9f5f80a3d32bfa4e4590aa5e4bb4f71cb Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Mon, 1 Dec 2025 18:00:00 -0500
Subject: [PATCH 4/8] Remove unused import

---
 backend/btrixcloud/crawlconfigs.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 0f5337932b..c30ad7b387 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -46,7 +46,6 @@
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
     SUCCESSFUL_STATES,
-    FAILED_STATES,
     CrawlerChannel,
CrawlerChannels, StartedResponse, From 5942011184e4c07cb3196e1b132282885c6edd8b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 10 Dec 2025 15:29:45 -0500 Subject: [PATCH 5/8] Fix stats_recompute_all, pass in crawl_config_ops as arg --- backend/btrixcloud/crawlconfigs.py | 6 ++++-- backend/btrixcloud/db.py | 3 +++ .../migration_0006_precompute_crawl_stats.py | 13 ++++++++++++- .../migration_0007_colls_and_config_update.py | 13 ++++++++++++- .../migration_0055_recompute_crawlconfig_stats.py | 13 ++++++++++++- 5 files changed, 43 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index c30ad7b387..20e1cccc0a 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1543,7 +1543,9 @@ async def validate_custom_behavior(self, url: str) -> Dict[str, bool]: # ============================================================================ # pylint: disable=too-many-locals -async def stats_recompute_all(crawl_configs, crawls, cid: UUID): +async def stats_recompute_all( + crawl_config_ops: CrawlConfigOps, crawl_configs, crawls, cid: UUID +): """Re-calculate and update crawl statistics for config. 
Should only be called when a crawl completes from operator or on migration @@ -1578,7 +1580,7 @@ async def stats_recompute_all(crawl_configs, crawls, cid: UUID): # only update last_crawl if no crawls running, otherwise # lastCrawl* stats are already for running crawl - running_crawl = await crawl_configs.get_running_crawl(cid) + running_crawl = await crawl_config_ops.get_running_crawl(cid) if last_crawl and not running_crawl: update_query["totalSize"] = total_size diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index 9dc79e3f87..a9b7b730b7 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -128,6 +128,7 @@ async def update_and_prepare_db( coll_ops, file_ops, crawl_log_ops, + crawl_config_ops, crawl_manager, ): await drop_indexes(mdb) @@ -164,6 +165,7 @@ async def run_db_migrations( coll_ops: CollectionOps, file_ops: FileUploadOps, crawl_log_ops: CrawlLogOps, + crawl_config_ops: CrawlConfigOps, crawl_manager: CrawlManager, ): """Run database migrations.""" @@ -205,6 +207,7 @@ async def run_db_migrations( coll_ops=coll_ops, file_ops=file_ops, crawl_log_ops=crawl_log_ops, + crawl_config_ops=crawl_config_ops, crawl_manager=crawl_manager, ) if await migration.run(): diff --git a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py index 3af7ebddca..f7354a6cba 100644 --- a/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py +++ b/backend/btrixcloud/migrations/migration_0006_precompute_crawl_stats.py @@ -16,6 +16,8 @@ class Migration(BaseMigration): def __init__(self, mdb, **kwargs): super().__init__(mdb, migration_version=MIGRATION_VERSION) + self.crawl_config_ops = kwargs.get("crawl_config_ops") + async def migrate_up(self): """Perform migration up. 
@@ -26,10 +28,19 @@ async def migrate_up(self):
         crawl_configs = self.mdb["crawl_configs"]
         crawls = self.mdb["crawls"]
 
+        if self.crawl_config_ops is None:
+            print(
+                f"Unable to run migration {MIGRATION_VERSION}, missing crawl_config_ops",
+                flush=True,
+            )
+            return
+
         async for config in crawl_configs.find({"inactive": {"$ne": True}}):
             config_id = config["_id"]
             try:
-                await stats_recompute_all(crawl_configs, crawls, config_id)
+                await stats_recompute_all(
+                    self.crawl_config_ops, crawl_configs, crawls, config_id
+                )
             # pylint: disable=broad-exception-caught
             except Exception as err:
                 print(f"Unable to update workflow {config_id}: {err}", flush=True)
diff --git a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
index 8d708e4dcf..a0d820f7ea 100644
--- a/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
+++ b/backend/btrixcloud/migrations/migration_0007_colls_and_config_update.py
@@ -19,17 +19,28 @@ class Migration(BaseMigration):
     def __init__(self, mdb, **kwargs):
         super().__init__(mdb, migration_version=MIGRATION_VERSION)
 
+        self.crawl_config_ops = kwargs.get("crawl_config_ops")
+
     async def migrate_up(self):
         """Perform migration up."""
         # pylint: disable=duplicate-code
        crawl_configs = self.mdb["crawl_configs"]
         crawls = self.mdb["crawls"]
 
+        if self.crawl_config_ops is None:
+            print(
+                f"Unable to run migration {MIGRATION_VERSION}, missing crawl_config_ops",
+                flush=True,
+            )
+            return
+
         # Update workflows crawl stats to populate crawlSuccessfulCount
         async for config in crawl_configs.find({"inactive": {"$ne": True}}):
             config_id = config["_id"]
             try:
-                await stats_recompute_all(crawl_configs, crawls, config_id)
+                await stats_recompute_all(
+                    self.crawl_config_ops, crawl_configs, crawls, config_id
+                )
             # pylint: disable=broad-exception-caught
             except Exception as err:
                 print(f"Unable to update workflow {config_id}: {err}", flush=True)
diff --git 
a/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py b/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py
index 33b4411f0f..e87c2829ae 100644
--- a/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py
+++ b/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py
@@ -16,6 +16,8 @@ class Migration(BaseMigration):
     def __init__(self, mdb, **kwargs):
         super().__init__(mdb, migration_version=MIGRATION_VERSION)
 
+        self.crawl_config_ops = kwargs.get("crawl_config_ops")
+
     async def migrate_up(self):
         """Perform migration up.
 
@@ -26,10 +28,19 @@ async def migrate_up(self):
         crawl_configs = self.mdb["crawl_configs"]
         crawls = self.mdb["crawls"]
 
+        if self.crawl_config_ops is None:
+            print(
+                f"Unable to run migration {MIGRATION_VERSION}, missing crawl_config_ops",
+                flush=True,
+            )
+            return
+
         async for config in crawl_configs.find({"inactive": {"$ne": True}}):
             config_id = config["_id"]
             try:
-                await stats_recompute_all(crawl_configs, crawls, config_id)
+                await stats_recompute_all(
+                    self.crawl_config_ops, crawl_configs, crawls, config_id
+                )
             # pylint: disable=broad-exception-caught
             except Exception as err:
                 print(f"Unable to update workflow {config_id}: {err}", flush=True)
From ce52707f5cc77ffdc5c2322dfba9370a58d217e2 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 11 Dec 2025 14:17:09 -0800
Subject: [PATCH 6/8] ensure stats are cleared when there are no crawls

---
 backend/btrixcloud/crawlconfigs.py | 74 ++++++++++++++++++------------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 20e1cccc0a..831fd0d818 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -1555,15 +1555,16 @@ async def stats_recompute_all(
     match_query = {"cid": cid, "finished": {"$ne": None}}
 
     count = await crawls.count_documents(match_query)
-    if count:
-        update_query["crawlCount"] = 
count - total_size = 0 - successful_count = 0 + update_query["crawlCount"] = count + + total_size = 0 + successful_count = 0 - last_crawl: Optional[dict[str, object]] = None - last_crawl_size = 0 + last_crawl: Optional[dict[str, object]] = None + last_crawl_size = 0 + if count: async for res in crawls.find(match_query).sort("finished", pymongo.ASCENDING): files = res.get("files", []) crawl_size = 0 @@ -1578,29 +1579,44 @@ async def stats_recompute_all( last_crawl = res last_crawl_size = crawl_size - # only update last_crawl if no crawls running, otherwise - # lastCrawl* stats are already for running crawl - running_crawl = await crawl_config_ops.get_running_crawl(cid) - - if last_crawl and not running_crawl: - update_query["totalSize"] = total_size - update_query["crawlSuccessfulCount"] = successful_count - - update_query["lastCrawlId"] = str(last_crawl.get("_id")) - update_query["lastCrawlStartTime"] = last_crawl.get("started") - update_query["lastStartedBy"] = last_crawl.get("userid") - update_query["lastStartedByName"] = last_crawl.get("userName") - update_query["lastCrawlState"] = last_crawl.get("state") - update_query["lastCrawlSize"] = last_crawl_size - update_query["lastCrawlStats"] = last_crawl.get("stats") - update_query["lastCrawlStopping"] = False - update_query["isCrawlRunning"] = False - - last_crawl_finished = last_crawl.get("finished") - update_query["lastCrawlTime"] = last_crawl_finished - - if last_crawl_finished: - update_query["lastRun"] = last_crawl_finished + # always update these + update_query["crawlSuccessfulCount"] = successful_count + update_query["totalSize"] = total_size + + # only update last_crawl if no crawls running, otherwise + # lastCrawl* stats are already for running crawl + running_crawl = await crawl_config_ops.get_running_crawl(cid) + + if last_crawl and not running_crawl: + update_query["lastCrawlId"] = str(last_crawl.get("_id")) + update_query["lastCrawlStartTime"] = last_crawl.get("started") + update_query["lastStartedBy"] 
= last_crawl.get("userid") + update_query["lastStartedByName"] = last_crawl.get("userName") + update_query["lastCrawlState"] = last_crawl.get("state") + update_query["lastCrawlSize"] = last_crawl_size + update_query["lastCrawlStats"] = last_crawl.get("stats") + update_query["lastCrawlStopping"] = False + update_query["isCrawlRunning"] = False + + last_crawl_finished = last_crawl.get("finished") + update_query["lastCrawlTime"] = last_crawl_finished + + if last_crawl_finished: + update_query["lastRun"] = last_crawl_finished + + elif not last_crawl: + # ensure all last crawl data is cleared + update_query["lastCrawlId"] = None + update_query["lastCrawlStartTime"] = None + update_query["lastStartedBy"] = None + update_query["lastStartedByName"] = None + update_query["lastCrawlTime"] = None + update_query["lastCrawlState"] = None + update_query["lastCrawlSize"] = 0 + update_query["lastCrawlStats"] = None + update_query["lastCrawlStopping"] = False + update_query["isCrawlRunning"] = False + update_query["lastRun"] = None result = await crawl_configs.find_one_and_update( {"_id": cid, "inactive": {"$ne": True}}, From 4273de6d8a192dfb1ae585ee92c7c99b2b9238e9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 11 Dec 2025 14:48:19 -0800 Subject: [PATCH 7/8] add running crawl check --- backend/btrixcloud/crawlconfigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 831fd0d818..c74bb822b7 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1604,7 +1604,7 @@ async def stats_recompute_all( if last_crawl_finished: update_query["lastRun"] = last_crawl_finished - elif not last_crawl: + elif not last_crawl and not running_crawl: # ensure all last crawl data is cleared update_query["lastCrawlId"] = None update_query["lastCrawlStartTime"] = None From c607155eb0ca890b10c5dc520733e84b77bb175b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 
11 Dec 2025 15:15:56 -0800 Subject: [PATCH 8/8] add typing for stats_recompute_all --- backend/btrixcloud/crawlconfigs.py | 10 +++++++--- .../migration_0055_recompute_crawlconfig_stats.py | 4 +++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index c74bb822b7..37b120aa16 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -28,6 +28,7 @@ import aiohttp from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response import pymongo +from motor.motor_asyncio import AsyncIOMotorCollection from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( @@ -1544,8 +1545,11 @@ async def validate_custom_behavior(self, url: str) -> Dict[str, bool]: # ============================================================================ # pylint: disable=too-many-locals async def stats_recompute_all( - crawl_config_ops: CrawlConfigOps, crawl_configs, crawls, cid: UUID -): + crawl_config_ops: CrawlConfigOps, + crawl_configs: AsyncIOMotorCollection, + crawls: AsyncIOMotorCollection, + cid: UUID, +) -> bool: """Re-calculate and update crawl statistics for config. 
Should only be called when a crawl completes from operator or on migration @@ -1624,7 +1628,7 @@ async def stats_recompute_all( return_document=pymongo.ReturnDocument.AFTER, ) - return result + return result is not None # ============================================================================ diff --git a/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py b/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py index e87c2829ae..8138707bdd 100644 --- a/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py +++ b/backend/btrixcloud/migrations/migration_0055_recompute_crawlconfig_stats.py @@ -2,6 +2,8 @@ Migration 0055 - Recompute workflow crawl stats """ +from motor.motor_asyncio import AsyncIOMotorDatabase + from btrixcloud.crawlconfigs import stats_recompute_all from btrixcloud.migrations import BaseMigration @@ -13,7 +15,7 @@ class Migration(BaseMigration): """Migration class.""" # pylint: disable=unused-argument - def __init__(self, mdb, **kwargs): + def __init__(self, mdb: AsyncIOMotorDatabase, **kwargs): super().__init__(mdb, migration_version=MIGRATION_VERSION) self.crawl_config_ops = kwargs.get("crawl_config_ops")