diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index b0d1713bd5..9e8e63decd 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -98,6 +98,7 @@ async def init_index(self):
                 ("ts", pymongo.ASCENDING),
             ]
         )
+        await self.pages.create_index([("title", "text")])

     async def set_ops(self, background_job_ops: BackgroundJobOps):
         """Set ops classes as needed"""
@@ -759,12 +760,14 @@ async def list_replay_query_pages(
         if org:
             query["oid"] = org.id

+        is_text_search = False
         if search:
-            search_regex = re.escape(urllib.parse.unquote(search))
-            query["$or"] = [
-                {"url": {"$regex": search_regex, "$options": "i"}},
-                {"title": {"$regex": search_regex, "$options": "i"}},
-            ]
+            search = urllib.parse.unquote(search)
+            if search.startswith("http:") or search.startswith("https:"):
+                query["url"] = {"$gte": search}
+            else:
+                query["$text"] = {"$search": search}
+                is_text_search = True

         elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
@@ -805,6 +808,15 @@
             raise HTTPException(status_code=400, detail="invalid_sort_direction")

            aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        elif search:
+            if is_text_search:
+                aggregate.extend(
+                    [
+                        {"$sort": {"score": {"$meta": "textScore"}}},
+                    ]
+                )
+            else:
+                aggregate.extend([{"$sort": {"url": 1}}])
         else:
             # default sort: seeds first, then by timestamp
             aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index 0ae9f23c73..e3621b0580 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -643,12 +643,13 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_ts = coll_page["ts"]
     coll_page_title = coll_page["title"]

-    # Test search filter
-    partial_title = coll_page_title[:5]
+    # Test search filter, make sure text search isn't case sensitive
+    partial_title = "Archiving"
+    partial_title_lower = partial_title.lower()
     partial_url = coll_page_url[:8]

     r = requests.get(
-        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title_lower}",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
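
For reviewers unfamiliar with MongoDB text search, here is a minimal standalone sketch of the query pattern this diff introduces, written with synchronous pymongo rather than the async motor wrapper btrixcloud uses. The connection string, database/collection names, and the `search_pages` helper are illustrative assumptions, not part of the change.

```python
# Minimal sketch of the search pattern above. Assumed names throughout:
# "mongodb://localhost:27017", the "btrix" database, and search_pages()
# are illustrative, not taken from btrixcloud.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
pages = client["btrix"]["pages"]

# MongoDB allows at most one text index per collection; this one covers
# only "title", mirroring the create_index() call added to init_index().
pages.create_index([("title", "text")])

def search_pages(search: str, limit: int = 25) -> list:
    """Prefix-match URL searches; full-text match everything else."""
    if search.startswith(("http:", "https:")):
        # URLs sort lexicographically, so $gte plus an ascending sort on
        # "url" approximates a prefix scan served by a plain url index.
        cursor = pages.find({"url": {"$gte": search}}).sort("url", 1)
    else:
        # $text is case-insensitive and stemmed by default; projecting and
        # sorting on the textScore meta field ranks best matches first.
        cursor = pages.find(
            {"$text": {"$search": search}},
            {"score": {"$meta": "textScore"}},
        ).sort([("score", {"$meta": "textScore"})])
    return list(cursor.limit(limit))
```

The `$gte` branch for URLs presumably exists because the text tokenizer splits URLs on punctuation, and an unanchored case-insensitive `$regex` (the old approach) cannot use an index, so a range query against an ordinary ascending `url` index stays fast as the collection grows.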