webrecorder · tw4l · Feb 20, 2025 · Feb 20, 2025 · Feb 20, 2025 · Feb 20, 2025
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
@@ -98,6 +98,7 @@ async def init_index(self):
                 ("ts", pymongo.ASCENDING),
             ]
         )
+        await self.pages.create_index([("title", "text")])
 
     async def set_ops(self, background_job_ops: BackgroundJobOps):
         """Set ops classes as needed"""
@@ -759,12 +760,14 @@ async def list_replay_query_pages(
         if org:
             query["oid"] = org.id
 
+        is_text_search = False
         if search:
-            search_regex = re.escape(urllib.parse.unquote(search))
-            query["$or"] = [
-                {"url": {"$regex": search_regex, "$options": "i"}},
-                {"title": {"$regex": search_regex, "$options": "i"}},
-            ]
+            search = urllib.parse.unquote(search)
+            if search.startswith("http:") or search.startswith("https:"):
+                query["url"] = {"$gte": search}
+            else:
+                query["$text"] = {"$search": search}
+                is_text_search = True
 
         elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
@@ -805,6 +808,15 @@ async def list_replay_query_pages(
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
 
             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        elif search:
+            if is_text_search:
+                aggregate.extend(
+                    [
+                        {"$sort": {"score": {"$meta": "textScore"}}},
+                    ]
+                )
+            else:
+                aggregate.extend([{"$sort": {"url": 1}}])
         else:
             # default sort: seeds first, then by timestamp
             aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])

diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
@@ -643,12 +643,13 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_ts = coll_page["ts"]
     coll_page_title = coll_page["title"]
 
-    # Test search filter
-    partial_title = coll_page_title[:5]
+    # Test search filter, make sure text search isn't case sensitive
+    partial_title = "Archiving"
+    partial_title_lower = partial_title.lower()
     partial_url = coll_page_url[:8]
 
     r = requests.get(
-        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title_lower}",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200