From 191188985c1de4d415c47eaf663fba913080220e Mon Sep 17 00:00:00 2001
From: Daofeng Wu <dolphin.w.e@gmail.com>
Date: Tue, 17 Dec 2024 18:55:34 +0900
Subject: [PATCH] feat(scraper): add matched_hashes to output

---
 .../tools/web/scraper/__test__/incremental.py |  3 ++-
 npiai/tools/web/scraper/app.py                | 20 +++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/npiai/tools/web/scraper/__test__/incremental.py b/npiai/tools/web/scraper/__test__/incremental.py
index e550626..453d27e 100644
--- a/npiai/tools/web/scraper/__test__/incremental.py
+++ b/npiai/tools/web/scraper/__test__/incremental.py
@@ -17,7 +17,8 @@ async def summarize(skip_item_hashes: Set[str] | None = None):
             scraping_type="list-like",
             ancestor_selector=".playbook_list",
             items_selector=".playbook_list .playbook_item",
-            limit=5,
+            limit=20,
+            concurrency=2,
             skip_item_hashes=skip_item_hashes,
             output_columns=[
                 {
diff --git a/npiai/tools/web/scraper/app.py b/npiai/tools/web/scraper/app.py
index 05c60be..d253da3 100644
--- a/npiai/tools/web/scraper/app.py
+++ b/npiai/tools/web/scraper/app.py
@@ -46,6 +46,7 @@ class SummaryItem(TypedDict):
 
 class SummaryChunk(TypedDict):
     batch_id: int
+    matched_hashes: List[str]
     items: List[SummaryItem]
 
 
@@ -53,6 +54,7 @@ class SummaryChunk(TypedDict):
 class ParsedResult:
     markdown: str
     hashes: List[str]
+    matched_hashes: List[str]
 
 
 __ID_COLUMN__ = Column(
@@ -201,6 +203,7 @@ async def run_batch():
             await results_queue.put(
                 {
                     "batch_id": current_index,
+                    "matched_hashes": parsed_result.matched_hashes,
                     "items": items_slice,
                 }
             )
@@ -432,7 +435,9 @@ async def _parse_items(
 
         sections = []
         hashes = []
+        matched_hashes = []
         count = 0
+
         marking_tasks = []
 
         # use element handles here to snapshot the items
@@ -441,6 +446,7 @@ async def _parse_items(
             markdown, md5 = self._html_to_md_and_hash(html)
 
             if skip_item_hashes and md5 in skip_item_hashes:
+                matched_hashes.append(md5)
                 continue
 
             # mark the item as visited
@@ -464,7 +470,11 @@ async def _parse_items(
 
         await asyncio.gather(*marking_tasks)
 
-        return ParsedResult(markdown="\n".join(sections), hashes=hashes)
+        return ParsedResult(
+            markdown="\n".join(sections),
+            hashes=hashes,
+            matched_hashes=matched_hashes,
+        )
 
     async def _parse_ancestor(
         self,
@@ -518,12 +528,14 @@ async def _parse_ancestor(
 
         sections = []
         hashes = []
+        matched_hashes = []
         count = 0
 
         for html in htmls:
             markdown, md5 = self._html_to_md_and_hash(html)
 
             if skip_item_hashes and md5 in skip_item_hashes:
+                matched_hashes.append(md5)
                 continue
 
             sections.append(f'<section id="{count}">\n{markdown}\n</section>')
@@ -533,7 +545,11 @@ async def _parse_ancestor(
         if not count:
             return None
 
-        return ParsedResult(markdown="\n".join(sections), hashes=hashes)
+        return ParsedResult(
+            markdown="\n".join(sections),
+            hashes=hashes,
+            matched_hashes=matched_hashes,
+        )
 
     @staticmethod
     def _html_to_md_and_hash(html):