Skip to content

Commit

Permalink
feat(scraper): add matched_hashes to output
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Dec 17, 2024
1 parent a6889ba commit 1911889
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
3 changes: 2 additions & 1 deletion npiai/tools/web/scraper/__test__/incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ async def summarize(skip_item_hashes: Set[str] | None = None):
scraping_type="list-like",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item",
limit=5,
limit=20,
concurrency=2,
skip_item_hashes=skip_item_hashes,
output_columns=[
{
Expand Down
20 changes: 18 additions & 2 deletions npiai/tools/web/scraper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ class SummaryItem(TypedDict):

class SummaryChunk(TypedDict):
    """One batch of summarized scraper output, queued per-batch for consumers."""

    # Sequential index of the batch that produced this chunk.
    batch_id: int
    # MD5 hashes (from the caller-supplied `skip_item_hashes`) that matched
    # already-seen items while parsing this batch — those items were skipped.
    matched_hashes: List[str]
    # Summarized items extracted in this batch.
    items: List[SummaryItem]


@dataclass
class ParsedResult:
    """Intermediate result of parsing scraped HTML items into markdown."""

    # All newly parsed item sections joined into a single markdown string.
    markdown: str
    # MD5 hashes of the items included in `markdown` (one per parsed item).
    hashes: List[str]
    # Hashes that matched `skip_item_hashes` and were therefore excluded
    # from `markdown`/`hashes`.
    matched_hashes: List[str]


__ID_COLUMN__ = Column(
Expand Down Expand Up @@ -201,6 +203,7 @@ async def run_batch():
await results_queue.put(
{
"batch_id": current_index,
"matched_hashes": parsed_result.matched_hashes,
"items": items_slice,
}
)
Expand Down Expand Up @@ -432,7 +435,9 @@ async def _parse_items(

sections = []
hashes = []
matched_hashes = []
count = 0

marking_tasks = []

# use element handles here to snapshot the items
Expand All @@ -441,6 +446,7 @@ async def _parse_items(
markdown, md5 = self._html_to_md_and_hash(html)

if skip_item_hashes and md5 in skip_item_hashes:
matched_hashes.append(md5)
continue

# mark the item as visited
Expand All @@ -464,7 +470,11 @@ async def _parse_items(

await asyncio.gather(*marking_tasks)

return ParsedResult(markdown="\n".join(sections), hashes=hashes)
return ParsedResult(
markdown="\n".join(sections),
hashes=hashes,
matched_hashes=matched_hashes,
)

async def _parse_ancestor(
self,
Expand Down Expand Up @@ -518,12 +528,14 @@ async def _parse_ancestor(

sections = []
hashes = []
matched_hashes = []
count = 0

for html in htmls:
markdown, md5 = self._html_to_md_and_hash(html)

if skip_item_hashes and md5 in skip_item_hashes:
matched_hashes.append(md5)
continue

sections.append(f'<section id="{count}">\n{markdown}\n</section>')
Expand All @@ -533,7 +545,11 @@ async def _parse_ancestor(
if not count:
return None

return ParsedResult(markdown="\n".join(sections), hashes=hashes)
return ParsedResult(
markdown="\n".join(sections),
hashes=hashes,
matched_hashes=matched_hashes,
)

@staticmethod
def _html_to_md_and_hash(html):
Expand Down

0 comments on commit 1911889

Please sign in to comment.