From 191188985c1de4d415c47eaf663fba913080220e Mon Sep 17 00:00:00 2001 From: Daofeng Wu Date: Tue, 17 Dec 2024 18:55:34 +0900 Subject: [PATCH] feat(scraper): add matched_hashes to output --- .../tools/web/scraper/__test__/incremental.py | 3 ++- npiai/tools/web/scraper/app.py | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/npiai/tools/web/scraper/__test__/incremental.py b/npiai/tools/web/scraper/__test__/incremental.py index e550626..453d27e 100644 --- a/npiai/tools/web/scraper/__test__/incremental.py +++ b/npiai/tools/web/scraper/__test__/incremental.py @@ -17,7 +17,8 @@ async def summarize(skip_item_hashes: Set[str] | None = None): scraping_type="list-like", ancestor_selector=".playbook_list", items_selector=".playbook_list .playbook_item", - limit=5, + limit=20, + concurrency=2, skip_item_hashes=skip_item_hashes, output_columns=[ { diff --git a/npiai/tools/web/scraper/app.py b/npiai/tools/web/scraper/app.py index 05c60be..d253da3 100644 --- a/npiai/tools/web/scraper/app.py +++ b/npiai/tools/web/scraper/app.py @@ -46,6 +46,7 @@ class SummaryItem(TypedDict): class SummaryChunk(TypedDict): batch_id: int + matched_hashes: List[str] items: List[SummaryItem] @@ -53,6 +54,7 @@ class SummaryChunk(TypedDict): class ParsedResult: markdown: str hashes: List[str] + matched_hashes: List[str] __ID_COLUMN__ = Column( @@ -201,6 +203,7 @@ async def run_batch(): await results_queue.put( { "batch_id": current_index, + "matched_hashes": parsed_result.matched_hashes, "items": items_slice, } ) @@ -432,7 +435,9 @@ async def _parse_items( sections = [] hashes = [] + matched_hashes = [] count = 0 + marking_tasks = [] # use element handles here to snapshot the items @@ -441,6 +446,7 @@ async def _parse_items( markdown, md5 = self._html_to_md_and_hash(html) if skip_item_hashes and md5 in skip_item_hashes: + matched_hashes.append(md5) continue # mark the item as visited @@ -464,7 +470,11 @@ async def _parse_items( await asyncio.gather(*marking_tasks) - return ParsedResult(markdown="\n".join(sections), hashes=hashes) + return ParsedResult( + markdown="\n".join(sections), + hashes=hashes, + matched_hashes=matched_hashes, + ) async def _parse_ancestor( self, @@ -518,12 +528,14 @@ async def _parse_ancestor( sections = [] hashes = [] + matched_hashes = [] count = 0 for html in htmls: markdown, md5 = self._html_to_md_and_hash(html) if skip_item_hashes and md5 in skip_item_hashes: + matched_hashes.append(md5) continue sections.append(f'
\n{markdown}\n
') @@ -533,7 +545,11 @@ async def _parse_ancestor( if not count: return None - return ParsedResult(markdown="\n".join(sections), hashes=hashes) + return ParsedResult( + markdown="\n".join(sections), + hashes=hashes, + matched_hashes=matched_hashes, + ) @staticmethod def _html_to_md_and_hash(html):