Skip to content

Commit

Permalink
feat(scraper): add matched_hashes to output
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Dec 17, 2024
1 parent a6889ba commit 1911889
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
3 changes: 2 additions & 1 deletion npiai/tools/web/scraper/__test__/incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ async def summarize(skip_item_hashes: Set[str] | None = None):
scraping_type="list-like",
ancestor_selector=".playbook_list",
items_selector=".playbook_list .playbook_item",
limit=5,
limit=20,
concurrency=2,
skip_item_hashes=skip_item_hashes,
output_columns=[
{
Expand Down
20 changes: 18 additions & 2 deletions npiai/tools/web/scraper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ class SummaryItem(TypedDict):

class SummaryChunk(TypedDict):
    """One batch of summarized scraper output, queued per-batch for consumers."""

    # Sequential index of the batch that produced this chunk.
    batch_id: int
    # MD5 hashes (from the caller-supplied `skip_item_hashes`) that matched
    # already-seen items while parsing this batch — those items were skipped.
    matched_hashes: List[str]
    # Summarized items extracted in this batch.
    items: List[SummaryItem]


@dataclass
class ParsedResult:
    """Intermediate result of parsing scraped HTML items into markdown."""

    # All newly parsed item sections joined into a single markdown string.
    markdown: str
    # MD5 hashes of the items included in `markdown` (one per parsed item).
    hashes: List[str]
    # Hashes that matched `skip_item_hashes` and were therefore excluded
    # from `markdown`/`hashes`.
    matched_hashes: List[str]


__ID_COLUMN__ = Column(
Expand Down Expand Up @@ -201,6 +203,7 @@ async def run_batch():
await results_queue.put(
{
"batch_id": current_index,
"matched_hashes": parsed_result.matched_hashes,
"items": items_slice,
}
)
Expand Down Expand Up @@ -432,7 +435,9 @@ async def _parse_items(

sections = []
hashes = []
matched_hashes = []
count = 0

marking_tasks = []

# use element handles here to snapshot the items
Expand All @@ -441,6 +446,7 @@ async def _parse_items(
markdown, md5 = self._html_to_md_and_hash(html)

if skip_item_hashes and md5 in skip_item_hashes:
matched_hashes.append(md5)
continue

# mark the item as visited
Expand All @@ -464,7 +470,11 @@ async def _parse_items(

await asyncio.gather(*marking_tasks)

return ParsedResult(markdown="\n".join(sections), hashes=hashes)
return ParsedResult(
markdown="\n".join(sections),
hashes=hashes,
matched_hashes=matched_hashes,
)

async def _parse_ancestor(
self,
Expand Down Expand Up @@ -518,12 +528,14 @@ async def _parse_ancestor(

sections = []
hashes = []
matched_hashes = []
count = 0

for html in htmls:
markdown, md5 = self._html_to_md_and_hash(html)

if skip_item_hashes and md5 in skip_item_hashes:
matched_hashes.append(md5)
continue

sections.append(f'<section id="{count}">\n{markdown}\n</section>')
Expand All @@ -533,7 +545,11 @@ async def _parse_ancestor(
if not count:
return None

return ParsedResult(markdown="\n".join(sections), hashes=hashes)
return ParsedResult(
markdown="\n".join(sections),
hashes=hashes,
matched_hashes=matched_hashes,
)

@staticmethod
def _html_to_md_and_hash(html):
Expand Down

0 comments on commit 1911889

Please sign in to comment.