diff --git a/npiai/tools/web/scraper/__test__/incremental.py b/npiai/tools/web/scraper/__test__/incremental.py index 453d27e..d8ee669 100644 --- a/npiai/tools/web/scraper/__test__/incremental.py +++ b/npiai/tools/web/scraper/__test__/incremental.py @@ -43,10 +43,12 @@ async def summarize(skip_item_hashes: Set[str] | None = None): start = time.monotonic() count = 0 hashes = set() + matched_hashes = set() async for chunk in stream: count += len(chunk["items"]) print("Chunk:", json.dumps(chunk, indent=2)) + matched_hashes.update(chunk["matched_hashes"]) for item in chunk["items"]: hashes.add(item["hash"]) @@ -54,6 +56,10 @@ async def summarize(skip_item_hashes: Set[str] | None = None): end = time.monotonic() print(f"Summarized {count} items in {end - start:.2f} seconds") + if skip_item_hashes: + print("Matched hashes:", matched_hashes) + print("Unmatched hashes:", skip_item_hashes - matched_hashes) + return hashes