Commit c06e8c6

Porting scrape & crawl to newest inferring writer
NOTE: scrape -m is still unsafe
Parent: 8b80fef

3 files changed (+15, -14 lines)
ftest/scrapers/title.py (+1, -5)

@@ -3,8 +3,4 @@
 
 
 def scrape(row: RowWrapper, soup: WonderfulSoup):
-    return {"url": row.url, "title": soup.scrape_one("title")}
-
-
-def titles(row: RowWrapper, soup: WonderfulSoup):
-    yield soup.scrape_one("title")
+    return soup.scrape_one("title")
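Side note on the simplification above: with the newest inferring writer, a function scraper can return a bare scalar and let the writer infer the CSV header, so the test scraper no longer needs to build a dict (or keep a separate generator variant). A minimal sketch of that idea, hedged: the output file name and value are illustrative, and the exact inferred column name is up to casanova.

import casanova

# Illustrative sketch: the inferring writer derives its CSV header
# from the first value it receives, so a scraper returning a plain
# string still produces a valid one-column CSV.
with open("titles.csv", "w") as f:
    writer = casanova.InferringWriter(f)
    writer.writerow("Some page title")  # header inferred by casanova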

minet/cli/crawl/crawl.py (+2, -2)

@@ -85,7 +85,7 @@ def __add_file(self, name: Optional[str], path: str, spider):
 
         if self.format == "csv":
             # TODO: ability to pass fieldnames? from spider?
-            w = casanova.InferringWriter(f, add=["job_id"])
+            w = casanova.InferringWriter(f, prepend=["job_id"])
         elif self.format == "jsonl" or self.format == "ndjson":
             w = ndjson.writer(f)
         else:
@@ -97,7 +97,7 @@ def __unpack_result(self, result: SuccessfulCrawlResult, data):
         job_id = result.job.id
 
         if self.format == "csv":
-            return (data, [job_id])
+            return ([job_id], data)
 
         return ({"job_id": job_id, "data": data},)
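The two hunks above go together: prepend=["job_id"] places the job_id column before the inferred data columns, so __unpack_result must now hand the prepended values back first. A minimal sketch of the intended column order, assuming the returned tuple is splatted into the writer's writerow (file name and values are illustrative, not minet internals):

import casanova

# Illustrative sketch: with prepend=["job_id"], the inferred header is
# job_id followed by the columns inferred from the data, so the
# prepended values come first in the writerow call, mirroring the new
# ([job_id], data) return order of __unpack_result.
with open("out.csv", "w") as f:
    w = casanova.InferringWriter(f, prepend=["job_id"])
    w.writerow(["job-1"], {"url": "https://example.com", "status": 200})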

minet/cli/scrape/scrape.py (+12, -7)

@@ -194,16 +194,19 @@ def action(cli_args):
 
     if cli_args.format == "csv":
         if isinstance(scraper, FunctionScraper):
-            reader = casanova.reader(cli_args.input, total=cli_args.total)
-
-            # TODO: support for inferring_enricher
-            # TODO: support forwarding cases that will yield None
-            writer = casanova.inferring_writer(
-                cli_args.output, plural_separator=cli_args.plural_separator
+            enricher = casanova.inferring_enricher(
+                cli_args.input,
+                cli_args.output,
+                total=cli_args.total,
+                plural_separator=cli_args.plural_separator,
+                select=cli_args.select,
+                mapping_sample_size=512,
+                buffer_optionals=True,
             )
+            reader = enricher
 
             def writerow(row, item):
-                writer.writerow(item)
+                enricher.writerow(row, item)
 
         else:
             assert scraper.fieldnames is not None
@@ -356,6 +359,8 @@ def payloads() -> Iterator[ScrapeWorkerPayload]:
         assert result.items is not None
         items = result.items
 
+        print(items)
+
         with writer_lock:
             for item in items:
                 writerow(original_item.row, item)
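The port above replaces the separate reader plus inferring writer pair with casanova's inferring enricher, which reads the input CSV and writes each original row back out followed by columns inferred from the scraped item; that is why reader = enricher works and why writerow now forwards the row. A minimal sketch of the pattern (file names and the per-row computation are illustrative; parameters such as mapping_sample_size and buffer_optionals appear verbatim in the commit):

import casanova

# Illustrative sketch: the enricher doubles as a reader over the input
# CSV and as a writer that appends inferred columns to each row.
enricher = casanova.inferring_enricher("urls.csv", open("out.csv", "w"))

for row in enricher:                # iterate input rows
    item = {"n_cells": len(row)}    # stand-in for the scraper's result
    enricher.writerow(row, item)    # original row + inferred columns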
